// Package goscraper fetches a web page and extracts a link preview
// (title, description, images, and canonical link) from its HTML.
package goscraper

import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// EscapedFragment is the query parameter of the AJAX-crawling scheme:
// hashbang ("#!") URLs are re-requested with "_escaped_fragment_=" so the
// server can return a crawlable snapshot of the page.
var EscapedFragment = "_escaped_fragment_="

// Scraper holds the target URL and the crawling state for a single scrape.
type Scraper struct {
	Url                *url.URL
	EscapedFragmentUrl *url.URL
	MaxRedirect        int
}

// Document is the fetched page body together with its extracted preview.
type Document struct {
	Body    bytes.Buffer
	Preview DocumentPreview
}

// DocumentPreview holds the metadata extracted from a page.
type DocumentPreview struct {
	Title       string
	Description string
	Images      []string
	Link        string
}

// Scrape fetches uri and returns its parsed Document, following at most
// maxRedirect canonical or escaped-fragment redirects.
func Scrape(uri string, maxRedirect int) (*Document, error) {
	u, err := url.Parse(uri)
	if err != nil {
		return nil, err
	}
	return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
}
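
// A minimal usage sketch (the URL and redirect limit are illustrative):
//
//	doc, err := goscraper.Scrape("https://example.com", 5)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(doc.Preview.Title, doc.Preview.Description)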

// getUrl returns the escaped-fragment URL when one has been derived,
// otherwise the original URL.
func (scraper *Scraper) getUrl() string {
	if scraper.EscapedFragmentUrl != nil {
		return scraper.EscapedFragmentUrl.String()
	}
	return scraper.Url.String()
}

// toFragmentUrl rewrites a hashbang URL into its escaped-fragment form and
// stores the result in scraper.EscapedFragmentUrl.
func (scraper *Scraper) toFragmentUrl() error {
	re := regexp.MustCompile("#!(.*)")
	unescapedurl, err := url.QueryUnescape(scraper.Url.String())
	if err != nil {
		return err
	}
	// Append with "&" when the URL already carries a query string.
	p := "?"
	if len(scraper.Url.Query()) > 0 {
		p = "&"
	}
	matches := re.FindStringSubmatch(unescapedurl)
	if len(matches) > 1 {
		escapedFragment := EscapedFragment
		for _, r := range matches[1] {
			b := byte(r)
			if avoidByte(b) {
				continue
			}
			if escapeByte(b) {
				escapedFragment += url.QueryEscape(string(r))
			} else {
				escapedFragment += string(r)
			}
		}
		fragmentUrl, err := url.Parse(strings.Replace(unescapedurl, matches[0], p+escapedFragment, 1))
		if err != nil {
			return err
		}
		scraper.EscapedFragmentUrl = fragmentUrl
		return nil
	}
	fragmentUrl, err := url.Parse(unescapedurl + p + EscapedFragment)
	if err != nil {
		return err
	}
	scraper.EscapedFragmentUrl = fragmentUrl
	return nil
}
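
// For illustration: given "http://example.com/#!/profile", toFragmentUrl sets
// EscapedFragmentUrl to "http://example.com/?_escaped_fragment_=/profile".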

// getDocument performs the HTTP GET and wraps the raw response body in a
// Document, resolving hashbang and escaped-fragment URLs first.
func (scraper *Scraper) getDocument() (*Document, error) {
	scraper.MaxRedirect--
	if strings.Contains(scraper.Url.String(), "#!") {
		if err := scraper.toFragmentUrl(); err != nil {
			return nil, err
		}
	}
	if strings.Contains(scraper.Url.String(), EscapedFragment) {
		scraper.EscapedFragmentUrl = scraper.Url
	}

	req, err := http.NewRequest("GET", scraper.getUrl(), nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", "GoScraper")

	resp, err := http.DefaultClient.Do(req)
	if resp != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		return nil, err
	}

	dst := bytes.Buffer{}
	if _, err := io.Copy(&dst, resp.Body); err != nil {
		return nil, err
	}
	// The client may have followed redirects; adopt the final URL.
	if resp.Request.URL.String() != scraper.getUrl() {
		scraper.EscapedFragmentUrl = nil
		scraper.Url = resp.Request.URL
	}
	return &Document{Body: dst, Preview: DocumentPreview{Link: scraper.Url.String()}}, nil
}

// parseDocument tokenizes the HTML body and fills in doc.Preview, following
// canonical links and escaped-fragment hints while MaxRedirect allows.
func (scraper *Scraper) parseDocument(doc *Document) error {
	t := html.NewTokenizer(&doc.Body)
	var ogImage bool
	var headPassed bool
	var hasFragment bool
	var hasCanonical bool
	var canonicalUrl *url.URL
	doc.Preview.Images = []string{}
	// Saves the preview's link in case <link rel="canonical"> is found after
	// <meta property="og:url">.
	link := doc.Preview.Link
	for {
		tokenType := t.Next()
		if tokenType == html.ErrorToken {
			return nil
		}
		if tokenType != html.SelfClosingTagToken && tokenType != html.StartTagToken && tokenType != html.EndTagToken {
			continue
		}
		token := t.Token()

		switch token.Data {
		case "head":
			if tokenType == html.EndTagToken {
				headPassed = true
			}
		case "body":
			headPassed = true

		case "link":
			var canonical bool
			var href string
			for _, attr := range token.Attr {
				if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
					canonical = true
				}
				if cleanStr(attr.Key) == "href" {
					href = attr.Val
				}
			}
			if len(href) > 0 && canonical && link != href {
				hasCanonical = true
				var err error
				canonicalUrl, err = url.Parse(href)
				if err != nil {
					return err
				}
			}

		case "meta":
			// Only simple <meta property/name + content> pairs are considered.
			if len(token.Attr) != 2 {
				break
			}
			if metaFragment(token) && scraper.EscapedFragmentUrl == nil {
				hasFragment = true
			}
			var property string
			var content string
			for _, attr := range token.Attr {
				if cleanStr(attr.Key) == "property" || cleanStr(attr.Key) == "name" {
					property = attr.Val
				}
				if cleanStr(attr.Key) == "content" {
					content = attr.Val
				}
			}
			switch cleanStr(property) {
			case "og:title":
				doc.Preview.Title = content
			case "og:description":
				doc.Preview.Description = content
			case "description":
				if len(doc.Preview.Description) == 0 {
					doc.Preview.Description = content
				}
			case "og:url":
				doc.Preview.Link = content
			case "og:image":
				ogImage = true
				doc.Preview.Images = []string{content}
			}

		case "title":
			if tokenType == html.StartTagToken {
				t.Next()
				token = t.Token()
				if len(doc.Preview.Title) == 0 {
					doc.Preview.Title = token.Data
				}
			}

		case "img":
			for _, attr := range token.Attr {
				if cleanStr(attr.Key) == "src" {
					imgUrl, err := url.Parse(attr.Val)
					if err != nil {
						return err
					}
					if imgUrl.IsAbs() {
						doc.Preview.Images = append(doc.Preview.Images, attr.Val)
					} else {
						// Resolve relative image URLs against the page URL.
						doc.Preview.Images = append(doc.Preview.Images, scraper.Url.ResolveReference(imgUrl).String())
					}
				}
			}
		}

		if hasCanonical && headPassed && scraper.MaxRedirect > 0 {
			scraper.Url = canonicalUrl
			scraper.EscapedFragmentUrl = nil
			fdoc, err := scraper.getDocument()
			if err != nil {
				return err
			}
			*doc = *fdoc
			return scraper.parseDocument(doc)
		}

		if hasFragment && headPassed && scraper.MaxRedirect > 0 {
			if err := scraper.toFragmentUrl(); err != nil {
				return err
			}
			fdoc, err := scraper.getDocument()
			if err != nil {
				return err
			}
			*doc = *fdoc
			return scraper.parseDocument(doc)
		}

		// Stop early once the head is parsed and every preview field is set.
		if headPassed && ogImage && len(doc.Preview.Title) > 0 && len(doc.Preview.Description) > 0 {
			return nil
		}
	}
}

// Scrape fetches the scraper's URL and returns its parsed Document.
func (scraper *Scraper) Scrape() (*Document, error) {
	doc, err := scraper.getDocument()
	if err != nil {
		return nil, err
	}
	if err := scraper.parseDocument(doc); err != nil {
		return nil, err
	}
	return doc, nil
}

// avoidByte reports whether b is an ASCII control character (or DEL) that is
// dropped from the escaped fragment.
func avoidByte(b byte) bool {
	return b <= 31 || b == 127
}

// escapeByte reports whether b must be percent-encoded in the escaped
// fragment: space, '#', '%', '&', '+', or any byte >= 127.
func escapeByte(b byte) bool {
	return b == ' ' || b == '#' || b == '%' || b == '&' || b == '+' || b >= 127
}

// metaFragment reports whether token is <meta name="fragment" content="!">,
// the hint that an escaped-fragment snapshot of the page is available.
func metaFragment(token html.Token) bool {
	var name string
	var content string

	for _, attr := range token.Attr {
		if cleanStr(attr.Key) == "name" {
			name = attr.Val
		}
		if cleanStr(attr.Key) == "content" {
			content = attr.Val
		}
	}
	return name == "fragment" && content == "!"
}

// cleanStr lower-cases and trims an attribute key or value for comparison.
func cleanStr(str string) string {
	return strings.ToLower(strings.TrimSpace(str))
}