Commit 67b12af

Author: cuzic
Message: fix the fsight crawler
Parent: f4f395a

14 files changed, +224 −76 lines
.gitignore (+2)

@@ -0,0 +1,2 @@
+*rc
+*~

lib/.gitignore (+2)

@@ -0,0 +1,2 @@
+database_setting.rb
+

lib/crawler.rb (+25 −12)

@@ -31,7 +31,15 @@ def self.crawl_rss
     count = UrlsToCrawl.append_urls curls, feed
   end

-  def self.get_canonical_url url
+  def self.get_canonical_url url, limit = 10
+    return url if limit == 0
+    uri = URI(url)
+    Net::HTTP.start(uri.host, uri.port) do |http|
+      response = http.head(uri.request_uri)
+      if Net::HTTPRedirection === response then
+        return get_canonical_url(response["location"], limit - 1)
+      end
+    end
     return url
   end

@@ -60,14 +68,15 @@ def self.crawl_article
       crawler = find_crawler url
       crawler ||= self
       article = crawler.fetch_whole_article url
+
       if article.nil? then
         UrlsToCrawl.finish url
         next
       end

       crawler.after_crawl article

-      if article["id"] then
+      if article["md5"] then
         Articles.regist article
       end
       UrlsToCrawl.finish url
@@ -129,6 +138,7 @@ def self.fetch_whole_article url
   rescue StandardError => e
     case e.to_s
     when /404 Not Found/
+      $stderr.puts "404 Not Found"
       return nil
     else
       raise e
@@ -139,20 +149,23 @@ def self.after_crawl article
     article["images"].map! do |image|
       case image["url"]
       when /\.jpg$/i then
-        type = "image/jpeg"
-        digest = Digest::MD5.hexdigest(image["url"])
-        filename = digest + ".jpg"
-        image["file"] = httpclient.get(image["url"]) rescue nil
-        image["filename"] = filename
-        image["type"] = type
-        image["md5"] = digest
-        image
+        begin
+          type = "image/jpeg"
+          digest = Digest::MD5.hexdigest(image["url"])
+          filename = digest + ".jpg"
+          binary = httpclient.get(image["url"])
+          image["file"] = binary
+          image["filename"] = filename
+          image["type"] = type
+          image["md5"] = digest
+          image
+        rescue
+          nil
+        end
       else
         nil
       end
     end.reject! {|value| value.nil? }
-
-    article["id"] = Digest::MD5.digest article["url"] + article["title"] rescue nil
   end

   def self.httpclient
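
Note: the new get_canonical_url resolves redirect chains with HEAD requests and a hop limit. A standalone sketch of the same approach (the name canonical_url is illustrative; like the patched code, it never enables TLS, so https redirect targets are not followed):

require "net/http"
require "uri"

# Follow plain-http redirects via HEAD requests, giving up after `limit` hops.
def canonical_url(url, limit = 10)
  return url if limit == 0
  uri = URI(url)
  Net::HTTP.start(uri.host, uri.port) do |http|
    response = http.head(uri.request_uri)
    # 3xx responses carry the next hop in the Location header
    if Net::HTTPRedirection === response
      return canonical_url(response["location"], limit - 1)
    end
  end
  url
end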

lib/formatter.rb (+7 −6)

@@ -136,10 +136,10 @@ def self.epub epub_filename, duration, feeds = nil
       end
     end
   end
-  compose_epub epub_filename, generator
+  compose_epub epub_filename, generator, feeds
 end

-def self.compose_epub epub_filename, generator
+def self.compose_epub epub_filename, generator, title
   mimetype = file_read "template/mimetype"
   container_xml = file_read "template/container.xml"

@@ -159,9 +159,8 @@ def self.compose_epub epub_filename, generator
     article.body = self.format record
     article.created_at = record["created_at"]
     article.images = record["images"].map do |image|
-      image_md5 = image["md5"]
       {
-        :id => image_md5,
+        :id => image["md5"],
         :filename => image["filename"],
         :file => image["file"],
         :type => image["type"]
@@ -196,7 +195,7 @@ def self.compose_epub epub_filename, generator
     r << {
       :id => article.md5,
       :href => article.filename,
-      :type => article.type
+      :type => article.type || "application/xhtml+xml"
     }
     article.images.each do |image|
       r << {
@@ -228,17 +227,19 @@ def self.compose_epub epub_filename, generator
   name = "takibi"

   content_opf = erb_result "content.opf.erb" do
-    @title = name + " " + Time.now.strftime("%Y-%m-%d")
+    @title = title + " " + Time.now.strftime("%Y-%m-%d")
     @author = name
     @publisher = name
     @items = opf_items.uniq
     @itemrefs = opf_itemrefs
+    @uuid = @@uuid
   end

   toc_ncx = erb_result "toc.ncx.erb" do
     @title = name
     @author = name
     @nav_points = nav_points
+    @uuid = @@uuid
   end

   toc_xhtml = erb_result "toc.xhtml.erb" do
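
Note: erb_result itself is not part of this diff; the call sites above suggest the block assigns instance variables (@title, @uuid, ...) that the named template then reads. A hypothetical minimal sketch of that pattern (ErbContext and render are invented names):

require "erb"

class ErbContext
  def render(template_text, &block)
    instance_eval(&block)                   # runs `@title = ...` against self
    ERB.new(template_text).result(binding)  # the template sees those ivars
  end
end

template = "<dc:title><%= @title %></dc:title>"
puts ErbContext.new.render(template) { @title = "takibi " + Time.now.strftime("%Y-%m-%d") }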

lib/model.rb (+1 −1)

@@ -163,7 +163,7 @@ def self.fetch_row row
       "published_time" => row[:published_time],
       "created_at" => row[:created_at],
       "body" => (row[:body] || "").force_encoding("utf-8"),
-      "images" => unpack((row[:images] || "").force_encoding("utf-8")),
+      "images" => unpack((row[:images] || "").force_encoding("ASCII-8BIT")),
     }
   else
     article = {
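
Note: MessagePack output is raw bytes, so a packed images column handed back tagged as UTF-8 can fail to decode; re-tagging as ASCII-8BIT (binary) avoids that. A minimal illustration, independent of this project's pack/unpack helpers (whether UTF-8-tagged input actually raises depends on the msgpack gem version):

require "msgpack"

packed = MessagePack.pack([{"md5" => "abc", "filename" => "abc.jpg"}])
stored = packed.dup.force_encoding("utf-8")            # as a DB driver might return it
images = MessagePack.unpack(stored.force_encoding("ASCII-8BIT"))
p images  # => [{"md5"=>"abc", "filename"=>"abc.jpg"}]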

lib/parser.rb (+38 −9)

@@ -120,14 +120,31 @@ def self.extract_author doc, url
   end

   def self.extract_images doc, url
-    doc.xpath(images_xpath).map do |div|
-      path = div.xpath('.//img').first[:src]
-      url = URI.join(url, path).to_s
-      caption = div.xpath(image_caption_xpath).text.strip
-      {"url" => url, "caption" => caption}
+    urls = []
+    unless images_xpath.empty? then
+      urls += doc.xpath(images_xpath).map do |div|
+        path = div.xpath('.//img').first[:src]
+        src = URI.join(url, path).to_s
+        caption = div.xpath(image_caption_xpath).text.strip
+        {"url" => src, "caption" => caption}
+      end
     end
-  rescue
-    return []
+    urls +=
+      doc.xpath(body_xpath + "//img").map do |img|
+        path = img[:src]
+        src = ""
+        if path.start_with?("http://")
+          src = path
+        else
+          src = URI.join(url, path).to_s
+        end
+        caption = img[:title] || img[:alt]
+        {"url" => src, "caption" => caption}
+      end
+    urls
+  #rescue
+    #$stderr.puts $!.inspect
+    #return []
   end

   @@default_body_xpath = '//div[@id="main-contents"]'
@@ -149,13 +166,25 @@ def self.extract_body doc, url
     body.xpath('.//a[@href]').each do |anchor|
       begin
         path = anchor[:href].strip
-        url = URI.join(url, path).to_s
-        anchor.set_attribute("href", url)
+        href = URI.join(url, path).to_s
+        anchor.set_attribute("href", href)
         anchor.remove_attribute "onclick"
       rescue
       end
     end

+    body.xpath('.//img[@src]').each do |img|
+      path = img[:src].strip
+      if path.start_with?("http://") then
+        src = path
+      else
+        src = URI.join(url, path).to_s
+      end
+      digest = Digest::MD5.hexdigest(src)
+      suffix = src[/(\.\w+)$/, 1]
+      img.set_attribute("src", "#{digest}#{suffix}")
+    end
+
     noisy_elems_xpaths.each do |xpath|
       body.xpath(xpath).each do |node|
         node.remove
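
Note: the new img pass in extract_body renames every image source to the MD5-based filename the crawler stores the bytes under, so the article body and its image files line up inside the EPUB. A self-contained sketch of that rewrite (the sample HTML and page_url are hypothetical):

require "nokogiri"
require "digest/md5"
require "uri"

html = '<div><img src="/photos/chart.png"></div>'
page_url = "http://www.fsight.jp/15127"
doc = Nokogiri::HTML(html)
doc.xpath("//img[@src]").each do |img|
  src = URI.join(page_url, img[:src].strip).to_s  # resolve relative paths
  digest = Digest::MD5.hexdigest(src)
  suffix = src[/(\.\w+)$/, 1]                     # keep the original extension
  img.set_attribute("src", "#{digest}#{suffix}")
end
puts doc.at_xpath("//img")[:src]  # => <32-hex-digest>.png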

plugins/diamond/diamond_crawler.rb (+21 −21)

@@ -10,27 +10,27 @@ def match url
     url.include?("diamond.jp/")
   end

-  def self.httpclient
-    return @httpclient if defined? @httpclient and @httpclient
-    load_config
-    m = Mechanize.new
-    login_url = "https://web.diamond.jp/member/memberpage.cgi"
-    m.get login_url do |login_page|
-      h = {:action => "memberpage.cgi"}
-      logged_in = login_page.form_with(h) do |form|
-        form.mail = Takibi::DiamondConf["email"]
-        form.pass = Takibi::DiamondConf["password"]
-      end.click_button
-    end
-    def m.get url
-      page = super url
-      return page.body
-    rescue
-      nil
-    end
-    @httpclient = m
-    return @httpclient
-  end
+  # def self.httpclient
+  #   return @httpclient if defined? @httpclient and @httpclient
+  #   load_config
+  #   m = Mechanize.new
+  #   login_url = "https://web.diamond.jp/member/memberpage.cgi"
+  #   m.get login_url do |login_page|
+  #     h = {:action => "memberpage.cgi"}
+  #     logged_in = login_page.form_with(h) do |form|
+  #       form.mail = Takibi::DiamondConf["email"]
+  #       form.pass = Takibi::DiamondConf["password"]
+  #     end.click_button
+  #   end
+  #   def m.get url
+  #     page = super url
+  #     return page.body
+  #   rescue
+  #     nil
+  #   end
+  #   @httpclient = m
+  #   return @httpclient
+  # end

   def self.load_config
     filename = File.join(File.dirname(__FILE__), "diamondrc")
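
Note: the disabled method relies on the def m.get idiom, which stays live in the fsight crawler: a singleton method overrides get on one Mechanize instance so callers receive the response body, or nil on failure, rather than a page object. A minimal sketch of just that idiom, without the login flow:

require "rubygems"
require "mechanize"

m = Mechanize.new
# Singleton method: only this instance's #get changes.
def m.get url
  page = super url
  page.body  # raw HTML instead of a Mechanize::Page
rescue Mechanize::ResponseCodeError, Errno::ETIMEDOUT
  nil        # swallow fetch errors so a crawl loop can continue
end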

plugins/fsight/fsight_crawler.rb (+43 −13)

@@ -1,35 +1,41 @@
 # encoding: utf-8
 #

-#TAKIBI_ROOT = "."
-require "lib/crawler"
-require "rubygems"
-require "mechanize"
+unless defined? TAKIBI_ROOT
+  TAKIBI_ROOT = "."
+  require "lib/crawler"
+  require "rubygems"
+  require 'mechanize'
+end

 module Takibi
   class FsightCrawler < Crawler
-    rss_url "http://www.fsight.jp/rss/article/all/all/rss.xml"
+    rss_url "http://www.fsight.jp/feed"

     def match url
       url.include?("fsight.jp/")
     end

     def self.httpclient
-      return @httpclient if defined? @httpclient and @httpclient
+      # return @httpclient if defined? @httpclient and @httpclient
       load_config
       m = Mechanize.new
-      login_url = "https://www.fsight.jp/user"
+      login_url = "https://www.fsight.jp/login"
       login_page = m.get login_url
-      form = login_page.forms[2]
+      form = login_page.forms[0]

-      form["name"] = Takibi::FsightConf["name"]
-      form["pass"] = Takibi::FsightConf["password"]
+      form["log"] = Takibi::FsightConf["name"]
+      form["pwd"] = Takibi::FsightConf["password"]

       m.submit form

       def m.get url
         page = super url
         return page.body
+      rescue Mechanize::ResponseCodeError
+        return nil
+      rescue Errno::ETIMEDOUT
+        return nil
       end
       @httpclient = m
       return @httpclient
@@ -42,9 +48,33 @@ def self.load_config
   end
 end

+def pack binary
+  [MessagePack.pack(binary)].pack("m")
+end
+
+def unpack binary
+  MessagePack.unpack binary.unpack("m").first rescue []
+end
+
 if $0 == __FILE__
   require 'pp'
-  url = "http://www.fsight.jp/article/10668"
-  article = Takibi::FsightCrawler.httpclient.get url
-  puts article
+  require File.dirname(__FILE__) + "/fsight_parser"
+  require 'msgpack'
+  require 'lib/model'
+  url = "http://www.fsight.jp/15127"
+  if false then
+    record = Takibi::FsightCrawler.fetch_whole_article url
+    Takibi::FsightCrawler.after_crawl record
+    mime64 = pack record["images"]
+    open("tmp/unpack2.txt", "w") do |w|
+      w.write mime64
+    end
+
+    Takibi::Articles.regist record
+  else
+    Takibi::Articles.fetch(["url = '#{url}'"]) do |article|
+      pp article["images"]
+    end
+  end
 end
+
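
Note: the new pack/unpack helpers layer Base64 ("m" in Array#pack / String#unpack) over MessagePack so binary image blobs survive storage as text. Round-trip check, assuming the msgpack gem:

require "msgpack"

def pack binary
  [MessagePack.pack(binary)].pack("m")  # msgpack bytes -> Base64 text
end

def unpack binary
  MessagePack.unpack binary.unpack("m").first rescue []
end

images = [{"md5" => "abc", "filename" => "abc.jpg"}]
p unpack(pack(images))  # => [{"md5"=>"abc", "filename"=>"abc.jpg"}]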

plugins/fsight/fsight_parser.rb (+7 −7)

@@ -11,15 +11,15 @@ def self.extract src, url
     super src, url
   end

-  rss_regex %r(fsight.jp/article)
+  rss_regex %r(fsight.jp/)

-  title_xpath '//h1[@class="heading"]'
-  published_time_xpath '//div[@class="date"]'
-  author_xpath '//div[@class="author"]/text()'
-  images_xpath ''
-  image_caption_xpath ''
+  title_xpath '//div[@class="article-Block"]/h1'
+  published_time_xpath '//li[@class="date"]'
+  author_xpath '//li[@class="writer"]/a'
+  images_xpath '//div[contains(@class, "alignright")]'
+  image_caption_xpath '//div[contains(@class, "wp-caption-text")]'

-  body_xpath '//div[@class="column"]'
+  body_xpath '//div[@class="fs-content"]'
   noisy_elems_xpaths %W(//div[@class="listBlock-tag"] //div[@class="headingBlock-article"]
                         //div[@class="columnBlock-value"]
                         //div[@class="columnBlock-socialBookmark"] //div[@class="listBlock-pagenation"])
