Commit 67b12af

Author: cuzic
Message: fix the fsight crawler
Parent: f4f395a

14 files changed, +224 −76 lines
.gitignore (+2)

@@ -0,0 +1,2 @@
+*rc
+*~

lib/.gitignore (+2)

@@ -0,0 +1,2 @@
+database_setting.rb
+

lib/crawler.rb (+25 −12)

@@ -31,7 +31,15 @@ def self.crawl_rss
     count = UrlsToCrawl.append_urls curls, feed
   end

-  def self.get_canonical_url url
+  def self.get_canonical_url url, limit = 10
+    return url if limit == 0
+    uri = URI(url)
+    Net::HTTP.start(uri.host, uri.port) do |http|
+      response = http.head(uri.request_uri)
+      if Net::HTTPRedirection === response then
+        return get_canonical_url(response["location"], limit - 1)
+      end
+    end
     return url
   end

@@ -60,14 +68,15 @@ def self.crawl_article
       crawler = find_crawler url
       crawler ||= self
       article = crawler.fetch_whole_article url
+
       if article.nil? then
         UrlsToCrawl.finish url
         next
       end

       crawler.after_crawl article

-      if article["id"] then
+      if article["md5"] then
         Articles.regist article
       end
       UrlsToCrawl.finish url
@@ -129,6 +138,7 @@ def self.fetch_whole_article url
   rescue StandardError => e
     case e.to_s
     when /404 Not Found/
+      $stderr.puts "404 Not Found"
       return nil
     else
       raise e
@@ -139,20 +149,23 @@ def self.after_crawl article
     article["images"].map! do |image|
       case image["url"]
       when /\.jpg$/i then
-        type = "image/jpeg"
-        digest = Digest::MD5.hexdigest(image["url"])
-        filename = digest + ".jpg"
-        image["file"] = httpclient.get(image["url"]) rescue nil
-        image["filename"] = filename
-        image["type"] = type
-        image["md5"] = digest
-        image
+        begin
+          type = "image/jpeg"
+          digest = Digest::MD5.hexdigest(image["url"])
+          filename = digest + ".jpg"
+          binary = httpclient.get(image["url"])
+          image["file"] = binary
+          image["filename"] = filename
+          image["type"] = type
+          image["md5"] = digest
+          image
+        rescue
+          nil
+        end
       else
         nil
       end
     end.reject! {|value| value.nil? }
-
-    article["id"] = Digest::MD5.digest article["url"] + article["title"] rescue nil
   end

   def self.httpclient
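
Note: the new get_canonical_url resolves redirect chains with HEAD requests and a hop limit. A standalone sketch of the same approach (the name canonical_url is illustrative; like the patched code, it never enables TLS, so https redirect targets are not followed):

require "net/http"
require "uri"

# Follow plain-http redirects via HEAD requests, giving up after `limit` hops.
def canonical_url(url, limit = 10)
  return url if limit == 0
  uri = URI(url)
  Net::HTTP.start(uri.host, uri.port) do |http|
    response = http.head(uri.request_uri)
    # 3xx responses carry the next hop in the Location header
    if Net::HTTPRedirection === response
      return canonical_url(response["location"], limit - 1)
    end
  end
  url
end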

lib/formatter.rb (+7 −6)

@@ -136,10 +136,10 @@ def self.epub epub_filename, duration, feeds = nil
       end
     end
   end
-  compose_epub epub_filename, generator
+  compose_epub epub_filename, generator, feeds
 end

-def self.compose_epub epub_filename, generator
+def self.compose_epub epub_filename, generator, title
   mimetype = file_read "template/mimetype"
   container_xml = file_read "template/container.xml"

@@ -159,9 +159,8 @@ def self.compose_epub epub_filename, generator
     article.body = self.format record
     article.created_at = record["created_at"]
     article.images = record["images"].map do |image|
-      image_md5 = image["md5"]
       {
-        :id => image_md5,
+        :id => image["md5"],
         :filename => image["filename"],
         :file => image["file"],
         :type => image["type"]
@@ -196,7 +195,7 @@ def self.compose_epub epub_filename, generator
     r << {
       :id => article.md5,
       :href => article.filename,
-      :type => article.type
+      :type => article.type || "application/xhtml+xml"
     }
     article.images.each do |image|
       r << {
@@ -228,17 +227,19 @@ def self.compose_epub epub_filename, generator
   name = "takibi"

   content_opf = erb_result "content.opf.erb" do
-    @title = name + " " + Time.now.strftime("%Y-%m-%d")
+    @title = title + " " + Time.now.strftime("%Y-%m-%d")
     @author = name
     @publisher = name
     @items = opf_items.uniq
     @itemrefs = opf_itemrefs
+    @uuid = @@uuid
   end

   toc_ncx = erb_result "toc.ncx.erb" do
     @title = name
     @author = name
     @nav_points = nav_points
+    @uuid = @@uuid
   end

   toc_xhtml = erb_result "toc.xhtml.erb" do
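
Note: erb_result itself is not part of this diff; the call sites above suggest the block assigns instance variables (@title, @uuid, ...) that the named template then reads. A hypothetical minimal sketch of that pattern (ErbContext and render are invented names):

require "erb"

class ErbContext
  def render(template_text, &block)
    instance_eval(&block)                   # runs `@title = ...` against self
    ERB.new(template_text).result(binding)  # the template sees those ivars
  end
end

template = "<dc:title><%= @title %></dc:title>"
puts ErbContext.new.render(template) { @title = "takibi " + Time.now.strftime("%Y-%m-%d") }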

lib/model.rb (+1 −1)

@@ -163,7 +163,7 @@ def self.fetch_row row
       "published_time" => row[:published_time],
       "created_at" => row[:created_at],
       "body" => (row[:body] || "").force_encoding("utf-8"),
-      "images" => unpack((row[:images] || "").force_encoding("utf-8")),
+      "images" => unpack((row[:images] || "").force_encoding("ASCII-8BIT")),
     }
   else
     article = {
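
Note: MessagePack output is raw bytes, so a packed images column handed back tagged as UTF-8 can fail to decode; re-tagging as ASCII-8BIT (binary) avoids that. A minimal illustration, independent of this project's pack/unpack helpers (whether UTF-8-tagged input actually raises depends on the msgpack gem version):

require "msgpack"

packed = MessagePack.pack([{"md5" => "abc", "filename" => "abc.jpg"}])
stored = packed.dup.force_encoding("utf-8")            # as a DB driver might return it
images = MessagePack.unpack(stored.force_encoding("ASCII-8BIT"))
p images  # => [{"md5"=>"abc", "filename"=>"abc.jpg"}]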

lib/parser.rb (+38 −9)

@@ -120,14 +120,31 @@ def self.extract_author doc, url
   end

   def self.extract_images doc, url
-    doc.xpath(images_xpath).map do |div|
-      path = div.xpath('.//img').first[:src]
-      url = URI.join(url, path).to_s
-      caption = div.xpath(image_caption_xpath).text.strip
-      {"url" => url, "caption" => caption}
+    urls = []
+    unless images_xpath.empty? then
+      urls += doc.xpath(images_xpath).map do |div|
+        path = div.xpath('.//img').first[:src]
+        src = URI.join(url, path).to_s
+        caption = div.xpath(image_caption_xpath).text.strip
+        {"url" => src, "caption" => caption}
+      end
     end
-  rescue
-    return []
+    urls +=
+      doc.xpath(body_xpath + "//img").map do |img|
+        path = img[:src]
+        src = ""
+        if path.start_with?("http://")
+          src = path
+        else
+          src = URI.join(url, path).to_s
+        end
+        caption = img[:title] || img[:alt]
+        {"url" => src, "caption" => caption}
+      end
+    urls
+  #rescue
+    #$stderr.puts $!.inspect
+    #return []
   end

   @@default_body_xpath = '//div[@id="main-contents"]'
@@ -149,13 +166,25 @@ def self.extract_body doc, url
     body.xpath('.//a[@href]').each do |anchor|
       begin
         path = anchor[:href].strip
-        url = URI.join(url, path).to_s
-        anchor.set_attribute("href", url)
+        href = URI.join(url, path).to_s
+        anchor.set_attribute("href", href)
         anchor.remove_attribute "onclick"
       rescue
       end
     end

+    body.xpath('.//img[@src]').each do |img|
+      path = img[:src].strip
+      if path.start_with?("http://") then
+        src = path
+      else
+        src = URI.join(url, path).to_s
+      end
+      digest = Digest::MD5.hexdigest(src)
+      suffix = src[/(\.\w+)$/, 1]
+      img.set_attribute("src", "#{digest}#{suffix}")
+    end
+
     noisy_elems_xpaths.each do |xpath|
       body.xpath(xpath).each do |node|
         node.remove
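
Note: the new img pass in extract_body renames every image source to the MD5-based filename the crawler stores the bytes under, so the article body and its image files line up inside the EPUB. A self-contained sketch of that rewrite (the sample HTML and page_url are hypothetical):

require "nokogiri"
require "digest/md5"
require "uri"

html = '<div><img src="/photos/chart.png"></div>'
page_url = "http://www.fsight.jp/15127"
doc = Nokogiri::HTML(html)
doc.xpath("//img[@src]").each do |img|
  src = URI.join(page_url, img[:src].strip).to_s  # resolve relative paths
  digest = Digest::MD5.hexdigest(src)
  suffix = src[/(\.\w+)$/, 1]                     # keep the original extension
  img.set_attribute("src", "#{digest}#{suffix}")
end
puts doc.at_xpath("//img")[:src]  # => <32-hex-digest>.png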

plugins/diamond/diamond_crawler.rb (+21 −21)

@@ -10,27 +10,27 @@ def match url
     url.include?("diamond.jp/")
   end

-  def self.httpclient
-    return @httpclient if defined? @httpclient and @httpclient
-    load_config
-    m = Mechanize.new
-    login_url = "https://web.diamond.jp/member/memberpage.cgi"
-    m.get login_url do |login_page|
-      h = {:action => "memberpage.cgi"}
-      logged_in = login_page.form_with(h) do |form|
-        form.mail = Takibi::DiamondConf["email"]
-        form.pass = Takibi::DiamondConf["password"]
-      end.click_button
-    end
-    def m.get url
-      page = super url
-      return page.body
-    rescue
-      nil
-    end
-    @httpclient = m
-    return @httpclient
-  end
+  # def self.httpclient
+  #   return @httpclient if defined? @httpclient and @httpclient
+  #   load_config
+  #   m = Mechanize.new
+  #   login_url = "https://web.diamond.jp/member/memberpage.cgi"
+  #   m.get login_url do |login_page|
+  #     h = {:action => "memberpage.cgi"}
+  #     logged_in = login_page.form_with(h) do |form|
+  #       form.mail = Takibi::DiamondConf["email"]
+  #       form.pass = Takibi::DiamondConf["password"]
+  #     end.click_button
+  #   end
+  #   def m.get url
+  #     page = super url
+  #     return page.body
+  #   rescue
+  #     nil
+  #   end
+  #   @httpclient = m
+  #   return @httpclient
+  # end

   def self.load_config
     filename = File.join(File.dirname(__FILE__), "diamondrc")
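
Note: the disabled method relies on the def m.get idiom, which stays live in the fsight crawler: a singleton method overrides get on one Mechanize instance so callers receive the response body, or nil on failure, rather than a page object. A minimal sketch of just that idiom, without the login flow:

require "rubygems"
require "mechanize"

m = Mechanize.new
# Singleton method: only this instance's #get changes.
def m.get url
  page = super url
  page.body  # raw HTML instead of a Mechanize::Page
rescue Mechanize::ResponseCodeError, Errno::ETIMEDOUT
  nil        # swallow fetch errors so a crawl loop can continue
end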

plugins/fsight/fsight_crawler.rb (+43 −13)

@@ -1,35 +1,41 @@
 # encoding: utf-8
 #

-#TAKIBI_ROOT = "."
-require "lib/crawler"
-require "rubygems"
-require "mechanize"
+unless defined? TAKIBI_ROOT
+  TAKIBI_ROOT = "."
+  require "lib/crawler"
+  require "rubygems"
+  require 'mechanize'
+end

 module Takibi
   class FsightCrawler < Crawler
-    rss_url "http://www.fsight.jp/rss/article/all/all/rss.xml"
+    rss_url "http://www.fsight.jp/feed"

     def match url
       url.include?("fsight.jp/")
     end

     def self.httpclient
-      return @httpclient if defined? @httpclient and @httpclient
+      # return @httpclient if defined? @httpclient and @httpclient
       load_config
       m = Mechanize.new
-      login_url = "https://www.fsight.jp/user"
+      login_url = "https://www.fsight.jp/login"
       login_page = m.get login_url
-      form = login_page.forms[2]
+      form = login_page.forms[0]

-      form["name"] = Takibi::FsightConf["name"]
-      form["pass"] = Takibi::FsightConf["password"]
+      form["log"] = Takibi::FsightConf["name"]
+      form["pwd"] = Takibi::FsightConf["password"]

       m.submit form

       def m.get url
         page = super url
         return page.body
+      rescue Mechanize::ResponseCodeError
+        return nil
+      rescue Errno::ETIMEDOUT
+        return nil
       end
       @httpclient = m
       return @httpclient
@@ -42,9 +48,33 @@ def self.load_config
   end
 end

+def pack binary
+  [MessagePack.pack(binary)].pack("m")
+end
+
+def unpack binary
+  MessagePack.unpack binary.unpack("m").first rescue []
+end
+
 if $0 == __FILE__
   require 'pp'
-  url = "http://www.fsight.jp/article/10668"
-  article = Takibi::FsightCrawler.httpclient.get url
-  puts article
+  require File.dirname(__FILE__) + "/fsight_parser"
+  require 'msgpack'
+  require 'lib/model'
+  url = "http://www.fsight.jp/15127"
+  if false then
+    record = Takibi::FsightCrawler.fetch_whole_article url
+    Takibi::FsightCrawler.after_crawl record
+    mime64 = pack record["images"]
+    open("tmp/unpack2.txt", "w") do |w|
+      w.write mime64
+    end
+
+    Takibi::Articles.regist record
+  else
+    Takibi::Articles.fetch(["url = '#{url}'"]) do |article|
+      pp article["images"]
+    end
+  end
 end
+
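
Note: the new pack/unpack helpers layer Base64 ("m" in Array#pack / String#unpack) over MessagePack so binary image blobs survive storage as text. Round-trip check, assuming the msgpack gem:

require "msgpack"

def pack binary
  [MessagePack.pack(binary)].pack("m")  # msgpack bytes -> Base64 text
end

def unpack binary
  MessagePack.unpack binary.unpack("m").first rescue []
end

images = [{"md5" => "abc", "filename" => "abc.jpg"}]
p unpack(pack(images))  # => [{"md5"=>"abc", "filename"=>"abc.jpg"}]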

plugins/fsight/fsight_parser.rb (+7 −7)

@@ -11,15 +11,15 @@ def self.extract src, url
     super src, url
   end

-  rss_regex %r(fsight.jp/article)
+  rss_regex %r(fsight.jp/)

-  title_xpath '//h1[@class="heading"]'
-  published_time_xpath '//div[@class="date"]'
-  author_xpath '//div[@class="author"]/text()'
-  images_xpath ''
-  image_caption_xpath ''
+  title_xpath '//div[@class="article-Block"]/h1'
+  published_time_xpath '//li[@class="date"]'
+  author_xpath '//li[@class="writer"]/a'
+  images_xpath '//div[contains(@class, "alignright")]'
+  image_caption_xpath '//div[contains(@class, "wp-caption-text")]'

-  body_xpath '//div[@class="column"]'
+  body_xpath '//div[@class="fs-content"]'
   noisy_elems_xpaths %W(//div[@class="listBlock-tag"] //div[@class="headingBlock-article"]
                         //div[@class="columnBlock-value"]
                         //div[@class="columnBlock-socialBookmark"] //div[@class="listBlock-pagenation"])
