@@ -120,14 +120,31 @@ def self.extract_author doc, url
120
120
end
121
121
122
122
# Collects the images referenced by a page.
#
# Two sources are combined, in this order:
# 1. every container matched by +images_xpath+ (skipped entirely when that
#    XPath is empty): the first <img> inside the container supplies the URL,
#    and the stripped text under +image_caption_xpath+ supplies the caption;
# 2. every <img> under +body_xpath+: the caption is the +title+ attribute,
#    falling back to +alt+.
#
# doc - parsed document responding to #xpath
#       (NOTE(review): presumably a Nokogiri document - confirm).
# url - String URL of the page; relative image paths are resolved against it.
#
# Returns an Array of Hashes of the form { "url" => String, "caption" => ... }.
# An image that has no usable source (container without <img>, missing or
# blank src, or a path URI cannot parse) is skipped on its own, so one bad
# image no longer aborts the whole extraction.
def self.extract_images doc, url
  # Turns an image path into an absolute URL string, or nil when unusable.
  resolve = lambda do |path|
    path = path.to_s.strip
    next nil if path.empty?
    # Already absolute: keep as-is and do not let URI reject odd characters.
    next path if path.start_with?("http://", "https://")
    begin
      URI.join(url, path).to_s
    rescue URI::Error
      nil
    end
  end

  images = []

  unless images_xpath.empty?
    doc.xpath(images_xpath).each do |div|
      img = div.xpath('.//img').first
      src = img && resolve.call(img[:src])
      next unless src
      caption = div.xpath(image_caption_xpath).text.strip
      images << { "url" => src, "caption" => caption }
    end
  end

  doc.xpath(body_xpath + "//img").each do |img|
    src = resolve.call(img[:src])
    next unless src
    images << { "url" => src, "caption" => img[:title] || img[:alt] }
  end

  images
end
132
149
133
150
# XPath string selecting <div> elements whose id attribute is "main-contents".
# NOTE(review): the name suggests this is the fallback body selector - confirm against the code that reads it.
@@default_body_xpath = '//div[@id="main-contents"]'
@@ -149,13 +166,25 @@ def self.extract_body doc, url
149
166
body . xpath ( './/a[@href]' ) . each do |anchor |
150
167
begin
151
168
path = anchor [ :href ] . strip
152
- url = URI . join ( url , path ) . to_s
153
- anchor . set_attribute ( "href" , url )
169
+ href = URI . join ( url , path ) . to_s
170
+ anchor . set_attribute ( "href" , href )
154
171
anchor . remove_attribute "onclick"
155
172
rescue
156
173
end
157
174
end
158
175
176
+ body . xpath ( './/img[@src]' ) . each do |img |
177
+ path = img [ :src ] . strip
178
+ if path . start_with? ( "http://" ) then
179
+ src = path
180
+ else
181
+ src = URI . join ( url , path ) . to_s
182
+ end
183
+ digest = Digest ::MD5 . hexdigest ( src )
184
+ suffix = src [ /(\. \w +)$/ , 1 ]
185
+ img . set_attribute ( "src" , "#{ digest } #{ suffix } " )
186
+ end
187
+
159
188
noisy_elems_xpaths . each do |xpath |
160
189
body . xpath ( xpath ) . each do |node |
161
190
node . remove
0 commit comments