Skip to content

Commit e8f859e

Browse files
author
Kristinn Sigurðsson
committed
Merge pull request #251 from kris-sigur/Issue-116
Issue 116 and 250
2 parents 98741c5 + b9cc524 commit e8f859e

File tree

8 files changed

+529
-240
lines changed

8 files changed

+529
-240
lines changed

pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,12 @@
388388
</developers>
389389

390390
<contributors>
391+
<contributor>
392+
<name>Mohamed Elsayed</name>
393+
<url>https://github.com/MohammedElsayyed</url>
394+
<organization>The New Library of Alexandria</organization>
395+
<organizationUrl>http://bibalex.org/</organizationUrl>
396+
</contributor>
391397
</contributors>
392398

393399
<reporting>

src/site/xdoc/release_notes.xml

292 Bytes
Binary file not shown.

wayback-core/pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@
125125
<version>2.5.1</version>
126126
<scope>test</scope>
127127
</dependency>
128+
<dependency>
129+
<groupId>org.apache.httpcomponents</groupId>
130+
<artifactId>httpclient</artifactId>
131+
<version>4.3.5</version>
132+
<type>jar</type>
133+
</dependency>
128134
</dependencies>
129135

130136
</project>
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
/*
2+
* This file is part of the Wayback archival access software
3+
* (http://archive-access.sourceforge.net/projects/wayback/).
4+
*
5+
* Licensed to the Internet Archive (IA) by one or more individual
6+
* contributors.
7+
*
8+
* The IA licenses this file to You under the Apache License, Version 2.0
9+
* (the "License"); you may not use this file except in compliance with
10+
* the License. You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
package org.archive.wayback.liveweb;
21+
22+
import java.io.ByteArrayInputStream;
23+
import java.io.IOException;
24+
import java.net.ConnectException;
25+
import java.net.SocketException;
26+
import java.net.SocketTimeoutException;
27+
import java.net.URL;
28+
import java.util.logging.Logger;
29+
import java.util.zip.GZIPInputStream;
30+
31+
import org.apache.commons.httpclient.ConnectTimeoutException;
32+
import org.apache.commons.httpclient.HostConfiguration;
33+
import org.apache.commons.httpclient.HttpClient;
34+
import org.apache.commons.httpclient.HttpMethod;
35+
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
36+
import org.apache.commons.httpclient.NoHttpResponseException;
37+
import org.apache.commons.httpclient.methods.GetMethod;
38+
import org.apache.commons.httpclient.params.HttpClientParams;
39+
import org.archive.io.arc.ARCRecord;
40+
import org.archive.wayback.core.Resource;
41+
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
42+
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
43+
import org.archive.wayback.exception.LiveWebTimeoutException;
44+
import org.archive.wayback.exception.ResourceNotAvailableException;
45+
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
46+
import org.archive.wayback.resourcestore.resourcefile.ResourceFactory;
47+
48+
/**
49+
* This class fetches resource from live web.
50+
* It works with {@link ARCRecordingProxy} not standard proxy servers.
51+
*
52+
* @author brad
53+
* @see LiveWebCache
54+
* @see StdRemoteLiveWebCache
55+
*
56+
*/
57+
public class ArcRemoteLiveWebCache implements LiveWebCache {
58+
private static final Logger LOGGER = Logger.getLogger(
59+
ArcRemoteLiveWebCache.class.getName());
60+
61+
protected MultiThreadedHttpConnectionManager connectionManager = null;
62+
protected HostConfiguration hostConfiguration = null;
63+
protected HttpClient http = null;
64+
protected String requestPrefix = null;
65+
66+
/**
67+
*
68+
*/
69+
public ArcRemoteLiveWebCache() {
70+
connectionManager = new MultiThreadedHttpConnectionManager();
71+
hostConfiguration = new HostConfiguration();
72+
HttpClientParams params = new HttpClientParams();
73+
params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler());
74+
http = new HttpClient(params,connectionManager);
75+
http.setHostConfiguration(hostConfiguration);
76+
}
77+
78+
/* (non-Javadoc)
79+
* @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean)
80+
*/
81+
public Resource getCachedResource(URL url, long maxCacheMS,
82+
boolean bUseOlder) throws LiveDocumentNotAvailableException,
83+
LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException {
84+
String urlString = url.toExternalForm();
85+
86+
if (requestPrefix != null) {
87+
urlString = requestPrefix + urlString;
88+
}
89+
90+
HttpMethod method = null;
91+
try {
92+
method = new GetMethod(urlString);
93+
} catch(IllegalArgumentException e) {
94+
LOGGER.warning("Bad URL for live web fetch:" + urlString);
95+
throw new LiveDocumentNotAvailableException("Url:" + urlString +
96+
"does not look like an URL?");
97+
}
98+
boolean success = false;
99+
try {
100+
int status = http.executeMethod(method);
101+
if(status == 200) {
102+
103+
ByteArrayInputStream bais = new ByteArrayInputStream(method.getResponseBody());
104+
ARCRecord r = new ARCRecord(
105+
new GZIPInputStream(bais),
106+
"id",0L,false,false,true);
107+
ArcResource ar = (ArcResource)
108+
ResourceFactory.ARCArchiveRecordToResource(r, null);
109+
if(ar.getStatusCode() == 502) {
110+
throw new LiveDocumentNotAvailableException(urlString);
111+
} else if(ar.getStatusCode() == 504) {
112+
throw new LiveWebTimeoutException("Timeout:" + urlString);
113+
}
114+
success = true;
115+
return ar;
116+
117+
} else {
118+
throw new LiveWebCacheUnavailableException(urlString);
119+
}
120+
121+
} catch (ResourceNotAvailableException e) {
122+
throw new LiveDocumentNotAvailableException(urlString);
123+
124+
} catch (NoHttpResponseException e) {
125+
126+
throw new LiveWebCacheUnavailableException("No Http Response for "
127+
+ urlString);
128+
129+
} catch (ConnectException e) {
130+
throw new LiveWebCacheUnavailableException(e.getLocalizedMessage()
131+
+ " : " + urlString);
132+
} catch (SocketException e) {
133+
throw new LiveWebCacheUnavailableException(e.getLocalizedMessage()
134+
+ " : " + urlString);
135+
} catch (SocketTimeoutException e) {
136+
throw new LiveWebTimeoutException(e.getLocalizedMessage()
137+
+ " : " + urlString);
138+
} catch(ConnectTimeoutException e) {
139+
throw new LiveWebTimeoutException(e.getLocalizedMessage()
140+
+ " : " + urlString);
141+
} finally {
142+
if (!success) {
143+
method.abort();
144+
}
145+
method.releaseConnection();
146+
}
147+
}
148+
149+
/* (non-Javadoc)
150+
* @see org.archive.wayback.liveweb.LiveWebCache#shutdown()
151+
*/
152+
public void shutdown() {
153+
// TODO Auto-generated method stub
154+
}
155+
156+
157+
/**
158+
* @param hostPort to proxy requests through - ex. "localhost:3128"
159+
*/
160+
public void setProxyHostPort(String hostPort) {
161+
int colonIdx = hostPort.indexOf(':');
162+
if(colonIdx > 0) {
163+
String host = hostPort.substring(0,colonIdx);
164+
int port = Integer.valueOf(hostPort.substring(colonIdx+1));
165+
166+
// http.getHostConfiguration().setProxy(host, port);
167+
hostConfiguration.setProxy(host, port);
168+
}
169+
}
170+
/**
171+
* @param maxTotalConnections the HttpConnectionManagerParams config
172+
*/
173+
public void setMaxTotalConnections(int maxTotalConnections) {
174+
connectionManager.getParams().
175+
setMaxTotalConnections(maxTotalConnections);
176+
}
177+
/**
178+
* @return the HttpConnectionManagerParams maxTotalConnections config
179+
*/
180+
public int getMaxTotalConnections() {
181+
return connectionManager.getParams().getMaxTotalConnections();
182+
}
183+
184+
/**
185+
* @param maxHostConnections the HttpConnectionManagerParams config
186+
*/
187+
public void setMaxHostConnections(int maxHostConnections) {
188+
connectionManager.getParams().
189+
setMaxConnectionsPerHost(hostConfiguration, maxHostConnections);
190+
}
191+
192+
/**
193+
* @return the HttpConnectionManagerParams maxHostConnections config
194+
*/
195+
public int getMaxHostConnections() {
196+
return connectionManager.getParams().
197+
getMaxConnectionsPerHost(hostConfiguration);
198+
}
199+
200+
/**
201+
* @return the connectionTimeoutMS
202+
*/
203+
public int getConnectionTimeoutMS() {
204+
return connectionManager.getParams().getConnectionTimeout();
205+
}
206+
207+
/**
208+
* @param connectionTimeoutMS the connectionTimeoutMS to set
209+
*/
210+
public void setConnectionTimeoutMS(int connectionTimeoutMS) {
211+
connectionManager.getParams().setConnectionTimeout(connectionTimeoutMS);
212+
}
213+
214+
/**
215+
* @return the socketTimeoutMS
216+
*/
217+
public int getSocketTimeoutMS() {
218+
return connectionManager.getParams().getSoTimeout();
219+
}
220+
221+
/**
222+
* @param socketTimeoutMS the socketTimeoutMS to set
223+
*/
224+
public void setSocketTimeoutMS(int socketTimeoutMS) {
225+
connectionManager.getParams().setSoTimeout(socketTimeoutMS);
226+
}
227+
228+
public String getRequestPrefix() {
229+
return requestPrefix;
230+
}
231+
232+
public void setRequestPrefix(String requestPrefix) {
233+
this.requestPrefix = requestPrefix;
234+
}
235+
236+
public HttpClient getHttpClient()
237+
{
238+
return http;
239+
}
240+
}

wayback-core/src/main/java/org/archive/wayback/liveweb/LiveRobotsNoCache.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import com.google.common.io.ByteStreams;
1818

19-
public class LiveRobotsNoCache extends RemoteLiveWebCache {
19+
public class LiveRobotsNoCache extends ArcRemoteLiveWebCache {
2020

2121
protected int maxRobotsSize = 512000;
2222

0 commit comments

Comments
 (0)