better handling of status codes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150837 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
cmarschner 2002-10-22 15:02:43 +00:00
parent 5cf76aa76a
commit b6243a3cbe
1 changed files with 323 additions and 150 deletions

View File

@ -1,4 +1,5 @@
/* ==================================================================== /*
* ====================================================================
* The Apache Software License, Version 1.1 * The Apache Software License, Version 1.1
* *
* Copyright (c) 2001 The Apache Software Foundation. All rights * Copyright (c) 2001 The Apache Software Foundation. All rights
@ -51,7 +52,6 @@
* information on the Apache Software Foundation, please see * information on the Apache Software Foundation, please see
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
package de.lanlab.larm.fetcher; package de.lanlab.larm.fetcher;
import java.net.URL; import java.net.URL;
@ -78,17 +78,20 @@ import de.lanlab.larm.net.*;
* this class gets the documents from the web. It connects to the server given * this class gets the documents from the web. It connects to the server given
* by the IP address in the URLMessage, gets the document, and forwards it to * by the IP address in the URLMessage, gets the document, and forwards it to
* the storage. If it's an HTML document, it will be parsed and all links will * the storage. If it's an HTML document, it will be parsed and all links will
* be put into the message handler again. * be put into the message handler again. stores contents of the files in field
* * "contents"
* stores contents of the files in field "contents"
* *
* @author Clemens Marschner * @author Clemens Marschner
* @created 28. Juni 2002
* @version $Id$ * @version $Id$
*/ */
public class FetcherTask public class FetcherTask
implements InterruptableTask, LinkHandler, Serializable implements InterruptableTask, LinkHandler, Serializable
{ {
/**
* Description of the Field
*/
protected volatile boolean isInterrupted = false; protected volatile boolean isInterrupted = false;
/** /**
@ -109,8 +112,7 @@ public class FetcherTask
private volatile URL base; private volatile URL base;
/** /**
* the URL of the docuzment * the URL of the docuzment only valid within a doTask call
* only valid within a doTask call
*/ */
private volatile URL contextUrl; private volatile URL contextUrl;
@ -120,8 +122,7 @@ public class FetcherTask
protected static volatile MessageHandler messageHandler; protected static volatile MessageHandler messageHandler;
/** /**
* actual number of bytes read * actual number of bytes read only valid within a doTask call
* only valid within a doTask call
*/ */
private volatile long bytesRead = 0; private volatile long bytesRead = 0;
@ -135,30 +136,61 @@ public class FetcherTask
*/ */
private static volatile LinkStorage linkStorage; private static volatile LinkStorage linkStorage;
/** /**
* task state IDs. comparisons will be done by their references, so always * task state IDs. comparisons will be done by their references, so always
* use the IDs * use the IDs
*/ */
public final static String FT_IDLE = "idle"; public final static String FT_IDLE = "idle";
/**
* Description of the Field
*/
public final static String FT_STARTED = "started"; public final static String FT_STARTED = "started";
/**
* Description of the Field
*/
public final static String FT_OPENCONNECTION = "opening connection"; public final static String FT_OPENCONNECTION = "opening connection";
/**
* Description of the Field
*/
public final static String FT_CONNECTING = "connecting"; public final static String FT_CONNECTING = "connecting";
/**
* Description of the Field
*/
public final static String FT_GETTING = "getting"; public final static String FT_GETTING = "getting";
/**
* Description of the Field
*/
public final static String FT_READING = "reading"; public final static String FT_READING = "reading";
/**
* Description of the Field
*/
public final static String FT_SCANNING = "scanning"; public final static String FT_SCANNING = "scanning";
/**
* Description of the Field
*/
public final static String FT_STORING = "storing"; public final static String FT_STORING = "storing";
/**
* Description of the Field
*/
public final static String FT_READY = "ready"; public final static String FT_READY = "ready";
/**
* Description of the Field
*/
public final static String FT_CLOSING = "closing"; public final static String FT_CLOSING = "closing";
/**
* Description of the Field
*/
public final static String FT_EXCEPTION = "exception"; public final static String FT_EXCEPTION = "exception";
/**
* Description of the Field
*/
public final static String FT_INTERRUPTED = "interrupted"; public final static String FT_INTERRUPTED = "interrupted";
private volatile State taskState = new State(FT_IDLE); private volatile State taskState = new State(FT_IDLE);
/** /**
* the URLs found will be stored and only added to the message handler in the very * the URLs found will be stored and only added to the message handler in
* end, to avoid too many synchronizations * the very end, to avoid too many synchronizations
*/ */
private volatile LinkedList foundUrls; private volatile LinkedList foundUrls;
@ -172,17 +204,6 @@ public class FetcherTask
*/ */
private volatile String title; private volatile String title;
/**
* headers for HTTPClient
*/
private static volatile NVPair headers[] = new NVPair[1];
static
{
headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
}
/** /**
* Gets a copy of the current taskState * Gets a copy of the current taskState
@ -227,6 +248,7 @@ public class FetcherTask
FetcherTask.docStorage = docStorage; FetcherTask.docStorage = docStorage;
} }
/** /**
* Sets the document linkStorage * Sets the document linkStorage
* *
@ -268,27 +290,54 @@ public class FetcherTask
return actURLMessage.getUrl(); return actURLMessage.getUrl();
} }
volatile SimpleLogger log; volatile SimpleLogger log;
volatile SimpleLogger errorLog; volatile SimpleLogger errorLog;
volatile HostManager hostManager; volatile HostManager hostManager;
volatile HostResolver hostResolver;
//private long startTime; //private long startTime;
/** /**
* this will be called by the fetcher thread and will do all the work * this will be called by the fetcher thread and will do all the work
* *
* @TODO probably split this up into different processing steps
* @param thread Description of the Parameter * @param thread Description of the Parameter
* @TODO probably split this up into different processing steps
*/ */
public void run(ServerThread thread) public void run(ServerThread thread)
{ {
taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
taskState.setState(FT_STARTED);
// state information is always set to make the thread monitor happy
log = thread.getLog(); log = thread.getLog();
hostManager = ((FetcherThread) thread).getHostManager(); hostManager = ((FetcherThread) thread).getHostManager();
hostResolver = hostManager.getHostResolver();
base = contextUrl = actURLMessage.getUrl();
String urlString = actURLMessage.getURLString();
String host = contextUrl.getHost().toLowerCase();
HostInfo hi = hostManager.getHostInfo(host);
// System.out.println("FetcherTask with " + urlString + " started");
if(actURLMessage.linkType == URLMessage.LINKTYPE_REDIRECT)
{
taskState.setState(FT_READY, null);
hi.releaseLock();
return; // we've already crawled that (see below)
}
NVPair[] headers = ((FetcherThread) thread).getDefaultHeaders();
int numHeaders = ((FetcherThread) thread).getUsedDefaultHeaders();
boolean isIncremental = false;
if (actURLMessage instanceof WebDocument)
{
// this is an incremental crawl where we only have to check whether the doc crawled
// is newer
isIncremental = true;
headers[numHeaders] = new NVPair("If-Modified-Since", HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
}
//HostManager hm = ((FetcherThread)thread).getHostManager(); //HostManager hm = ((FetcherThread)thread).getHostManager();
errorLog = thread.getErrorLog(); errorLog = thread.getErrorLog();
@ -297,21 +346,19 @@ public class FetcherTask
int threadNr = ((FetcherThread) thread).getThreadNumber(); int threadNr = ((FetcherThread) thread).getThreadNumber();
log.log("start"); log.log("start");
base = contextUrl = actURLMessage.getUrl();
String urlString = actURLMessage.getURLString();
String host = contextUrl.getHost().toLowerCase();
int hostPos = urlString.indexOf(host); int hostPos = urlString.indexOf(host);
int hostLen = host.length(); int hostLen = host.length();
HostInfo hi = hostManager.getHostInfo(host); // get and create // get and create
if (!hi.isHealthy()) if (!hi.isHealthy())
{ {
// we make this check as late as possible to get the most current information // we make this check as late as possible to get the most current information
log.log("Bad Host: " + contextUrl + "; returning"); log.log("Bad Host: " + contextUrl + "; returning");
System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl()); // System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
taskState.setState(FT_READY, null); taskState.setState(FT_READY, null);
hi.releaseLock();
return; return;
} }
@ -319,14 +366,13 @@ public class FetcherTask
HTTPConnection conn = null; HTTPConnection conn = null;
title = "*untitled*"; title = "";
int size = 1; int size = 1;
InputStream in = null; InputStream in = null;
bytesRead = 0; bytesRead = 0;
try try
{ {
@ -339,6 +385,7 @@ public class FetcherTask
conn = new HTTPConnection(host); conn = new HTTPConnection(host);
conn.setDefaultTimeout(75000); conn.setDefaultTimeout(75000);
// 75 s // 75 s
conn.setDefaultAllowUserInteraction(false); conn.setDefaultAllowUserInteraction(false);
@ -353,8 +400,99 @@ public class FetcherTask
int contentLength = 0; int contentLength = 0;
Date date = null; Date date = null;
if (statusCode != 404 && statusCode != 403) if (isIncremental)
{ {
// experimental
System.out.println("ftask: if modified since: " + HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
}
URL realURL;
switch (statusCode)
{
case 404: // file not found
case 403: // access forbidden
// if this is an incremental crawl, remove the doc from the repository
if (isIncremental)
{
WebDocument d = (WebDocument) actURLMessage;
d.setResultCode(statusCode);
// the repository will remove the doc if this statuscode is matched
docStorage.store(d);
}
// otherwise, do nothing
// Todo: we could add an error marker to the referal link
break;
case 304:
// not modified
System.out.println("ftask: -> not modified");
// "not modified since"
taskState.setState(FT_STORING, ipURL);
// let the repository take care of the links
// it will determine that this is the old document (because it already
// has a docId), and will put back the links associated with it
try
{
WebDocument doc = (WebDocument) this.actURLMessage;
doc.setModified(false);
docStorage.store(doc);
this.bytesRead += doc.getSize();
}
catch (ClassCastException e)
{
System.out.println("error while casting to WebDoc: " + actURLMessage.getInfo());
}
break;
case 301: // moved permanently
case 302: // moved temporarily
case 303: // see other
case 307: // temporary redirect
/*
* this is a redirect. save it as a link and return.
* note that we could read the doc from the open connection here, but this could mean
* the filters were useless
*/
realURL = response.getEffectiveURI().toURL();
foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
linkStorage.storeLinks(foundUrls);
break;
default:
// this can be a 30x code that was resolved by the HTTPClient and is passed to us as 200
// we could turn this off and do it ourselves. But then we'd have to take care that
// we don't get into an endless redirection loop -> i.e. extend URLMessage by a counter
// at the moment we add the real URL to the message queue and mark it as a REDIRECT link
// that way it is added to the visited filter. Then we take care that we don't crawl it again
// the other possibility is that we receive a "Location:" header along with a 200 status code
// I have experienced that HTTPClient has an error with parsing this, so we do it ourselves
//String location = response.getHeader("Location");
realURL = response.getEffectiveURI().toURL();
/*if(location != null)
{
//System.out.println("interesting: location header with url " + location);
foundUrls.add(new URLMessage(new URL(location), contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostManager));
this.base = this.contextUrl = location;
}
else*/
if(!(realURL.equals(contextUrl)))
{
//System.out.println("interesting: redirect with url " + realURL + " -context: " + contextUrl);
foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
this.base = this.contextUrl = realURL;
//System.out.println(response);
}
if (isIncremental)
{
// experimental
System.out.println("ftask: -> was modified at " + response.getHeaderAsDate("Last-Modified"));
}
// read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
taskState.setState(FT_READING, ipURL); taskState.setState(FT_READING, ipURL);
contentType = response.getHeader("Content-Type"); contentType = response.getHeader("Content-Type");
@ -366,22 +504,17 @@ public class FetcherTask
contentLength = Integer.parseInt(length); contentLength = Integer.parseInt(length);
} }
log.log("reading"); log.log("reading");
realURL = response.getEffectiveURI().toURL();
fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB if (contentType != null && contentType.startsWith("text/html"))
base = contextUrl = response.getEffectiveURI().toURL(); {
// may have changed after a 30x result code fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE);
// to do: record the link between original and effective URL hi.releaseLock();
// like this the effectiveURL may be crawled twice // max. 2 MB
if (fullBuffer != null) if (fullBuffer != null)
{ {
contentLength = fullBuffer.length; contentLength = fullBuffer.length;
this.bytesRead += contentLength; this.bytesRead += contentLength;
} }
}
//conn.stop(); // close connection. todo: Do some caching...
/* /*
* conn.disconnect(); * conn.disconnect();
@ -399,10 +532,10 @@ public class FetcherTask
log.log("read file (" + fullBuffer.length + " bytes). Now scanning."); log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
if (contentType.startsWith("text/html")) // convert the bytes to Java characters
{
// ouch. I haven't found a better solution yet. just slower ones. // ouch. I haven't found a better solution yet. just slower ones.
// remember: for better runtime performance avoid decorators, since they
// multiply function calls
char[] fullCharBuffer = new char[contentLength]; char[] fullCharBuffer = new char[contentLength];
new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer); new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
Tokenizer tok = new Tokenizer(); Tokenizer tok = new Tokenizer();
@ -411,25 +544,55 @@ public class FetcherTask
taskState.setState(FT_STORING, ipURL); taskState.setState(FT_STORING, ipURL);
linkStorage.storeLinks(foundUrls); linkStorage.storeLinks(foundUrls);
WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager); WebDocument d;
if (isIncremental)
{
d = ((WebDocument) this.actURLMessage);
d.setModified(true);
// file is new or newer
d.setUrl(contextUrl);
d.setMimeType(contentType);
d.setResultCode(statusCode);
d.setSize(contentLength);
d.setTitle(title);
d.setLastModified(date);
}
else
{
d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostResolver);
}
d.addField("content", fullCharBuffer); d.addField("content", fullCharBuffer);
d.addField("contentBytes", fullBuffer);
docStorage.store(d); docStorage.store(d);
} }
log.log("scanned");
}
log.log("stored");
}
else else
{ {
// System.out.println("Discovered unknown content type: " + contentType + " at " + urlString); // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
//errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing"); //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
taskState.setState(FT_STORING, ipURL); taskState.setState(FT_STORING, ipURL);
linkStorage.storeLinks(foundUrls); linkStorage.storeLinks(foundUrls);
WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager); WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(),
d.addField("content", fullBuffer); /*
* contentLength
*/
0, title, date, hostResolver);
//d.addField("content", fullBuffer);
//d.addField("content", null);
docStorage.store(d); docStorage.store(d);
} }
log.log("scanned"); break;
} }
/*
* switch
*/
//conn.stop(); // close connection. todo: Do some caching...
log.log("stored");
}
} }
catch (InterruptedIOException e) catch (InterruptedIOException e)
{ {
@ -461,6 +624,7 @@ public class FetcherTask
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace(); // e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage()); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
} }
catch (SocketException e) catch (SocketException e)
{ {
@ -500,10 +664,10 @@ public class FetcherTask
e.printStackTrace(); e.printStackTrace();
System.out.println("[" + threadNr + "]: stopping"); System.out.println("[" + threadNr + "]: stopping");
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping"); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
} }
finally finally
{ {
hi.releaseLock();
if (isInterrupted) if (isInterrupted)
{ {
@ -521,7 +685,6 @@ public class FetcherTask
*/ */
taskState.setState(FT_CLOSING); taskState.setState(FT_CLOSING);
conn.stop(); conn.stop();
taskState.setState(FT_READY); taskState.setState(FT_READY);
foundUrls = null; foundUrls = null;
} }
@ -529,6 +692,7 @@ public class FetcherTask
/** /**
* the interrupt method. not in use since the change to HTTPClient * the interrupt method. not in use since the change to HTTPClient
*
* @TODO decide if we need this anymore * @TODO decide if we need this anymore
*/ */
public void interrupt() public void interrupt()
@ -563,11 +727,12 @@ public class FetcherTask
/** /**
* this is called whenever a link was found in the current document, * this is called whenever a link was found in the current document, Don't
* Don't create too many objects here, as this will be called * create too many objects here, as this will be called millions of times
* millions of times
* *
* @param link Description of the Parameter * @param link Description of the Parameter
* @param anchor Description of the Parameter
* @param isFrame Description of the Parameter
*/ */
public void handleLink(String link, String anchor, boolean isFrame) public void handleLink(String link, String anchor, boolean isFrame)
{ {
@ -599,8 +764,11 @@ public class FetcherTask
// relative url // relative url
url = new URL(base, link); url = new URL(base, link);
} }
if(url.getPath() == null || url.getPath().length() == 0)
URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor, hostManager); {
url = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/" + url.getFile());
}
URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hostResolver);
//String urlString = urlMessage.getURLString(); //String urlString = urlMessage.getURLString();
@ -670,6 +838,11 @@ public class FetcherTask
* /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)"); * /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
* } * }
*/ */
/**
* Gets the bytesRead attribute of the FetcherTask object
*
* @return The bytesRead value
*/
public long getBytesRead() public long getBytesRead()
{ {
return bytesRead; return bytesRead;