fixed bug: the document is now saved under the new (effective) URL if a 301/302 redirect occurred

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150780 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
cmarschner 2002-06-17 13:58:33 +00:00
parent 574ec2994b
commit 14fdfb458f
1 changed files with 23 additions and 9 deletions

View File

@ -65,7 +65,6 @@ import de.lanlab.larm.storage.LinkStorage;
import de.lanlab.larm.util.State; import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLogger; import de.lanlab.larm.util.SimpleLogger;
import de.lanlab.larm.net.HttpTimeoutFactory;
import HTTPClient.*; import HTTPClient.*;
import java.net.*; import java.net.*;
import java.io.*; import java.io.*;
@ -73,6 +72,7 @@ import java.util.*;
import java.text.*; import java.text.*;
import de.lanlab.larm.parser.Tokenizer; import de.lanlab.larm.parser.Tokenizer;
import de.lanlab.larm.parser.LinkHandler; import de.lanlab.larm.parser.LinkHandler;
import de.lanlab.larm.net.*;
/** /**
* this class gets the documents from the web. It connects to the server given * this class gets the documents from the web. It connects to the server given
@ -266,8 +266,11 @@ public class FetcherTask
return actURLMessage.getUrl(); return actURLMessage.getUrl();
} }
SimpleLogger log; volatile SimpleLogger log;
SimpleLogger errorLog;
volatile SimpleLogger errorLog;
volatile HostManager hostManager;
//private long startTime; //private long startTime;
/** /**
@ -282,7 +285,9 @@ public class FetcherTask
taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
log = thread.getLog(); log = thread.getLog();
HostManager hm = ((FetcherThread)thread).getHostManager(); hostManager = ((FetcherThread)thread).getHostManager();
//HostManager hm = ((FetcherThread)thread).getHostManager();
errorLog = thread.getErrorLog(); errorLog = thread.getErrorLog();
@ -292,11 +297,11 @@ public class FetcherTask
log.log("start"); log.log("start");
base = contextUrl = actURLMessage.getUrl(); base = contextUrl = actURLMessage.getUrl();
String urlString = actURLMessage.getURLString(); String urlString = actURLMessage.getURLString();
String host = contextUrl.getHost(); String host = contextUrl.getHost().toLowerCase();
int hostPos = urlString.indexOf(host); int hostPos = urlString.indexOf(host);
int hostLen = host.length(); int hostLen = host.length();
HostInfo hi = hm.getHostInfo(host); // get and create HostInfo hi = hostManager.getHostInfo(host); // get and create
if(!hi.isHealthy()) if(!hi.isHealthy())
{ {
@ -344,6 +349,7 @@ public class FetcherTask
byte[] fullBuffer = null; byte[] fullBuffer = null;
String contentType = ""; String contentType = "";
int contentLength = 0; int contentLength = 0;
Date date = null;
if (statusCode != 404 && statusCode != 403) if (statusCode != 404 && statusCode != 403)
{ {
@ -351,6 +357,8 @@ public class FetcherTask
taskState.setState(FT_READING, ipURL); taskState.setState(FT_READING, ipURL);
contentType = response.getHeader("Content-Type"); contentType = response.getHeader("Content-Type");
String length = response.getHeader("Content-Length"); String length = response.getHeader("Content-Length");
date = response.getHeaderAsDate("Last-Modified");
if (length != null) if (length != null)
{ {
contentLength = Integer.parseInt(length); contentLength = Integer.parseInt(length);
@ -358,6 +366,12 @@ public class FetcherTask
log.log("reading"); log.log("reading");
fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
base = contextUrl = response.getEffectiveURI().toURL();
// may have changed after a 30x result code
// TODO: record the link between the original and the effective URL;
// as it stands, the effective URL may be crawled twice
if (fullBuffer != null) if (fullBuffer != null)
{ {
contentLength = fullBuffer.length; contentLength = fullBuffer.length;
@ -403,7 +417,7 @@ public class FetcherTask
taskState.setState(FT_STORING, ipURL); taskState.setState(FT_STORING, ipURL);
linkStorage.storeLinks(foundUrls); linkStorage.storeLinks(foundUrls);
//messageHandler.putMessages(foundUrls); //messageHandler.putMessages(foundUrls);
docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title)); docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager));
log.log("stored"); log.log("stored");
} }
} }
@ -576,9 +590,9 @@ public class FetcherTask
url = new URL(base, link); url = new URL(base, link);
} }
URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor); URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame, anchor, hostManager);
String urlString = urlMessage.getURLString(); //String urlString = urlMessage.getURLString();
foundUrls.add(urlMessage); foundUrls.add(urlMessage);
//messageHandler.putMessage(new actURLMessage(url)); // put them in the very end //messageHandler.putMessage(new actURLMessage(url)); // put them in the very end