better handling of status codes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150837 13f79535-47bb-0310-9956-ffa450edef68
2002-10-22 15:02:43 +00:00 · 2002-10-22 15:02:43 +00:00 · b6243a3cbe
parent 5cf76aa76a
commit b6243a3cbe
1 changed files with 323 additions and 150 deletions
--- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
@ -1,4 +1,5 @@
-/* ====================================================================
+/*
 *  ====================================================================
 *  The Apache Software License, Version 1.1
 *
 *  Copyright (c) 2001 The Apache Software Foundation.  All rights
@ -51,7 +52,6 @@
 *  information on the Apache Software Foundation, please see
 *  <http://www.apache.org/>.
 */
 package de.lanlab.larm.fetcher;
 import java.net.URL;
@ -78,17 +78,20 @@ import de.lanlab.larm.net.*;
 * this class gets the documents from the web. It connects to the server given
 * by the IP address in the URLMessage, gets the document, and forwards it to
 * the storage. If it's an HTML document, it will be parsed and all links will
- * be put into the message handler again.
+ * be put into the message handler again. stores contents of the files in field
- *
+ * "contents"
 * stores contents of the files in field "contents"
 *
 * @author    Clemens Marschner
 * @created   28. Juni 2002
 * @version   $Id$
 */
 public class FetcherTask
         implements InterruptableTask, LinkHandler, Serializable
 {
    /**
     * Description of the Field
     */
    protected volatile boolean isInterrupted = false;
    /**
@ -109,8 +112,7 @@ public class FetcherTask
    private volatile URL base;
    /**
-     * the URL of the docuzment
+     * the URL of the document only valid within a doTask call
     * only valid within a doTask call
     */
    private volatile URL contextUrl;
@ -120,8 +122,7 @@ public class FetcherTask
    protected static volatile MessageHandler messageHandler;
    /**
-     * actual number of bytes read
+     * actual number of bytes read only valid within a doTask call
     * only valid within a doTask call
     */
    private volatile long bytesRead = 0;
@ -135,30 +136,61 @@ public class FetcherTask
     */
    private static volatile LinkStorage linkStorage;
    /**
     * task state IDs. comparisons will be done by their references, so always
     * use these constants instead of equal string literals.
     *
     * FT_IDLE is the state a freshly created task starts in (see taskState).
     */
    public final static String FT_IDLE = "idle";
    /**
     * state ID "started": set as the first step of run()
     */
    public final static String FT_STARTED = "started";
    /**
     * state ID "opening connection" (where it is set is not visible in this
     * excerpt)
     */
    public final static String FT_OPENCONNECTION = "opening connection";
    /**
     * state ID "connecting" (where it is set is not visible in this excerpt)
     */
    public final static String FT_CONNECTING = "connecting";
    /**
     * state ID "getting" (where it is set is not visible in this excerpt;
     * presumably while the request is issued - TODO confirm)
     */
    public final static String FT_GETTING = "getting";
    /**
     * state ID "reading": set in run() before the response is read
     */
    public final static String FT_READING = "reading";
    /**
     * state ID "scanning" (where it is set is not visible in this excerpt;
     * presumably while the document is tokenized for links - TODO confirm)
     */
    public final static String FT_SCANNING = "scanning";
    /**
     * state ID "storing": set in run() before links and documents are handed
     * to the link storage / document storage
     */
    public final static String FT_STORING = "storing";
    /**
     * state ID "ready": set in run() when the task has finished, including
     * the early returns for already crawled redirects and unhealthy hosts
     */
    public final static String FT_READY = "ready";
    /**
     * state ID "closing": set right before the HTTP connection is stopped
     */
    public final static String FT_CLOSING = "closing";
    /**
     * state ID "exception" (where it is set is not visible in this excerpt;
     * presumably in the exception handlers of run() - TODO confirm)
     */
    public final static String FT_EXCEPTION = "exception";
    /**
     * state ID "interrupted" (where it is set is not visible in this excerpt;
     * presumably when isInterrupted is detected - TODO confirm)
     */
    public final static String FT_INTERRUPTED = "interrupted";
    // current state of this task; initialized to FT_IDLE
    private volatile State taskState = new State(FT_IDLE);
    /**
-     * the URLs found will be stored and only added to the message handler in the very
+     * the URLs found will be stored and only added to the message handler in
-     * end, to avoid too many synchronizations
+     * the very end, to avoid too many synchronizations
     */
    private volatile LinkedList foundUrls;
@ -172,17 +204,6 @@ public class FetcherTask
     */
    private volatile String title;
    /**
     * headers for HTTPClient
     */
    private static volatile NVPair headers[] = new NVPair[1];
    static
    {
        headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
    }
    /**
     * Gets a copy of the current taskState
@ -227,6 +248,7 @@ public class FetcherTask
        FetcherTask.docStorage = docStorage;
    }
    /**
     * Sets the document linkStorage
     *
@ -268,27 +290,54 @@ public class FetcherTask
        return actURLMessage.getUrl();
    }
    volatile SimpleLogger log;
    volatile SimpleLogger errorLog;
    volatile HostManager hostManager;
    volatile HostResolver hostResolver;
    //private long startTime;
    /**
     * this will be called by the fetcher thread and will do all the work
     *
     * @TODO probably split this up into different processing steps
     * @param thread  the server thread running this task; supplies the log, the error log and (as a FetcherThread) the host manager
     * @TODO          probably split this up into different processing steps
     */
    public void run(ServerThread thread)
    {
-        taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
+
        taskState.setState(FT_STARTED);
        // state information is always set to make the thread monitor happy
        log = thread.getLog();
        hostManager = ((FetcherThread) thread).getHostManager();
        hostResolver = hostManager.getHostResolver();
        base = contextUrl = actURLMessage.getUrl();
        String urlString = actURLMessage.getURLString();
        String host = contextUrl.getHost().toLowerCase();
        HostInfo hi = hostManager.getHostInfo(host);
 //        System.out.println("FetcherTask with " + urlString + " started");
        if(actURLMessage.linkType == URLMessage.LINKTYPE_REDIRECT)
        {
            taskState.setState(FT_READY, null);
            hi.releaseLock();
            return;     // we've already crawled that (see below)
        }
        NVPair[] headers = ((FetcherThread) thread).getDefaultHeaders();
        int numHeaders = ((FetcherThread) thread).getUsedDefaultHeaders();
        boolean isIncremental = false;
        if (actURLMessage instanceof WebDocument)
        {
            // this is an incremental crawl where we only have to check whether the doc crawled
            // is newer
            isIncremental = true;
            headers[numHeaders] = new NVPair("If-Modified-Since", HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
        }
        //HostManager hm = ((FetcherThread)thread).getHostManager();
        errorLog = thread.getErrorLog();
@ -297,21 +346,19 @@ public class FetcherTask
        int threadNr = ((FetcherThread) thread).getThreadNumber();
        log.log("start");
        base = contextUrl = actURLMessage.getUrl();
        String urlString = actURLMessage.getURLString();
        String host = contextUrl.getHost().toLowerCase();
        int hostPos = urlString.indexOf(host);
        int hostLen = host.length();
-        HostInfo hi = hostManager.getHostInfo(host); // get and create
+        // get and create
        if (!hi.isHealthy())
        {
            // we make this check as late as possible to get the most current information
            log.log("Bad Host: " + contextUrl + "; returning");
-            System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
+//            System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
            taskState.setState(FT_READY, null);
            hi.releaseLock();
            return;
        }
@ -319,14 +366,13 @@ public class FetcherTask
        HTTPConnection conn = null;
-        title = "*untitled*";
+        title = "";
        int size = 1;
        InputStream in = null;
        bytesRead = 0;
        try
        {
@ -339,6 +385,7 @@ public class FetcherTask
            conn = new HTTPConnection(host);
            conn.setDefaultTimeout(75000);
            // 75 s
            conn.setDefaultAllowUserInteraction(false);
@ -353,8 +400,99 @@ public class FetcherTask
            int contentLength = 0;
            Date date = null;
-            if (statusCode != 404 && statusCode != 403)
+             if (isIncremental)
            {
                // experimental
                System.out.println("ftask: if modified since: " + HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
            }
            URL realURL;
            switch (statusCode)
            {
                case 404:                // file not found
                case 403:                    // access forbidden
                    // if this is an incremental crawl, remove the doc from the repository
                    if (isIncremental)
                    {
                        WebDocument d = (WebDocument) actURLMessage;
                        d.setResultCode(statusCode);
                        // the repository will remove the doc if this statuscode is matched
                        docStorage.store(d);
                    }
                    // otherwise, do nothing
                    // Todo: we could add an error marker to the referral link
                    break;
                case 304:
                    // not modified
                    System.out.println("ftask: -> not modified");
                    // "not modified since"
                    taskState.setState(FT_STORING, ipURL);
                    // let the repository take care of the links
                    // it will determine that this is the old document (because it already
                    // has a docId), and will put back the links associated with it
                    try
                    {
                        WebDocument doc = (WebDocument) this.actURLMessage;
                        doc.setModified(false);
                        docStorage.store(doc);
                        this.bytesRead += doc.getSize();
                    }
                    catch (ClassCastException e)
                    {
                        System.out.println("error while casting to WebDoc: " + actURLMessage.getInfo());
                    }
                    break;
                case 301:                // moved permanently
                case 302:                // moved temporarily
                case 303:                // see other
                case 307:                // temporary redirect
                    /*
                     *  this is a redirect. save it as a link and return.
                     *  note that we could read the doc from the open connection here, but this could mean
                     *  the filters were useless
                     */
                    realURL = response.getEffectiveURI().toURL();
                    foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
                    linkStorage.storeLinks(foundUrls);
                    break;
                default:
                    // this can be a 30x code that was resolved by the HTTPClient and is passed to us as 200
                    // we could turn this off and do it ourselves. But then we'd have to take care that
                    // we don't get into an endless redirection loop -> i.e. extend URLMessage by a counter
                    // at the moment we add the real URL to the message queue and mark it as a REDIRECT link
                    // that way it is added to the visited filter. Then we take care that we don't crawl it again
                    // the other possibility is that we receive a "Location:" header along with a 200 status code
                    // I have experienced that HTTPClient has an error with parsing this, so we do it ourselves
                    //String location = response.getHeader("Location");
                    realURL = response.getEffectiveURI().toURL();
                    /*if(location != null)
                    {
                        //System.out.println("interesting: location header with url " + location);
                        foundUrls.add(new URLMessage(new URL(location), contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostManager));
                        this.base = this.contextUrl = location;
                    }
                    else*/
                    if(!(realURL.equals(contextUrl)))
                    {
                        //System.out.println("interesting: redirect with url " + realURL + " -context: " + contextUrl);
                        foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
                        this.base = this.contextUrl = realURL;
                        //System.out.println(response);
                    }
                    if (isIncremental)
                    {
                        // experimental
                        System.out.println("ftask: -> was modified at " + response.getHeaderAsDate("Last-Modified"));
                    }
                    // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
                    taskState.setState(FT_READING, ipURL);
                    contentType = response.getHeader("Content-Type");
@ -366,22 +504,17 @@ public class FetcherTask
                        contentLength = Integer.parseInt(length);
                    }
                    log.log("reading");
-
+                    realURL = response.getEffectiveURI().toURL();
-                fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
+                    if (contentType != null && contentType.startsWith("text/html"))
-                base = contextUrl = response.getEffectiveURI().toURL();
+                    {
-                // may have changed after a 30x result code
+                        fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE);
-                // to do: record the link between original and effective URL
+                        hi.releaseLock();
-                // like this the effectiveURL may be crawled twice
+                        // max. 2 MB
                        if (fullBuffer != null)
                        {
                            contentLength = fullBuffer.length;
                            this.bytesRead += contentLength;
                        }
            }
            //conn.stop();    // close connection. todo: Do some caching...
                        /*
                         *  conn.disconnect();
@ -399,10 +532,10 @@ public class FetcherTask
                                log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
-                    if (contentType.startsWith("text/html"))
+                                // convert the bytes to Java characters
                    {
                                // ouch. I haven't found a better solution yet. just slower ones.
                                // remember: for better runtime performance avoid decorators, since they
                                // multiply function calls
                                char[] fullCharBuffer = new char[contentLength];
                                new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
                                Tokenizer tok = new Tokenizer();
@ -411,25 +544,55 @@ public class FetcherTask
                                taskState.setState(FT_STORING, ipURL);
                                linkStorage.storeLinks(foundUrls);
-                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
+                                WebDocument d;
                                if (isIncremental)
                                {
                                    d = ((WebDocument) this.actURLMessage);
                                    d.setModified(true);
                                    // file is new or newer
                                    d.setUrl(contextUrl);
                                    d.setMimeType(contentType);
                                    d.setResultCode(statusCode);
                                    d.setSize(contentLength);
                                    d.setTitle(title);
                                    d.setLastModified(date);
                                }
                                else
                                {
                                    d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostResolver);
                                }
                                d.addField("content", fullCharBuffer);
                                d.addField("contentBytes", fullBuffer);
                                docStorage.store(d);
                            }
                            log.log("scanned");
                        }
                        log.log("stored");
                    }
                    else
                    {
                        // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
                        //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
                        taskState.setState(FT_STORING, ipURL);
                        linkStorage.storeLinks(foundUrls);
-                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
+                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(),
-                        d.addField("content", fullBuffer);
+                        /*
                         *  contentLength
                         */
                                0, title, date, hostResolver);
                        //d.addField("content", fullBuffer);
                        //d.addField("content", null);
                        docStorage.store(d);
                    }
-                    log.log("scanned");
+                    break;
            }
            /*
             *  switch
             */
            //conn.stop();    // close connection. todo: Do some caching...
                log.log("stored");
            }
        }
        catch (InterruptedIOException e)
        {
@ -461,6 +624,7 @@ public class FetcherTask
            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
            // e.printStackTrace();
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
        }
        catch (SocketException e)
        {
@ -500,10 +664,10 @@ public class FetcherTask
            e.printStackTrace();
            System.out.println("[" + threadNr + "]: stopping");
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
        }
        finally
        {
            hi.releaseLock();
            if (isInterrupted)
            {
@ -521,7 +685,6 @@ public class FetcherTask
         */
        taskState.setState(FT_CLOSING);
        conn.stop();
        taskState.setState(FT_READY);
        foundUrls = null;
    }
@ -529,6 +692,7 @@ public class FetcherTask
    /**
     * the interrupt method. not in use since the change to HTTPClient
     *
     * @TODO   decide if we need this anymore
     */
    public void interrupt()
@ -563,11 +727,12 @@ public class FetcherTask
    /**
-     * this is called whenever a link was found in the current document,
+     * this is called whenever a link was found in the current document, Don't
-     * Don't create too many objects here, as this will be called
+     * create too many objects here, as this will be called millions of times
     * millions of times
     *
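     * @param link     the link found in the document; resolved against the base URL when it is relative
     * @param anchor   anchor value passed on to the URLMessage created for the link (presumably the anchor text)
     * @param isFrame  true if the link is stored as LINKTYPE_FRAME, false if it is stored as LINKTYPE_ANCHOR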
     */
    public void handleLink(String link, String anchor, boolean isFrame)
    {
@ -599,8 +764,11 @@ public class FetcherTask
                // relative url
                url = new URL(base, link);
            }
-
+            if(url.getPath() == null || url.getPath().length() == 0)
-            URLMessage urlMessage =  new URLMessage(url, contextUrl, isFrame, anchor, hostManager);
+            {
                url = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/" + url.getFile());
            }
            URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hostResolver);
            //String urlString = urlMessage.getURLString();
@ -670,6 +838,11 @@ public class FetcherTask
     *  /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
     *  }
     */
    /**
     * Gets the bytesRead attribute of the FetcherTask object
     *
     * @return   The bytesRead value
     */
    public long getBytesRead()
    {
        return bytesRead;