better handling of status codes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150837 13f79535-47bb-0310-9956-ffa450edef68
2002-10-22 15:02:43 +00:00 · 2002-10-22 15:02:43 +00:00 · b6243a3cbe
parent 5cf76aa76a
commit b6243a3cbe
1 changed files with 323 additions and 150 deletions
--- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
@ -1,57 +1,57 @@
-/* ====================================================================
- * The Apache Software License, Version 1.1
+/*
+ *  ====================================================================
+ *  The Apache Software License, Version 1.1
 *
- * Copyright (c) 2001 The Apache Software Foundation.  All rights
- * reserved.
+ *  Copyright (c) 2001 The Apache Software Foundation.  All rights
+ *  reserved.
 *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
 *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
+ *  1. Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
 *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in
+ *  the documentation and/or other materials provided with the
+ *  distribution.
 *
- * 3. The end-user documentation included with the redistribution,
- *    if any, must include the following acknowledgment:
- *       "This product includes software developed by the
- *        Apache Software Foundation (http://www.apache.org/)."
- *    Alternately, this acknowledgment may appear in the software itself,
- *    if and wherever such third-party acknowledgments normally appear.
+ *  3. The end-user documentation included with the redistribution,
+ *  if any, must include the following acknowledgment:
+ *  "This product includes software developed by the
+ *  Apache Software Foundation (http://www.apache.org/)."
+ *  Alternately, this acknowledgment may appear in the software itself,
+ *  if and wherever such third-party acknowledgments normally appear.
 *
- * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Lucene" must not be used to endorse or promote products
- *    derived from this software without prior written permission. For
- *    written permission, please contact apache@apache.org.
+ *  4. The names "Apache" and "Apache Software Foundation" and
+ *  "Apache Lucene" must not be used to endorse or promote products
+ *  derived from this software without prior written permission. For
+ *  written permission, please contact apache@apache.org.
 *
- * 5. Products derived from this software may not be called "Apache",
- *    "Apache Lucene", nor may "Apache" appear in their name, without
- *    prior written permission of the Apache Software Foundation.
+ *  5. Products derived from this software may not be called "Apache",
+ *  "Apache Lucene", nor may "Apache" appear in their name, without
+ *  prior written permission of the Apache Software Foundation.
 *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ *  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ *  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ *  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ *  SUCH DAMAGE.
+ *  ====================================================================
 *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation.  For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
+ *  This software consists of voluntary contributions made by many
+ *  individuals on behalf of the Apache Software Foundation.  For more
+ *  information on the Apache Software Foundation, please see
+ *  <http://www.apache.org/>.
 */
-
 package de.lanlab.larm.fetcher;

 import java.net.URL;
@ -78,17 +78,20 @@ import de.lanlab.larm.net.*;
 * this class gets the documents from the web. It connects to the server given
 * by the IP address in the URLMessage, gets the document, and forwards it to
 * the storage. If it's an HTML document, it will be parsed and all links will
- * be put into the message handler again.
- *
- * stores contents of the files in field "contents"
+ * be put into the message handler again. stores contents of the files in field
+ * "contents"
 *
 * @author    Clemens Marschner
- * @version $Id$
+ * @created   28. Juni 2002
+ * @version   $Id$
 */
 public class FetcherTask
         implements InterruptableTask, LinkHandler, Serializable
 {

+    /**
+     * Description of the Field
+     */
    protected volatile boolean isInterrupted = false;

    /**
@ -109,8 +112,7 @@ public class FetcherTask
    private volatile URL base;

    /**
-     * the URL of the docuzment
-     * only valid within a doTask call
+     * the URL of the document; only valid within a doTask call
     */
    private volatile URL contextUrl;

@ -120,8 +122,7 @@ public class FetcherTask
    protected static volatile MessageHandler messageHandler;

    /**
-     * actual number of bytes read
-     * only valid within a doTask call
+     * actual number of bytes read; only valid within a doTask call
     */
    private volatile long bytesRead = 0;

@ -135,30 +136,61 @@ public class FetcherTask
     */
    private static volatile LinkStorage linkStorage;

-
-
    /**
     * task state IDs. comparisons will be done by their references, so always
     * use the IDs
     */
    public final static String FT_IDLE = "idle";
+    /**
+     * Description of the Field
+     */
    public final static String FT_STARTED = "started";
+    /**
+     * Description of the Field
+     */
    public final static String FT_OPENCONNECTION = "opening connection";
+    /**
+     * Description of the Field
+     */
    public final static String FT_CONNECTING = "connecting";
+    /**
+     * Description of the Field
+     */
    public final static String FT_GETTING = "getting";
+    /**
+     * Description of the Field
+     */
    public final static String FT_READING = "reading";
+    /**
+     * Description of the Field
+     */
    public final static String FT_SCANNING = "scanning";
+    /**
+     * Description of the Field
+     */
    public final static String FT_STORING = "storing";
+    /**
+     * Description of the Field
+     */
    public final static String FT_READY = "ready";
+    /**
+     * Description of the Field
+     */
    public final static String FT_CLOSING = "closing";
+    /**
+     * Description of the Field
+     */
    public final static String FT_EXCEPTION = "exception";
+    /**
+     * Description of the Field
+     */
    public final static String FT_INTERRUPTED = "interrupted";

    private volatile State taskState = new State(FT_IDLE);

    /**
-     * the URLs found will be stored and only added to the message handler in the very
-     * end, to avoid too many synchronizations
+     * the URLs found will be stored and only added to the message handler in
+     * the very end, to avoid too many synchronizations
     */
    private volatile LinkedList foundUrls;

@ -172,17 +204,6 @@ public class FetcherTask
     */
    private volatile String title;

-    /**
-     * headers for HTTPClient
-     */
-    private static volatile NVPair headers[] = new NVPair[1];
-
-    static
-    {
-        headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
-
-    }
-

    /**
     * Gets a copy of the current taskState
@ -198,7 +219,7 @@ public class FetcherTask
    /**
     * Constructor for the FetcherTask object
     *
-     * @param urlMessage   Description of the Parameter
+     * @param urlMessage  Description of the Parameter
     */
    public FetcherTask(URLMessage urlMessage)
    {
@ -227,6 +248,7 @@ public class FetcherTask
        FetcherTask.docStorage = docStorage;
    }

+
    /**
     * Sets the document linkStorage
     *
@ -268,27 +290,54 @@ public class FetcherTask
        return actURLMessage.getUrl();
    }

+
    volatile SimpleLogger log;

    volatile SimpleLogger errorLog;

    volatile HostManager hostManager;
+    volatile HostResolver hostResolver;
+
    //private long startTime;

    /**
     * this will be called by the fetcher thread and will do all the work
     *
-     * @TODO probably split this up into different processing steps
     * @param thread  Description of the Parameter
+     * @TODO          probably split this up into different processing steps
     */
    public void run(ServerThread thread)
    {

-        taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
+
+        taskState.setState(FT_STARTED);
+        // state information is always set to make the thread monitor happy

        log = thread.getLog();
-        hostManager = ((FetcherThread)thread).getHostManager();
+        hostManager = ((FetcherThread) thread).getHostManager();
+        hostResolver = hostManager.getHostResolver();
+        base = contextUrl = actURLMessage.getUrl();
+        String urlString = actURLMessage.getURLString();
+        String host = contextUrl.getHost().toLowerCase();
+        HostInfo hi = hostManager.getHostInfo(host);
+//        System.out.println("FetcherTask with " + urlString + " started");
+        if(actURLMessage.linkType == URLMessage.LINKTYPE_REDIRECT)
+        {
+            taskState.setState(FT_READY, null);
+            hi.releaseLock();
+            return;     // we've already crawled that (see below)
+        }

+        NVPair[] headers = ((FetcherThread) thread).getDefaultHeaders();
+        int numHeaders = ((FetcherThread) thread).getUsedDefaultHeaders();
+        boolean isIncremental = false;
+        if (actURLMessage instanceof WebDocument)
+        {
+            // this is an incremental crawl where we only have to check whether the doc crawled
+            // is newer
+            isIncremental = true;
+            headers[numHeaders] = new NVPair("If-Modified-Since", HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
+        }
        //HostManager hm = ((FetcherThread)thread).getHostManager();

        errorLog = thread.getErrorLog();
@ -297,21 +346,19 @@ public class FetcherTask
        int threadNr = ((FetcherThread) thread).getThreadNumber();

        log.log("start");
-        base = contextUrl = actURLMessage.getUrl();
-        String urlString = actURLMessage.getURLString();
-        String host = contextUrl.getHost().toLowerCase();
        int hostPos = urlString.indexOf(host);
        int hostLen = host.length();

-        HostInfo hi = hostManager.getHostInfo(host); // get and create
+        // get and create

-        if(!hi.isHealthy())
+        if (!hi.isHealthy())
        {
            // we make this check as late as possible to get the most current information
            log.log("Bad Host: " + contextUrl + "; returning");
-            System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
+//            System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());

            taskState.setState(FT_READY, null);
+            hi.releaseLock();
            return;
        }

@ -319,14 +366,13 @@ public class FetcherTask

        HTTPConnection conn = null;

-        title = "*untitled*";
+        title = "";

        int size = 1;

        InputStream in = null;
        bytesRead = 0;

-
        try
        {

@ -339,6 +385,7 @@ public class FetcherTask
            conn = new HTTPConnection(host);

            conn.setDefaultTimeout(75000);
+
            // 75 s
            conn.setDefaultAllowUserInteraction(false);

@ -353,83 +400,199 @@ public class FetcherTask
            int contentLength = 0;
            Date date = null;

-            if (statusCode != 404 && statusCode != 403)
+             if (isIncremental)
            {
-                // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
-                taskState.setState(FT_READING, ipURL);
-                contentType = response.getHeader("Content-Type");
-                String length = response.getHeader("Content-Length");
-                date = response.getHeaderAsDate("Last-Modified");
-
-                if (length != null)
-                {
-                    contentLength = Integer.parseInt(length);
-                }
-                log.log("reading");
-
-                fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
-                base = contextUrl = response.getEffectiveURI().toURL();
-                // may have changed after a 30x result code
-                // to do: record the link between original and effective URL
-                // like this the effectiveURL may be crawled twice
-
-
-                if (fullBuffer != null)
-                {
-                    contentLength = fullBuffer.length;
-                    this.bytesRead += contentLength;
-                }
+                // experimental
+                System.out.println("ftask: if modified since: " + HTTPClient.Util.httpDate(((WebDocument) actURLMessage).getLastModified()));
            }
-            //conn.stop();    // close connection. todo: Do some caching...

+            URL realURL;

-            /*
-             *  conn.disconnect();
-             */
-            if (isInterrupted)
+            switch (statusCode)
            {
-                System.out.println("FetcherTask: interrupted while reading. File truncated");
-                log.log("interrupted while reading. File truncated");
-            }
-            else
-            {
-                if (fullBuffer != null)
-                {
-                    taskState.setState(FT_SCANNING, ipURL);
+                case 404:                // file not found
+                case 403:                    // access forbidden

-                    log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
-
-                    if (contentType.startsWith("text/html"))
+                    // if this is an incremental crawl, remove the doc from the repository
+                    if (isIncremental)
                    {
-
-                        // ouch. I haven't found a better solution yet. just slower ones.
-                        char[] fullCharBuffer = new char[contentLength];
-                        new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
-                        Tokenizer tok = new Tokenizer();
-                        tok.setLinkHandler(this);
-                        tok.parse(new SimpleCharArrayReader(fullCharBuffer));
-
-                        taskState.setState(FT_STORING, ipURL);
-                        linkStorage.storeLinks(foundUrls);
-                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
-                        d.addField("content", fullCharBuffer);
+                        WebDocument d = (WebDocument) actURLMessage;
+                        d.setResultCode(statusCode);
+                        // the repository will remove the doc if this statuscode is matched
                        docStorage.store(d);
                    }
+                    // otherwise, do nothing
+                    // Todo: we could add an error marker to the referral link
+                    break;
+                case 304:
+                    // not modified
+                    System.out.println("ftask: -> not modified");
+                    // "not modified since"
+                    taskState.setState(FT_STORING, ipURL);
+                    // let the repository take care of the links
+                    // it will determine that this is the old document (because it already
+                    // has a docId), and will put back the links associated with it
+                    try
+                    {
+                        WebDocument doc = (WebDocument) this.actURLMessage;
+                        doc.setModified(false);
+                        docStorage.store(doc);
+                        this.bytesRead += doc.getSize();
+                    }
+                    catch (ClassCastException e)
+                    {
+                        System.out.println("error while casting to WebDoc: " + actURLMessage.getInfo());
+                    }
+                    break;
+                case 301:                // moved permanently
+                case 302:                // moved temporarily
+                case 303:                // see other
+                case 307:                // temporary redirect
+                    /*
+                     *  this is a redirect. save it as a link and return.
+                     *  note that we could read the doc from the open connection here, but this could mean
+                     *  the filters were useless
+                     */
+                    realURL = response.getEffectiveURI().toURL();
+                    foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
+                    linkStorage.storeLinks(foundUrls);
+                    break;
+                default:
+                    // this can be a 30x code that was resolved by the HTTPClient and is passed to us as 200
+                    // we could turn this off and do it ourselves. But then we'd have to take care that
+                    // we don't get into an endless redirection loop -> i.e. extend URLMessage by a counter
+                    // at the moment we add the real URL to the message queue and mark it as a REDIRECT link
+                    // that way it is added to the visited filter. Then we take care that we don't crawl it again
+
+                    // the other possibility is that we receive a "Location:" header along with a 200 status code
+                    // I have experienced that HTTPClient has an error with parsing this, so we do it ourselves
+                    //String location = response.getHeader("Location");
+                    realURL = response.getEffectiveURI().toURL();
+
+                    /*if(location != null)
+                    {
+                        //System.out.println("interesting: location header with url " + location);
+                        foundUrls.add(new URLMessage(new URL(location), contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostManager));
+                        this.base = this.contextUrl = location;
+                    }
+                    else*/
+                    if(!(realURL.equals(contextUrl)))
+                    {
+                        //System.out.println("interesting: redirect with url " + realURL + " -context: " + contextUrl);
+                        foundUrls.add(new URLMessage(realURL, contextUrl, URLMessage.LINKTYPE_REDIRECT, "", hostResolver));
+                        this.base = this.contextUrl = realURL;
+                        //System.out.println(response);
+
+                    }
+
+
+
+
+                    if (isIncremental)
+                    {
+                        // experimental
+                        System.out.println("ftask: -> was modified at " + response.getHeaderAsDate("Last-Modified"));
+                    }
+                    // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
+                    taskState.setState(FT_READING, ipURL);
+                    contentType = response.getHeader("Content-Type");
+                    String length = response.getHeader("Content-Length");
+                    date = response.getHeaderAsDate("Last-Modified");
+
+                    if (length != null)
+                    {
+                        contentLength = Integer.parseInt(length);
+                    }
+                    log.log("reading");
+                    realURL = response.getEffectiveURI().toURL();
+                    if (contentType != null && contentType.startsWith("text/html"))
+                    {
+                        fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE);
+                        hi.releaseLock();
+                        // max. 2 MB
+                        if (fullBuffer != null)
+                        {
+                            contentLength = fullBuffer.length;
+                            this.bytesRead += contentLength;
+                        }
+
+                        /*
+                         *  conn.disconnect();
+                         */
+                        if (isInterrupted)
+                        {
+                            System.out.println("FetcherTask: interrupted while reading. File truncated");
+                            log.log("interrupted while reading. File truncated");
+                        }
+                        else
+                        {
+                            if (fullBuffer != null)
+                            {
+                                taskState.setState(FT_SCANNING, ipURL);
+
+                                log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
+
+                                // convert the bytes to Java characters
+                                // ouch. I haven't found a better solution yet. just slower ones.
+                                // remember: for better runtime performance avoid decorators, since they
+                                // multiply function calls
+                                char[] fullCharBuffer = new char[contentLength];
+                                new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
+                                Tokenizer tok = new Tokenizer();
+                                tok.setLinkHandler(this);
+                                tok.parse(new SimpleCharArrayReader(fullCharBuffer));
+
+                                taskState.setState(FT_STORING, ipURL);
+                                linkStorage.storeLinks(foundUrls);
+                                WebDocument d;
+                                if (isIncremental)
+                                {
+                                    d = ((WebDocument) this.actURLMessage);
+                                    d.setModified(true);
+                                    // file is new or newer
+                                    d.setUrl(contextUrl);
+                                    d.setMimeType(contentType);
+                                    d.setResultCode(statusCode);
+                                    d.setSize(contentLength);
+                                    d.setTitle(title);
+                                    d.setLastModified(date);
+                                }
+                                else
+                                {
+                                    d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostResolver);
+                                }
+                                d.addField("content", fullCharBuffer);
+                                d.addField("contentBytes", fullBuffer);
+                                docStorage.store(d);
+                            }
+
+                            log.log("scanned");
+                        }
+
+                        log.log("stored");
+                    }
                    else
                    {
                        // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
                        //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
                        taskState.setState(FT_STORING, ipURL);
                        linkStorage.storeLinks(foundUrls);
-                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
-                        d.addField("content", fullBuffer);
+                        WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(),
+                        /*
+                         *  contentLength
+                         */
+                                0, title, date, hostResolver);
+                        //d.addField("content", fullBuffer);
+                        //d.addField("content", null);
                        docStorage.store(d);
                    }
-                    log.log("scanned");
-                }
-
-                log.log("stored");
+                    break;
            }
+            /*
+             *  switch
+             */
+            //conn.stop();    // close connection. todo: Do some caching...
+
        }
        catch (InterruptedIOException e)
        {
@ -444,7 +607,7 @@ public class FetcherTask
            //System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
            errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
        }
-        catch(NoRouteToHostException e)
+        catch (NoRouteToHostException e)
        {
            // router is down or firewall prevents to connect
            hi.setReachable(false);
@ -453,7 +616,7 @@ public class FetcherTask
            // e.printStackTrace();
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
        }
-        catch(ConnectException e)
+        catch (ConnectException e)
        {
            // no server is listening at this port
            hi.setReachable(false);
@ -461,6 +624,7 @@ public class FetcherTask
            //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
            // e.printStackTrace();
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+
        }
        catch (SocketException e)
        {
@ -469,7 +633,7 @@ public class FetcherTask
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());

        }
-        catch(UnknownHostException e)
+        catch (UnknownHostException e)
        {
            // IP Address not to be determined
            hi.setReachable(false);
@ -500,10 +664,10 @@ public class FetcherTask
            e.printStackTrace();
            System.out.println("[" + threadNr + "]: stopping");
            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
-
        }
        finally
        {
+            hi.releaseLock();

            if (isInterrupted)
            {
@ -521,7 +685,6 @@ public class FetcherTask
         */
        taskState.setState(FT_CLOSING);
        conn.stop();
-
        taskState.setState(FT_READY);
        foundUrls = null;
    }
@ -529,7 +692,8 @@ public class FetcherTask

    /**
     * the interrupt method. not in use since the change to HTTPClient
-     * @TODO decide if we need this anymore
+     *
+     * @TODO   decide if we need this anymore
     */
    public void interrupt()
    {
@ -563,11 +727,12 @@ public class FetcherTask


    /**
-     * this is called whenever a link was found in the current document,
-     * Don't create too many objects here, as this will be called
-     * millions of times
+     * this is called whenever a link was found in the current document, Don't
+     * create too many objects here, as this will be called millions of times
     *
-     * @param link  Description of the Parameter
+     * @param link     Description of the Parameter
+     * @param anchor   Description of the Parameter
+     * @param isFrame  Description of the Parameter
     */
    public void handleLink(String link, String anchor, boolean isFrame)
    {
@ -599,8 +764,11 @@ public class FetcherTask
                // relative url
                url = new URL(base, link);
            }
-
-            URLMessage urlMessage =  new URLMessage(url, contextUrl, isFrame, anchor, hostManager);
+            if(url.getPath() == null || url.getPath().length() == 0)
+            {
+                url = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/" + url.getFile());
+            }
+            URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, anchor, hostResolver);

            //String urlString = urlMessage.getURLString();

@ -670,6 +838,11 @@ public class FetcherTask
     *  /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
     *  }
     */
+    /**
+     * Gets the bytesRead attribute of the FetcherTask object
+     *
+     * @return   The bytesRead value
+     */
    public long getBytesRead()
    {
        return bytesRead;