added URLNormalizer. Changed filters to use normalized URLs if possible

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
cmarschner 2002-06-17 13:59:28 +00:00
parent 14fdfb458f
commit 5b90c10cb5
11 changed files with 270 additions and 431 deletions

View File

@ -65,6 +65,7 @@ import java.net.URL;
import java.util.LinkedList; import java.util.LinkedList;
import de.lanlab.larm.fetcher.FetcherTask; import de.lanlab.larm.fetcher.FetcherTask;
import de.lanlab.larm.net.*;
/** /**
* filter class; the Fetcher is the main class which keeps the ThreadPool that * filter class; the Fetcher is the main class which keeps the ThreadPool that

View File

@ -62,6 +62,7 @@ import java.util.*;
import de.lanlab.larm.gui.*; import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*; import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*; import de.lanlab.larm.storage.*;
import de.lanlab.larm.net.*;
import javax.swing.UIManager; import javax.swing.UIManager;
import HTTPClient.*; import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException; import org.apache.oro.text.regex.MalformedPatternException;
@ -278,7 +279,7 @@ public class FetcherMain
{ {
try try
{ {
messageHandler.putMessage(new URLMessage(url, null, isFrame, null)); messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
} }
catch (Exception e) catch (Exception e)
{ {

View File

@ -186,16 +186,17 @@ public class FetcherTaskQueue extends TaskQueue
public static void main(String args[]) public static void main(String args[])
{ {
FetcherTaskQueue q = new FetcherTaskQueue(); FetcherTaskQueue q = new FetcherTaskQueue();
de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10);
System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo"); System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
try try
{ {
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
} }
catch (Throwable t) catch (Throwable t)
{ {
@ -217,9 +218,9 @@ public class FetcherTaskQueue extends TaskQueue
try try
{ {
System.out.println("put 3 lmus."); System.out.println("put 3 lmus.");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
System.out.print("pull out 1st element [lmu/1]: "); System.out.print("pull out 1st element [lmu/1]: ");
System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [2]: " + q.size()); System.out.println("size now [2]: " + q.size());
@ -227,9 +228,9 @@ public class FetcherTaskQueue extends TaskQueue
System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [1]: " + q.size()); System.out.println("size now [1]: " + q.size());
System.out.println("put in 3 yahoos"); System.out.println("put in 3 yahoos");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [3]: " + q.size()); System.out.println("Size now [3]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
@ -237,7 +238,7 @@ public class FetcherTaskQueue extends TaskQueue
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size()); System.out.println("Size now [1]: " + q.size());
System.out.println("put in another Yahoo"); System.out.println("put in another Yahoo");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size()); System.out.println("Size now [1]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());

View File

@ -56,6 +56,7 @@ package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.ServerThread; import de.lanlab.larm.threads.ServerThread;
import de.lanlab.larm.util.State; import de.lanlab.larm.util.State;
import de.lanlab.larm.net.HostManager;
/** /**
* a server thread for the thread pool that records the number * a server thread for the thread pool that records the number

View File

@ -1,64 +1,69 @@
/* ==================================================================== /*
* The Apache Software License, Version 1.1 * ====================================================================
* The Apache Software License, Version 1.1
* *
* Copyright (c) 2001 The Apache Software Foundation. All rights * Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved. * reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* *
* 1. Redistributions of source code must retain the above copyright * 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* *
* 2. Redistributions in binary form must reproduce the above copyright * 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in * notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the * the documentation and/or other materials provided with the
* distribution. * distribution.
* *
* 3. The end-user documentation included with the redistribution, * 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment: * if any, must include the following acknowledgment:
* "This product includes software developed by the * "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)." * Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself, * Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE. * SUCH DAMAGE.
* ==================================================================== * ====================================================================
* *
* This software consists of voluntary contributions made by many * This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more * individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see * information on the Apache Software Foundation, please see
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
package de.lanlab.larm.fetcher; package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.*; import de.lanlab.larm.threads.*;
import de.lanlab.larm.net.*;
/** /**
* this factory simply creates fetcher threads. It's passed * this factory simply creates fetcher threads. It's passed to the ThreadPool
* to the ThreadPool because the pool is creating the threads on its own * because the pool is creating the threads on its own
* @version $Id$ *
* @author Administrator
* @created 14. Juni 2002
* @version $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17
* cmarschner Exp $
*/ */
public class FetcherThreadFactory extends ThreadFactory public class FetcherThreadFactory extends ThreadFactory
{ {
@ -69,13 +74,25 @@ public class FetcherThreadFactory extends ThreadFactory
HostManager hostManager; HostManager hostManager;
/**
* Constructor for the FetcherThreadFactory object
*
* @param hostManager Description of the Parameter
*/
public FetcherThreadFactory(HostManager hostManager) public FetcherThreadFactory(HostManager hostManager)
{ {
this.hostManager = hostManager; this.hostManager = hostManager;
} }
public ServerThread createServerThread(int count) /**
* Description of the Method
*
* @param count Description of the Parameter
* @return Description of the Return Value
*/
public ServerThread createServerThread(int count)
{ {
ServerThread newThread = new FetcherThread(count, threadGroup, hostManager); ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
newThread.setPriority(4); newThread.setPriority(4);

View File

@ -1,168 +0,0 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package de.lanlab.larm.fetcher;
import java.util.HashMap;
import java.net.*;
import de.lanlab.larm.util.CachingQueue;
import de.lanlab.larm.util.Queue;
/**
 * Contains information about a host. If a host doesn't respond too often, it's
 * excluded from the crawl.
 * This class is used by the HostManager.
 *
 * Thread-safety: only {@link #setRobotsChecked} and {@link #isAllowed} are
 * synchronized (on this instance). All other state must be synchronized
 * externally by the caller.
 *
 * @author    Clemens Marschner
 * @created   16. Februar 2002
 * @version   $Id$
 */
public class HostInfo
{
    /** shared empty list, used whenever no disallowed paths are known */
    static final String[] emptyKeepOutDirectories = new String[0];

    /** numeric id handed in by the creator of this object */
    int id;

    /** remaining "strikes"; decremented by {@link #badRequest()} */
    int healthyCount = 5;  // five strikes, and you're out

    boolean isReachable = true;

    boolean robotTxtChecked = false;

    /** path prefixes that must not be crawled (robot exclusion); never null */
    String[] disallows;

    boolean isLoadingRobotsTxt = false;

    /** created by setLoadingRobotsTxt(true); null until then */
    Queue queuedRequests = null;

    String hostName;


    /**
     * Creates the info record for a host with an empty disallow list.
     *
     * @param hostName  name of the host this record describes
     * @param id        numeric id of the host
     */
    public HostInfo(String hostName, int id)
    {
        this.id = id;
        this.disallows = HostInfo.emptyKeepOutDirectories;
        this.hostName = hostName;
    }


    /**
     * is this host reachable and responding?
     *
     * @return   true as long as there are strikes left and the host is
     *      marked reachable
     */
    public boolean isHealthy()
    {
        return (healthyCount > 0) && isReachable;
    }


    /**
     * signals that the host returned with a bad request of whatever type;
     * uses up one strike
     */
    public void badRequest()
    {
        healthyCount--;
    }


    /**
     * Marks the host as reachable or unreachable.
     *
     * @param reachable  the new reachable flag
     */
    public void setReachable(boolean reachable)
    {
        isReachable = reachable;
    }


    /**
     * @return   the reachable flag as last set (true initially)
     */
    public boolean isReachable()
    {
        return isReachable;
    }


    /**
     * @return   the flag last passed to {@link #setRobotsChecked}
     *      (false initially)
     */
    public boolean isRobotTxtChecked()
    {
        return robotTxtChecked;
    }


    /**
     * must be synchronized externally
     *
     * @return   the flag last passed to {@link #setLoadingRobotsTxt}
     */
    public boolean isLoadingRobotsTxt()
    {
        return this.isLoadingRobotsTxt;
    }


    /**
     * Sets the loading flag. When set to true, a fresh request queue named
     * "HostInfo_&lt;id&gt;_QueuedRequests" replaces whatever queue was there
     * before; when set to false the queue is left untouched.
     *
     * @param isLoading  the new loading flag
     */
    public void setLoadingRobotsTxt(boolean isLoading)
    {
        this.isLoadingRobotsTxt = isLoading;
        if (isLoading)
        {
            // the meaning of the second argument (100) is defined by
            // CachingQueue, which is not visible from this class
            this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
        }
    }


    /**
     * Records the robot exclusion result for this host.
     *
     * Synchronized on the same monitor as {@link #isAllowed} so that the lock
     * taken there actually pairs with this writer. The disallow list is
     * installed before the checked flag, so the flag is never set while the
     * previous list is still in place.
     *
     * @param isChecked  the new value of the checked flag
     * @param disallows  disallowed path prefixes; null is treated as "none"
     */
    public synchronized void setRobotsChecked(boolean isChecked, String[] disallows)
    {
        this.disallows = (disallows != null) ? disallows : emptyKeepOutDirectories;
        this.robotTxtChecked = isChecked;
    }


    /**
     * Checks a path against the disallowed prefixes.
     *
     * @param path  the path to check; must not be null
     * @return      false if the path starts with any disallowed prefix, true
     *      otherwise
     */
    public synchronized boolean isAllowed(String path)
    {
        // assume keepOutDirectories is pretty short
        // assert disallows != null
        int length = disallows.length;
        for (int i = 0; i < length; i++)
        {
            if (path.startsWith(disallows[i]))
            {
                return false;
            }
        }
        return true;
    }
}

View File

@ -1,133 +0,0 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package de.lanlab.larm.fetcher;
import java.util.HashMap;
/**
 * Registry that maps host names to their {@link HostInfo} records and hands
 * out a numeric id for every host it sees for the first time.
 *
 * Thread-safety: all access to the map is synchronized on this instance; the
 * static id counter is guarded by the class lock because it is shared by all
 * HostManager instances.
 *
 * @author    Administrator
 * @created   16. Februar 2002
 * @version   $Id$
 */
public class HostManager
{
    /** host name (String) -> HostInfo */
    HashMap hosts;

    /** next id to hand out; shared by all HostManager instances */
    static int hostCount = 0;


    /**
     * Constructor for the HostManager object
     *
     * @param initialCapacity  initial capacity of the underlying hash map
     */
    public HostManager(int initialCapacity)
    {
        hosts = new HashMap(initialCapacity);
    }


    /**
     * Registers a host if it is not known yet.
     *
     * The lookup and the insertion happen under one lock, so two threads
     * asking for the same new host get the same HostInfo and only one id is
     * used up.
     *
     * @param hostName  name of the host; used as the map key as-is
     * @return          the existing record for the host, or a newly created
     *      one with a fresh id
     */
    public synchronized HostInfo put(String hostName)
    {
        HostInfo hi = (HostInfo) hosts.get(hostName);
        if (hi == null)
        {
            int hostID;
            // hostCount is static: the instance monitor alone would not
            // exclude other HostManager instances
            synchronized (HostManager.class)
            {
                hostID = hostCount++;
            }
            hi = new HostInfo(hostName, hostID);
            hosts.put(hostName, hi);
        }
        return hi;
    }


    /**
     * Gets the HostInfo record for a host, creating it on first use.
     *
     * @param hostName  name of the host; used as the map key as-is
     * @return          the record for the host, never null
     */
    public synchronized HostInfo getHostInfo(String hostName)
    {
        HostInfo hi = (HostInfo) hosts.get(hostName);
        if (hi == null)
        {
            return put(hostName);
        }
        return hi;
    }


    /**
     * @return   the number of hosts registered in this manager
     */
    public synchronized int getSize()
    {
        return hosts.size();
    }
}

View File

@ -63,6 +63,7 @@ import org.apache.oro.text.perl.Perl5Util;
import de.lanlab.larm.util.*; import de.lanlab.larm.util.*;
import de.lanlab.larm.threads.*; import de.lanlab.larm.threads.*;
import HTTPClient.*; import HTTPClient.*;
import de.lanlab.larm.net.*;
/** /**
* this factory simply creates fetcher threads. It's gonna be passed to the * this factory simply creates fetcher threads. It's gonna be passed to the
@ -164,13 +165,13 @@ public class RobotExclusionFilter extends Filter implements MessageListener
URLMessage urlMsg = ((URLMessage) message); URLMessage urlMsg = ((URLMessage) message);
URL url = urlMsg.getUrl(); URL url = urlMsg.getUrl();
//assert url != null; //assert url != null;
HostInfo h = hostManager.getHostInfo(url.getHost()); HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt()) if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
{ {
log.logThreadSafe("handleRequest: starting to get robots.txt"); log.logThreadSafe("handleRequest: starting to get robots.txt");
// probably this results in Race Conditions here // probably this results in Race Conditions here
rePool.doTask(new RobotExclusionTask(h), new Integer(h.id)); rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
h.setLoadingRobotsTxt(true); h.setLoadingRobotsTxt(true);
} }
@ -182,7 +183,7 @@ public class RobotExclusionFilter extends Filter implements MessageListener
//log.logThreadSafe("handleRequest: other thread is loading"); //log.logThreadSafe("handleRequest: other thread is loading");
// assert h.queuedRequests != null // assert h.queuedRequests != null
h.queuedRequests.insert(message); h.insertIntoQueue(message);
// not thread safe // not thread safe
log.logThreadSafe("handleRequest: queued file " + url); log.logThreadSafe("handleRequest: queued file " + url);
return null; return null;
@ -273,14 +274,14 @@ public class RobotExclusionFilter extends Filter implements MessageListener
// assert hostInfo != null; // assert hostInfo != null;
String threadName = Thread.currentThread().getName(); String threadName = Thread.currentThread().getName();
log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName); log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
//hostInfo.setLoadingRobotsTxt(true); //hostInfo.setLoadingRobotsTxt(true);
String[] disallows = null; String[] disallows = null;
boolean errorOccured = false; boolean errorOccured = false;
try try
{ {
log.logThreadSafe("task " + threadName + ": getting connection"); log.logThreadSafe("task " + threadName + ": getting connection");
HTTPConnection conn = new HTTPConnection(hostInfo.hostName); HTTPConnection conn = new HTTPConnection(hostInfo.getHostName());
conn.setTimeout(30000); conn.setTimeout(30000);
// wait at most 20 secs // wait at most 20 secs
@ -348,8 +349,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener
// crawl everything // crawl everything
hostInfo.setLoadingRobotsTxt(false); hostInfo.setLoadingRobotsTxt(false);
log.logThreadSafe("task " + threadName + ": error occured"); log.logThreadSafe("task " + threadName + ": error occured");
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
hostInfo.isLoadingRobotsTxt = false; hostInfo.setLoadingRobotsTxt(false);
putBackURLs(); putBackURLs();
} }
} }
@ -359,8 +360,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener
{ {
hostInfo.setRobotsChecked(true, disallows); hostInfo.setRobotsChecked(true, disallows);
log.logThreadSafe("task " + threadName + ": done"); log.logThreadSafe("task " + threadName + ": done");
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
hostInfo.isLoadingRobotsTxt = false; hostInfo.setLoadingRobotsTxt(false);
putBackURLs(); putBackURLs();
} }
} }
@ -373,12 +374,12 @@ public class RobotExclusionFilter extends Filter implements MessageListener
*/ */
private void putBackURLs() private void putBackURLs()
{ {
while (hostInfo.queuedRequests.size() > 0) while (hostInfo.getQueueSize() > 0)
{ {
messageHandler.putMessage((Message) hostInfo.queuedRequests.remove()); messageHandler.putMessage((Message) hostInfo.removeFromQueue());
} }
log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished"); log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
hostInfo.queuedRequests = null; hostInfo.removeQueue();
} }

View File

@ -61,6 +61,7 @@ import java.text.*;
import java.io.*; import java.io.*;
import de.lanlab.larm.util.State; import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLoggerManager; import de.lanlab.larm.util.SimpleLoggerManager;
import de.lanlab.larm.net.*;
/** /**
* this monitor takes a sample of every thread every x milliseconds, * this monitor takes a sample of every thread every x milliseconds,

View File

@ -1,66 +1,71 @@
/* ==================================================================== /*
* The Apache Software License, Version 1.1 * ====================================================================
* The Apache Software License, Version 1.1
* *
* Copyright (c) 2001 The Apache Software Foundation. All rights * Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved. * reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* *
* 1. Redistributions of source code must retain the above copyright * 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* *
* 2. Redistributions in binary form must reproduce the above copyright * 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in * notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the * the documentation and/or other materials provided with the
* distribution. * distribution.
* *
* 3. The end-user documentation included with the redistribution, * 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment: * if any, must include the following acknowledgment:
* "This product includes software developed by the * "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)." * Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself, * Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE. * SUCH DAMAGE.
* ==================================================================== * ====================================================================
* *
* This software consists of voluntary contributions made by many * This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more * individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see * information on the Apache Software Foundation, please see
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
package de.lanlab.larm.fetcher; package de.lanlab.larm.fetcher;
import java.net.*; import java.net.*;
import java.io.*; import java.io.*;
import de.lanlab.larm.util.URLUtils; import de.lanlab.larm.util.URLUtils;
import de.lanlab.larm.net.URLNormalizer;
import de.lanlab.larm.net.HostManager;
/** /**
* represents a URL which is passed around in the messageHandler * represents a URL which is passed around in the messageHandler
* @version $Id$ *
* @author Administrator
* @created 14. Juni 2002
* @version $Id$
*/ */
public class URLMessage implements Message, Serializable public class URLMessage implements Message, Serializable
{ {
@ -68,14 +73,51 @@ public class URLMessage implements Message, Serializable
* the URL * the URL
*/ */
protected URL url; protected URL url;
protected String urlString;
/**
* string form of {@link #url}; this is the value returned by toString() and
* getURLString(). On deserialization it is rebuilt from url.toExternalForm().
*/
protected volatile String urlString;
/**
* referer or null
*/
protected URL referer; protected URL referer;
protected String refererString;
/**
* externalized referer URL, cached to prevent multiple calls to
* referer.toExternalForm(); null if there is no referer
*/
protected volatile String refererString;
/**
* normalized referer URL: the external form of
* URLNormalizer.normalize(referer, hostManager), computed once in the
* constructor; null if there is no referer
*/
protected volatile String refererNormalizedString;
/**
* normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer}
* (lower case, index.* removed, all characters except alphanumeric ones escaped)
*/
protected String normalizedURLString;
/**
* frame flag handed to the constructor; getInfo() prints it as "1" or "0".
* NOTE(review): presumably true when the URL was found as a frame source - confirm
*/
boolean isFrame; boolean isFrame;
/**
* anchor text, as in &lt;a href="..."&gt;Anchor&lt;/a&gt;
*/
protected String anchor; protected String anchor;
public URLMessage(URL url, URL referer, boolean isFrame, String anchor)
/**
* Creates a message for the given URL and precomputes the string forms
* (plain and normalized) of the URL and of its referer, so that they do not
* have to be derived again each time the message is inspected.
*
* @param url the URL this message stands for; it is also passed to
*        URLNormalizer.normalize() to build the normalized URL string
* @param referer the URL of the referring document, or null if there is
*        none; in that case both referer strings stay null
* @param isFrame frame flag, stored unchanged
* @param anchor the anchor text of the link; null is replaced by the
*        empty string
* @param hostManager handed to URLNormalizer.normalize() together with
*        the url and the referer
*/
public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager)
{ {
//super(); //super();
this.url = url; this.url = url;
@ -83,69 +125,144 @@ public class URLMessage implements Message, Serializable
this.referer = referer; this.referer = referer;
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null; this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null;
this.isFrame = isFrame; this.isFrame = isFrame;
this.anchor = anchor != null ? anchor : ""; this.anchor = anchor != null ? anchor : "";
this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager));
//this.normalizedURLString = URLNormalizer.
//System.out.println("" + refererString + " -> " + urlString); //System.out.println("" + refererString + " -> " + urlString);
} }
/**
 * Returns the normalized string form of this message's URL.
 *
 * @return the value of the normalizedURLString field
 */
public String getNormalizedURLString() {
    return normalizedURLString;
}
/**
* Gets the url attribute of the URLMessage object
*
* @return The url value
*/
public URL getUrl() public URL getUrl()
{ {
return this.url; return this.url;
} }
/**
* Gets the referer attribute of the URLMessage object
*
* @return The referer value
*/
public URL getReferer() public URL getReferer()
{ {
return this.referer; return this.referer;
} }
/**
* Description of the Method
*
* @return Description of the Return Value
*/
public String toString() public String toString()
{ {
return urlString; return urlString;
} }
/**
* Gets the uRLString attribute of the URLMessage object
*
* @return The uRLString value
*/
public String getURLString() public String getURLString()
{ {
return urlString; return urlString;
} }
/**
* Gets the refererString attribute of the URLMessage object
*
* @return The refererString value
*/
public String getRefererString() public String getRefererString()
{ {
return refererString; return refererString;
} }
/**
* Gets the anchor attribute of the URLMessage object
*
* @return The anchor value
*/
public String getAnchor() public String getAnchor()
{ {
return anchor; return anchor;
} }
/**
* Description of the Method
*
* @return Description of the Return Value
*/
public int hashCode() public int hashCode()
{ {
return url.hashCode(); return url.hashCode();
} }
private void writeObject(java.io.ObjectOutputStream out) throws IOException
/**
* Description of the Method
*
* @param out Description of the Parameter
* @exception IOException Description of the Exception
*/
private void writeObject(java.io.ObjectOutputStream out)
throws IOException
{ {
out.writeObject(url); out.writeObject(url);
out.writeObject(referer); out.writeObject(referer);
out.writeBoolean(isFrame); out.writeBoolean(isFrame);
out.writeUTF(anchor); out.writeUTF(anchor);
out.writeUTF(refererNormalizedString);
out.writeUTF(normalizedURLString);
} }
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException
/**
* Description of the Method
*
* @param in Description of the Parameter
* @exception IOException Description of the Exception
* @exception ClassNotFoundException Description of the Exception
*/
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException
{ {
url = (URL)in.readObject(); url = (URL) in.readObject();
referer = (URL)in.readObject(); referer = (URL) in.readObject();
urlString = url.toExternalForm(); urlString = url.toExternalForm();
refererString = referer.toExternalForm(); refererString = referer.toExternalForm();
isFrame = in.readBoolean(); isFrame = in.readBoolean();
anchor = in.readUTF(); anchor = in.readUTF();
refererNormalizedString = in.readUTF();
normalizedURLString = in.readUTF();
} }
/**
* Gets the info attribute of the URLMessage object
*
* @return The info value
*/
public String getInfo() public String getInfo()
{ {
return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor; return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
} }
} }

View File

@ -123,7 +123,7 @@ public class URLVisitedFilter extends Filter implements MessageListener
{ {
URLMessage urlMessage = ((URLMessage) message); URLMessage urlMessage = ((URLMessage) message);
URL url = urlMessage.getUrl(); URL url = urlMessage.getUrl();
String urlString = urlMessage.getURLString(); String urlString = urlMessage.getNormalizedURLString();
if (urlHash.contains(urlString)) if (urlHash.contains(urlString))
{ {
//System.out.println("URLVisitedFilter: " + urlString + " already present."); //System.out.println("URLVisitedFilter: " + urlString + " already present.");