diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java index 0783da522b5..ec831f616c3 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java @@ -65,6 +65,7 @@ import java.net.URL; import java.util.LinkedList; import de.lanlab.larm.fetcher.FetcherTask; +import de.lanlab.larm.net.*; /** * filter class; the Fetcher is the main class which keeps the ThreadPool that diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java index d724791c0a5..a326ceee19c 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java @@ -62,6 +62,7 @@ import java.util.*; import de.lanlab.larm.gui.*; import de.lanlab.larm.util.*; import de.lanlab.larm.storage.*; +import de.lanlab.larm.net.*; import javax.swing.UIManager; import HTTPClient.*; import org.apache.oro.text.regex.MalformedPatternException; @@ -278,7 +279,7 @@ public class FetcherMain { try { - messageHandler.putMessage(new URLMessage(url, null, isFrame, null)); + messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager)); } catch (Exception e) { diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java index f299e348ae8..7355add2ff5 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java @@ -186,16 +186,17 @@ public class FetcherTaskQueue extends TaskQueue public static void main(String args[]) { FetcherTaskQueue q = new FetcherTaskQueue(); + de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10); System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo"); try { - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm))); } catch (Throwable t) { @@ -217,9 +218,9 @@ public class FetcherTaskQueue extends TaskQueue try { System.out.println("put 3 lmus."); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm))); System.out.print("pull out 1st element [lmu/1]: "); System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println("size now [2]: " + q.size()); @@ -227,9 +228,9 @@ public class FetcherTaskQueue extends TaskQueue System.out.println(((FetcherTask) q.remove()).getInfo()); System.out.println("size now [1]: " + q.size()); System.out.println("put in 3 yahoos"); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null))); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm))); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [3]: " + q.size()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); @@ -237,7 +238,7 @@ public class FetcherTaskQueue extends TaskQueue System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [1]: " + q.size()); System.out.println("put in another Yahoo"); - q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null))); + q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm))); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); System.out.println("Size now [1]: " + q.size()); System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo()); diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java index 3473a273ee8..9a0aca2483c 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java @@ -56,6 +56,7 @@ package de.lanlab.larm.fetcher; import de.lanlab.larm.threads.ServerThread; import de.lanlab.larm.util.State; +import de.lanlab.larm.net.HostManager; /** * a server thread for the thread pool that records the number diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java index 24a47d31fc5..7167728b43a 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java @@ -1,64 +1,69 @@ -/* ==================================================================== - * The Apache Software License, Version 1.1 +/* + * ==================================================================== + * The Apache Software License, Version 1.1 * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . */ - package de.lanlab.larm.fetcher; import de.lanlab.larm.threads.*; +import de.lanlab.larm.net.*; /** - * this factory simply creates fetcher threads. It's passed - * to the ThreadPool because the pool is creating the threads on its own - * @version $Id$ + * this factory simply creates fetcher threads. It's passed to the ThreadPool + * because the pool is creating the threads on its own + * + * @author Administrator + * @created 14. Juni 2002 + * @version $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17 + * cmarschner Exp $ */ public class FetcherThreadFactory extends ThreadFactory { @@ -69,16 +74,28 @@ public class FetcherThreadFactory extends ThreadFactory HostManager hostManager; + + /** + * Constructor for the FetcherThreadFactory object + * + * @param hostManager Description of the Parameter + */ public FetcherThreadFactory(HostManager hostManager) { this.hostManager = hostManager; } - public ServerThread createServerThread(int count) + /** + * Description of the Method + * + * @param count Description of the Parameter + * @return Description of the Return Value + */ + public ServerThread createServerThread(int count) { ServerThread newThread = new FetcherThread(count, threadGroup, hostManager); newThread.setPriority(4); return newThread; } -} \ No newline at end of file +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java deleted file mode 100644 index dd28bba7beb..00000000000 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java +++ /dev/null @@ -1,168 +0,0 @@ -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -package de.lanlab.larm.fetcher; - -import java.util.HashMap; -import java.net.*; -import de.lanlab.larm.util.CachingQueue; -import de.lanlab.larm.util.Queue; - -/** - * contains information about a host. If a host doesn't respond too often, it's - * excluded from the crawl. - * This class is used by the HostManager - * - * @author Clemens Marschner - * @created 16. Februar 2002 - * @version $Id$ - */ -public class HostInfo -{ - static final String[] emptyKeepOutDirectories = new String[0]; - - int id; - int healthyCount = 5; // five strikes, and you're out - boolean isReachable = true; - boolean robotTxtChecked = false; - String[] disallows; // robot exclusion - boolean isLoadingRobotsTxt = false; - Queue queuedRequests = null; // robot exclusion - String hostName; - - public HostInfo(String hostName, int id) - { - this.id = id; - this.disallows = HostInfo.emptyKeepOutDirectories; - this.hostName = hostName; - } - - /** - * is this host reachable and responding? - */ - public boolean isHealthy() - { - return (healthyCount > 0) && isReachable; - } - - /** - * signals that the host returned with a bad request of whatever type - */ - public void badRequest() - { - healthyCount--; - } - - public void setReachable(boolean reachable) - { - isReachable = reachable; - } - - public boolean isReachable() - { - return isReachable; - } - - public boolean isRobotTxtChecked() - { - return robotTxtChecked; - } - - /** - * must be synchronized externally - */ - public boolean isLoadingRobotsTxt() - { - return this.isLoadingRobotsTxt; - } - - public void setLoadingRobotsTxt(boolean isLoading) - { - this.isLoadingRobotsTxt = isLoading; - if(isLoading) - { - this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100); - } - - } - - public void setRobotsChecked(boolean isChecked, String[] disallows) - { - this.robotTxtChecked = isChecked; - if(disallows != null) - { - this.disallows = disallows; - } - else - { - this.disallows = emptyKeepOutDirectories; - } - - } - - public synchronized boolean isAllowed(String path) - { - // assume keepOutDirectories is pretty short - // assert disallows != null - int length = disallows.length; - for(int i=0; i. - */ - -package de.lanlab.larm.fetcher; - -import java.util.HashMap; - -/** - * Description of the Class - * - * @author Administrator - * @created 16. Februar 2002 - * @version $Id$ - */ -public class HostManager -{ - HashMap hosts; - static int hostCount = 0; - - - /** - * Constructor for the HostInfo object - * - * @param initialSize Description of the Parameter - */ - public HostManager(int initialCapacity) - { - hosts = new HashMap(initialCapacity); - } - - - /** - * Description of the Method - * - * @param hostName Description of the Parameter - * @return Description of the Return Value - */ - public HostInfo put(String hostName) - { - if (!hosts.containsKey(hostName)) - { - int hostID; - synchronized (this) - { - hostID = hostCount++; - } - HostInfo hi = new HostInfo(hostName,hostID); - hosts.put(hostName, hi); - return hi; - } - return (HostInfo)hosts.get(hostName); - /*else - { - hostID = hosts.get() - } - // assert hostID != -1; - return hostID;*/ - - } - - - /** - * Gets the hostID attribute of the HostInfo object - * - * @param hostName Description of the Parameter - * @return The hostID value - */ - public HostInfo getHostInfo(String hostName) - { - HostInfo hi = (HostInfo)hosts.get(hostName); - if(hi == null) - { - return put(hostName); - } - return hi; - } - - public int getSize() - { - return hosts.size(); - } -} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java index 76e1a363dee..24b988c3529 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java @@ -63,6 +63,7 @@ import org.apache.oro.text.perl.Perl5Util; import de.lanlab.larm.util.*; import de.lanlab.larm.threads.*; import HTTPClient.*; +import de.lanlab.larm.net.*; /** * this factory simply creates fetcher threads. It's gonna be passed to the @@ -164,13 +165,13 @@ public class RobotExclusionFilter extends Filter implements MessageListener URLMessage urlMsg = ((URLMessage) message); URL url = urlMsg.getUrl(); //assert url != null; - HostInfo h = hostManager.getHostInfo(url.getHost()); + HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase()); if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt()) { log.logThreadSafe("handleRequest: starting to get robots.txt"); // probably this results in Race Conditions here - rePool.doTask(new RobotExclusionTask(h), new Integer(h.id)); + rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId())); h.setLoadingRobotsTxt(true); } @@ -182,7 +183,7 @@ public class RobotExclusionFilter extends Filter implements MessageListener //log.logThreadSafe("handleRequest: other thread is loading"); // assert h.queuedRequests != null - h.queuedRequests.insert(message); + h.insertIntoQueue(message); // not thread safe log.logThreadSafe("handleRequest: queued file " + url); return null; @@ -273,14 +274,14 @@ public class RobotExclusionFilter extends Filter implements MessageListener // assert hostInfo != null; String threadName = Thread.currentThread().getName(); - log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName); + log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName()); //hostInfo.setLoadingRobotsTxt(true); String[] disallows = null; boolean errorOccured = false; try { log.logThreadSafe("task " + threadName + ": getting connection"); - HTTPConnection conn = new HTTPConnection(hostInfo.hostName); + HTTPConnection conn = new HTTPConnection(hostInfo.getHostName()); conn.setTimeout(30000); // wait at most 20 secs @@ -348,8 +349,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener // crawl everything hostInfo.setLoadingRobotsTxt(false); log.logThreadSafe("task " + threadName + ": error occured"); - log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); - hostInfo.isLoadingRobotsTxt = false; + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back"); + hostInfo.setLoadingRobotsTxt(false); putBackURLs(); } } @@ -359,8 +360,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener { hostInfo.setRobotsChecked(true, disallows); log.logThreadSafe("task " + threadName + ": done"); - log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); - hostInfo.isLoadingRobotsTxt = false; + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back"); + hostInfo.setLoadingRobotsTxt(false); putBackURLs(); } } @@ -373,12 +374,12 @@ public class RobotExclusionFilter extends Filter implements MessageListener */ private void putBackURLs() { - while (hostInfo.queuedRequests.size() > 0) + while (hostInfo.getQueueSize() > 0) { - messageHandler.putMessage((Message) hostInfo.queuedRequests.remove()); + messageHandler.putMessage((Message) hostInfo.removeFromQueue()); } log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished"); - hostInfo.queuedRequests = null; + hostInfo.removeQueue(); } diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java index 3e07a0401a8..945f03631f3 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java @@ -61,6 +61,7 @@ import java.text.*; import java.io.*; import de.lanlab.larm.util.State; import de.lanlab.larm.util.SimpleLoggerManager; +import de.lanlab.larm.net.*; /** * this monitor takes a sample of every thread every x milliseconds, diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java index c70a7370c34..80b2fe15d07 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java @@ -1,66 +1,71 @@ -/* ==================================================================== - * The Apache Software License, Version 1.1 +/* + * ==================================================================== + * The Apache Software License, Version 1.1 * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . */ - package de.lanlab.larm.fetcher; import java.net.*; import java.io.*; import de.lanlab.larm.util.URLUtils; +import de.lanlab.larm.net.URLNormalizer; +import de.lanlab.larm.net.HostManager; /** * represents a URL which is passed around in the messageHandler - * @version $Id$ + * + * @author Administrator + * @created 14. Juni 2002 + * @version $Id$ */ public class URLMessage implements Message, Serializable { @@ -68,14 +73,51 @@ public class URLMessage implements Message, Serializable * the URL */ protected URL url; - protected String urlString; + /** + * Description of the Field + */ + protected volatile String urlString; + + /** + * referer or null + */ protected URL referer; - protected String refererString; + + /** + * externalized referer URL, to prevent multiple calls to url.toExternalForm() + */ + protected volatile String refererString; + + /** + * externalized referer URL, to prevent multiple calls to url.toExternalForm() + */ + protected volatile String refererNormalizedString; + + /** + * normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer} + * (lower case, index.* removed, all characters except alphanumeric ones escaped) + */ + protected String normalizedURLString; + + boolean isFrame; + + /** + * anchor text, as in <a href="...">Anchor</a> + */ protected String anchor; - public URLMessage(URL url, URL referer, boolean isFrame, String anchor) + + /** + * Constructor for the URLMessage object + * + * @param url Description of the Parameter + * @param referer Description of the Parameter + * @param isFrame Description of the Parameter + * @param anchor Description of the Parameter + */ + public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager) { //super(); this.url = url; @@ -83,69 +125,144 @@ public class URLMessage implements Message, Serializable this.referer = referer; this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null; + this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null; this.isFrame = isFrame; this.anchor = anchor != null ? anchor : ""; + this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager)); + //this.normalizedURLString = URLNormalizer. //System.out.println("" + refererString + " -> " + urlString); } + public String getNormalizedURLString() + { + return this.normalizedURLString; + } + + /** + * Gets the url attribute of the URLMessage object + * + * @return The url value + */ public URL getUrl() { return this.url; } + + /** + * Gets the referer attribute of the URLMessage object + * + * @return The referer value + */ public URL getReferer() { return this.referer; } + /** + * Description of the Method + * + * @return Description of the Return Value + */ public String toString() { return urlString; } + + /** + * Gets the uRLString attribute of the URLMessage object + * + * @return The uRLString value + */ public String getURLString() { return urlString; } + + /** + * Gets the refererString attribute of the URLMessage object + * + * @return The refererString value + */ public String getRefererString() { return refererString; } + + /** + * Gets the anchor attribute of the URLMessage object + * + * @return The anchor value + */ public String getAnchor() { return anchor; } + /** + * Description of the Method + * + * @return Description of the Return Value + */ public int hashCode() { return url.hashCode(); } - private void writeObject(java.io.ObjectOutputStream out) throws IOException + + /** + * Description of the Method + * + * @param out Description of the Parameter + * @exception IOException Description of the Exception + */ + private void writeObject(java.io.ObjectOutputStream out) + throws IOException { out.writeObject(url); out.writeObject(referer); out.writeBoolean(isFrame); out.writeUTF(anchor); + out.writeUTF(refererNormalizedString); + out.writeUTF(normalizedURLString); + } - private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException + + /** + * Description of the Method + * + * @param in Description of the Parameter + * @exception IOException Description of the Exception + * @exception ClassNotFoundException Description of the Exception + */ + private void readObject(java.io.ObjectInputStream in) + throws IOException, ClassNotFoundException { - url = (URL)in.readObject(); - referer = (URL)in.readObject(); + url = (URL) in.readObject(); + referer = (URL) in.readObject(); urlString = url.toExternalForm(); refererString = referer.toExternalForm(); isFrame = in.readBoolean(); anchor = in.readUTF(); + refererNormalizedString = in.readUTF(); + normalizedURLString = in.readUTF(); } + + /** + * Gets the info attribute of the URLMessage object + * + * @return The info value + */ public String getInfo() { - return (referer != null ? refererString : "") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor; + return (referer != null ? refererString : "") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor; } } diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java index 4ed5feb9e0d..728f6dcb528 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java @@ -123,7 +123,7 @@ public class URLVisitedFilter extends Filter implements MessageListener { URLMessage urlMessage = ((URLMessage) message); URL url = urlMessage.getUrl(); - String urlString = urlMessage.getURLString(); + String urlString = urlMessage.getNormalizedURLString(); if (urlHash.contains(urlString)) { //System.out.println("URLVisitedFilter: " + urlString + " already present.");