mirror of https://github.com/apache/lucene.git
added URLNormalizer. Changed filters to use normalized URLs if possible
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
14fdfb458f
commit
5b90c10cb5
|
@ -65,6 +65,7 @@ import java.net.URL;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
|
||||||
import de.lanlab.larm.fetcher.FetcherTask;
|
import de.lanlab.larm.fetcher.FetcherTask;
|
||||||
|
import de.lanlab.larm.net.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* filter class; the Fetcher is the main class which keeps the ThreadPool that
|
* filter class; the Fetcher is the main class which keeps the ThreadPool that
|
||||||
|
|
|
@ -62,6 +62,7 @@ import java.util.*;
|
||||||
import de.lanlab.larm.gui.*;
|
import de.lanlab.larm.gui.*;
|
||||||
import de.lanlab.larm.util.*;
|
import de.lanlab.larm.util.*;
|
||||||
import de.lanlab.larm.storage.*;
|
import de.lanlab.larm.storage.*;
|
||||||
|
import de.lanlab.larm.net.*;
|
||||||
import javax.swing.UIManager;
|
import javax.swing.UIManager;
|
||||||
import HTTPClient.*;
|
import HTTPClient.*;
|
||||||
import org.apache.oro.text.regex.MalformedPatternException;
|
import org.apache.oro.text.regex.MalformedPatternException;
|
||||||
|
@ -278,7 +279,7 @@ public class FetcherMain
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
messageHandler.putMessage(new URLMessage(url, null, isFrame, null));
|
messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
|
||||||
}
|
}
|
||||||
catch (Exception e)
|
catch (Exception e)
|
||||||
{
|
{
|
||||||
|
|
|
@ -186,16 +186,17 @@ public class FetcherTaskQueue extends TaskQueue
|
||||||
public static void main(String args[])
|
public static void main(String args[])
|
||||||
{
|
{
|
||||||
FetcherTaskQueue q = new FetcherTaskQueue();
|
FetcherTaskQueue q = new FetcherTaskQueue();
|
||||||
|
de.lanlab.larm.net.HostManager hm = new de.lanlab.larm.net.HostManager(10);
|
||||||
System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
|
System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
|
||||||
}
|
}
|
||||||
catch (Throwable t)
|
catch (Throwable t)
|
||||||
{
|
{
|
||||||
|
@ -217,9 +218,9 @@ public class FetcherTaskQueue extends TaskQueue
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
System.out.println("put 3 lmus.");
|
System.out.println("put 3 lmus.");
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false, null, hm)));
|
||||||
System.out.print("pull out 1st element [lmu/1]: ");
|
System.out.print("pull out 1st element [lmu/1]: ");
|
||||||
System.out.println(((FetcherTask) q.remove()).getInfo());
|
System.out.println(((FetcherTask) q.remove()).getInfo());
|
||||||
System.out.println("size now [2]: " + q.size());
|
System.out.println("size now [2]: " + q.size());
|
||||||
|
@ -227,9 +228,9 @@ public class FetcherTaskQueue extends TaskQueue
|
||||||
System.out.println(((FetcherTask) q.remove()).getInfo());
|
System.out.println(((FetcherTask) q.remove()).getInfo());
|
||||||
System.out.println("size now [1]: " + q.size());
|
System.out.println("size now [1]: " + q.size());
|
||||||
System.out.println("put in 3 yahoos");
|
System.out.println("put in 3 yahoos");
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false, null, hm)));
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false, null, hm)));
|
||||||
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
||||||
System.out.println("Size now [3]: " + q.size());
|
System.out.println("Size now [3]: " + q.size());
|
||||||
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
||||||
|
@ -237,7 +238,7 @@ public class FetcherTaskQueue extends TaskQueue
|
||||||
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
||||||
System.out.println("Size now [1]: " + q.size());
|
System.out.println("Size now [1]: " + q.size());
|
||||||
System.out.println("put in another Yahoo");
|
System.out.println("put in another Yahoo");
|
||||||
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null)));
|
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false, null, hm)));
|
||||||
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
||||||
System.out.println("Size now [1]: " + q.size());
|
System.out.println("Size now [1]: " + q.size());
|
||||||
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
|
||||||
|
|
|
@ -56,6 +56,7 @@ package de.lanlab.larm.fetcher;
|
||||||
|
|
||||||
import de.lanlab.larm.threads.ServerThread;
|
import de.lanlab.larm.threads.ServerThread;
|
||||||
import de.lanlab.larm.util.State;
|
import de.lanlab.larm.util.State;
|
||||||
|
import de.lanlab.larm.net.HostManager;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* a server thread for the thread pool that records the number
|
* a server thread for the thread pool that records the number
|
||||||
|
|
|
@ -1,64 +1,69 @@
|
||||||
/* ====================================================================
|
/*
|
||||||
* The Apache Software License, Version 1.1
|
* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
*
|
*
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions
|
* modification, are permitted provided that the following conditions
|
||||||
* are met:
|
* are met:
|
||||||
*
|
*
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
* notice, this list of conditions and the following disclaimer.
|
* notice, this list of conditions and the following disclaimer.
|
||||||
*
|
*
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
* notice, this list of conditions and the following disclaimer in
|
* notice, this list of conditions and the following disclaimer in
|
||||||
* the documentation and/or other materials provided with the
|
* the documentation and/or other materials provided with the
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* 3. The end-user documentation included with the redistribution,
|
* 3. The end-user documentation included with the redistribution,
|
||||||
* if any, must include the following acknowledgment:
|
* if any, must include the following acknowledgment:
|
||||||
* "This product includes software developed by the
|
* "This product includes software developed by the
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
* Alternately, this acknowledgment may appear in the software itself,
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Lucene" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
* SUCH DAMAGE.
|
* SUCH DAMAGE.
|
||||||
* ====================================================================
|
* ====================================================================
|
||||||
*
|
*
|
||||||
* This software consists of voluntary contributions made by many
|
* This software consists of voluntary contributions made by many
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
* information on the Apache Software Foundation, please see
|
* information on the Apache Software Foundation, please see
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package de.lanlab.larm.fetcher;
|
package de.lanlab.larm.fetcher;
|
||||||
import de.lanlab.larm.threads.*;
|
import de.lanlab.larm.threads.*;
|
||||||
|
import de.lanlab.larm.net.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this factory simply creates fetcher threads. It's passed
|
* this factory simply creates fetcher threads. It's passed to the ThreadPool
|
||||||
* to the ThreadPool because the pool is creating the threads on its own
|
* because the pool is creating the threads on its own
|
||||||
* @version $Id$
|
*
|
||||||
|
* @author Administrator
|
||||||
|
* @created 14. Juni 2002
|
||||||
|
* @version $Id: FetcherThreadFactory.java,v 1.2 2002/05/22 23:09:17
|
||||||
|
* cmarschner Exp $
|
||||||
*/
|
*/
|
||||||
public class FetcherThreadFactory extends ThreadFactory
|
public class FetcherThreadFactory extends ThreadFactory
|
||||||
{
|
{
|
||||||
|
@ -69,13 +74,25 @@ public class FetcherThreadFactory extends ThreadFactory
|
||||||
|
|
||||||
HostManager hostManager;
|
HostManager hostManager;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for the FetcherThreadFactory object
|
||||||
|
*
|
||||||
|
* @param hostManager the HostManager that is passed on to every FetcherThread created by this factory
|
||||||
|
*/
|
||||||
public FetcherThreadFactory(HostManager hostManager)
|
public FetcherThreadFactory(HostManager hostManager)
|
||||||
{
|
{
|
||||||
this.hostManager = hostManager;
|
this.hostManager = hostManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public ServerThread createServerThread(int count)
|
/**
|
||||||
|
* Creates a new FetcherThread in this factory's thread group and sets its priority to 4
|
||||||
|
*
|
||||||
|
* @param count Description of the Parameter
|
||||||
|
* @return Description of the Return Value
|
||||||
|
*/
|
||||||
|
public ServerThread createServerThread(int count)
|
||||||
{
|
{
|
||||||
ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
|
ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
|
||||||
newThread.setPriority(4);
|
newThread.setPriority(4);
|
||||||
|
|
|
@ -1,168 +0,0 @@
|
||||||
/* ====================================================================
|
|
||||||
* The Apache Software License, Version 1.1
|
|
||||||
*
|
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in
|
|
||||||
* the documentation and/or other materials provided with the
|
|
||||||
* distribution.
|
|
||||||
*
|
|
||||||
* 3. The end-user documentation included with the redistribution,
|
|
||||||
* if any, must include the following acknowledgment:
|
|
||||||
* "This product includes software developed by the
|
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
|
||||||
* Alternately, this acknowledgment may appear in the software itself,
|
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
|
||||||
*
|
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
|
||||||
* "Apache Lucene" must not be used to endorse or promote products
|
|
||||||
* derived from this software without prior written permission. For
|
|
||||||
* written permission, please contact apache@apache.org.
|
|
||||||
*
|
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
|
||||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
|
||||||
* prior written permission of the Apache Software Foundation.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
* ====================================================================
|
|
||||||
*
|
|
||||||
* This software consists of voluntary contributions made by many
|
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
|
||||||
* information on the Apache Software Foundation, please see
|
|
||||||
* <http://www.apache.org/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package de.lanlab.larm.fetcher;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.net.*;
|
|
||||||
import de.lanlab.larm.util.CachingQueue;
|
|
||||||
import de.lanlab.larm.util.Queue;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* contains information about a host. If a host doesn't respond too often, it's
|
|
||||||
* excluded from the crawl.
|
|
||||||
* This class is used by the HostManager
|
|
||||||
*
|
|
||||||
* @author Clemens Marschner
|
|
||||||
* @created 16. Februar 2002
|
|
||||||
* @version $Id$
|
|
||||||
*/
|
|
||||||
public class HostInfo
|
|
||||||
{
|
|
||||||
static final String[] emptyKeepOutDirectories = new String[0];
|
|
||||||
|
|
||||||
int id;
|
|
||||||
int healthyCount = 5; // five strikes, and you're out
|
|
||||||
boolean isReachable = true;
|
|
||||||
boolean robotTxtChecked = false;
|
|
||||||
String[] disallows; // robot exclusion
|
|
||||||
boolean isLoadingRobotsTxt = false;
|
|
||||||
Queue queuedRequests = null; // robot exclusion
|
|
||||||
String hostName;
|
|
||||||
|
|
||||||
public HostInfo(String hostName, int id)
|
|
||||||
{
|
|
||||||
this.id = id;
|
|
||||||
this.disallows = HostInfo.emptyKeepOutDirectories;
|
|
||||||
this.hostName = hostName;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* is this host reachable and responding?
|
|
||||||
*/
|
|
||||||
public boolean isHealthy()
|
|
||||||
{
|
|
||||||
return (healthyCount > 0) && isReachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* signals that the host returned with a bad request of whatever type
|
|
||||||
*/
|
|
||||||
public void badRequest()
|
|
||||||
{
|
|
||||||
healthyCount--;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReachable(boolean reachable)
|
|
||||||
{
|
|
||||||
isReachable = reachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isReachable()
|
|
||||||
{
|
|
||||||
return isReachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isRobotTxtChecked()
|
|
||||||
{
|
|
||||||
return robotTxtChecked;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* must be synchronized externally
|
|
||||||
*/
|
|
||||||
public boolean isLoadingRobotsTxt()
|
|
||||||
{
|
|
||||||
return this.isLoadingRobotsTxt;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLoadingRobotsTxt(boolean isLoading)
|
|
||||||
{
|
|
||||||
this.isLoadingRobotsTxt = isLoading;
|
|
||||||
if(isLoading)
|
|
||||||
{
|
|
||||||
this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setRobotsChecked(boolean isChecked, String[] disallows)
|
|
||||||
{
|
|
||||||
this.robotTxtChecked = isChecked;
|
|
||||||
if(disallows != null)
|
|
||||||
{
|
|
||||||
this.disallows = disallows;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
this.disallows = emptyKeepOutDirectories;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public synchronized boolean isAllowed(String path)
|
|
||||||
{
|
|
||||||
// assume keepOutDirectories is pretty short
|
|
||||||
// assert disallows != null
|
|
||||||
int length = disallows.length;
|
|
||||||
for(int i=0; i<length; i++)
|
|
||||||
{
|
|
||||||
if(path.startsWith(disallows[i]))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,133 +0,0 @@
|
||||||
/* ====================================================================
|
|
||||||
* The Apache Software License, Version 1.1
|
|
||||||
*
|
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in
|
|
||||||
* the documentation and/or other materials provided with the
|
|
||||||
* distribution.
|
|
||||||
*
|
|
||||||
* 3. The end-user documentation included with the redistribution,
|
|
||||||
* if any, must include the following acknowledgment:
|
|
||||||
* "This product includes software developed by the
|
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
|
||||||
* Alternately, this acknowledgment may appear in the software itself,
|
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
|
||||||
*
|
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
|
||||||
* "Apache Lucene" must not be used to endorse or promote products
|
|
||||||
* derived from this software without prior written permission. For
|
|
||||||
* written permission, please contact apache@apache.org.
|
|
||||||
*
|
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
|
||||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
|
||||||
* prior written permission of the Apache Software Foundation.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
* ====================================================================
|
|
||||||
*
|
|
||||||
* This software consists of voluntary contributions made by many
|
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
|
||||||
* information on the Apache Software Foundation, please see
|
|
||||||
* <http://www.apache.org/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package de.lanlab.larm.fetcher;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Registry of hosts: maps each host name to its HostInfo and gives every newly registered host the next sequential id
|
|
||||||
*
|
|
||||||
* @author Administrator
|
|
||||||
* @created 16. Februar 2002
|
|
||||||
* @version $Id$
|
|
||||||
*/
|
|
||||||
public class HostManager
|
|
||||||
{
|
|
||||||
HashMap hosts;
|
|
||||||
static int hostCount = 0;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructor for the HostManager object
|
|
||||||
*
|
|
||||||
* @param initialCapacity initial capacity of the internal host map
|
|
||||||
*/
|
|
||||||
public HostManager(int initialCapacity)
|
|
||||||
{
|
|
||||||
hosts = new HashMap(initialCapacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Registers a host under the given name if it is not yet known (assigning it the next host id) and returns its HostInfo
|
|
||||||
*
|
|
||||||
* @param hostName name of the host to register
|
|
||||||
* @return the newly created or already registered HostInfo for hostName
|
|
||||||
*/
|
|
||||||
public HostInfo put(String hostName)
|
|
||||||
{
|
|
||||||
if (!hosts.containsKey(hostName))
|
|
||||||
{
|
|
||||||
int hostID;
|
|
||||||
synchronized (this)
|
|
||||||
{
|
|
||||||
hostID = hostCount++;
|
|
||||||
}
|
|
||||||
HostInfo hi = new HostInfo(hostName,hostID);
|
|
||||||
hosts.put(hostName, hi);
|
|
||||||
return hi;
|
|
||||||
}
|
|
||||||
return (HostInfo)hosts.get(hostName);
|
|
||||||
/*else
|
|
||||||
{
|
|
||||||
hostID = hosts.get()
|
|
||||||
}
|
|
||||||
// assert hostID != -1;
|
|
||||||
return hostID;*/
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the HostInfo for the given host name, registering the host first if it is not yet known
|
|
||||||
*
|
|
||||||
* @param hostName Description of the Parameter
|
|
||||||
* @return the HostInfo for hostName
|
|
||||||
*/
|
|
||||||
public HostInfo getHostInfo(String hostName)
|
|
||||||
{
|
|
||||||
HostInfo hi = (HostInfo)hosts.get(hostName);
|
|
||||||
if(hi == null)
|
|
||||||
{
|
|
||||||
return put(hostName);
|
|
||||||
}
|
|
||||||
return hi;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getSize()
|
|
||||||
{
|
|
||||||
return hosts.size();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -63,6 +63,7 @@ import org.apache.oro.text.perl.Perl5Util;
|
||||||
import de.lanlab.larm.util.*;
|
import de.lanlab.larm.util.*;
|
||||||
import de.lanlab.larm.threads.*;
|
import de.lanlab.larm.threads.*;
|
||||||
import HTTPClient.*;
|
import HTTPClient.*;
|
||||||
|
import de.lanlab.larm.net.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this factory simply creates fetcher threads. It's gonna be passed to the
|
* this factory simply creates fetcher threads. It's gonna be passed to the
|
||||||
|
@ -164,13 +165,13 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
URLMessage urlMsg = ((URLMessage) message);
|
URLMessage urlMsg = ((URLMessage) message);
|
||||||
URL url = urlMsg.getUrl();
|
URL url = urlMsg.getUrl();
|
||||||
//assert url != null;
|
//assert url != null;
|
||||||
HostInfo h = hostManager.getHostInfo(url.getHost());
|
HostInfo h = hostManager.getHostInfo(url.getHost().toLowerCase());
|
||||||
if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
|
if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
|
||||||
{
|
{
|
||||||
log.logThreadSafe("handleRequest: starting to get robots.txt");
|
log.logThreadSafe("handleRequest: starting to get robots.txt");
|
||||||
// probably this results in Race Conditions here
|
// probably this results in Race Conditions here
|
||||||
|
|
||||||
rePool.doTask(new RobotExclusionTask(h), new Integer(h.id));
|
rePool.doTask(new RobotExclusionTask(h), new Integer(h.getId()));
|
||||||
h.setLoadingRobotsTxt(true);
|
h.setLoadingRobotsTxt(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,7 +183,7 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
|
|
||||||
//log.logThreadSafe("handleRequest: other thread is loading");
|
//log.logThreadSafe("handleRequest: other thread is loading");
|
||||||
// assert h.queuedRequests != null
|
// assert h.queuedRequests != null
|
||||||
h.queuedRequests.insert(message);
|
h.insertIntoQueue(message);
|
||||||
// not thread safe
|
// not thread safe
|
||||||
log.logThreadSafe("handleRequest: queued file " + url);
|
log.logThreadSafe("handleRequest: queued file " + url);
|
||||||
return null;
|
return null;
|
||||||
|
@ -273,14 +274,14 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
// assert hostInfo != null;
|
// assert hostInfo != null;
|
||||||
String threadName = Thread.currentThread().getName();
|
String threadName = Thread.currentThread().getName();
|
||||||
|
|
||||||
log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName);
|
log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.getHostName());
|
||||||
//hostInfo.setLoadingRobotsTxt(true);
|
//hostInfo.setLoadingRobotsTxt(true);
|
||||||
String[] disallows = null;
|
String[] disallows = null;
|
||||||
boolean errorOccured = false;
|
boolean errorOccured = false;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
log.logThreadSafe("task " + threadName + ": getting connection");
|
log.logThreadSafe("task " + threadName + ": getting connection");
|
||||||
HTTPConnection conn = new HTTPConnection(hostInfo.hostName);
|
HTTPConnection conn = new HTTPConnection(hostInfo.getHostName());
|
||||||
conn.setTimeout(30000);
|
conn.setTimeout(30000);
|
||||||
// wait at most 30 secs
|
// wait at most 30 secs
|
||||||
|
|
||||||
|
@ -348,8 +349,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
// crawl everything
|
// crawl everything
|
||||||
hostInfo.setLoadingRobotsTxt(false);
|
hostInfo.setLoadingRobotsTxt(false);
|
||||||
log.logThreadSafe("task " + threadName + ": error occured");
|
log.logThreadSafe("task " + threadName + ": error occured");
|
||||||
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
|
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
|
||||||
hostInfo.isLoadingRobotsTxt = false;
|
hostInfo.setLoadingRobotsTxt(false);
|
||||||
putBackURLs();
|
putBackURLs();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -359,8 +360,8 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
{
|
{
|
||||||
hostInfo.setRobotsChecked(true, disallows);
|
hostInfo.setRobotsChecked(true, disallows);
|
||||||
log.logThreadSafe("task " + threadName + ": done");
|
log.logThreadSafe("task " + threadName + ": done");
|
||||||
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
|
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.getQueueSize() + " queueud requests back");
|
||||||
hostInfo.isLoadingRobotsTxt = false;
|
hostInfo.setLoadingRobotsTxt(false);
|
||||||
putBackURLs();
|
putBackURLs();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -373,12 +374,12 @@ public class RobotExclusionFilter extends Filter implements MessageListener
|
||||||
*/
|
*/
|
||||||
private void putBackURLs()
|
private void putBackURLs()
|
||||||
{
|
{
|
||||||
while (hostInfo.queuedRequests.size() > 0)
|
while (hostInfo.getQueueSize() > 0)
|
||||||
{
|
{
|
||||||
messageHandler.putMessage((Message) hostInfo.queuedRequests.remove());
|
messageHandler.putMessage((Message) hostInfo.removeFromQueue());
|
||||||
}
|
}
|
||||||
log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
|
log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
|
||||||
hostInfo.queuedRequests = null;
|
hostInfo.removeQueue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -61,6 +61,7 @@ import java.text.*;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import de.lanlab.larm.util.State;
|
import de.lanlab.larm.util.State;
|
||||||
import de.lanlab.larm.util.SimpleLoggerManager;
|
import de.lanlab.larm.util.SimpleLoggerManager;
|
||||||
|
import de.lanlab.larm.net.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this monitor takes a sample of every thread every x milliseconds,
|
* this monitor takes a sample of every thread every x milliseconds,
|
||||||
|
|
|
@ -1,66 +1,71 @@
|
||||||
/* ====================================================================
|
/*
|
||||||
* The Apache Software License, Version 1.1
|
* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
*
|
*
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions
|
* modification, are permitted provided that the following conditions
|
||||||
* are met:
|
* are met:
|
||||||
*
|
*
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
* notice, this list of conditions and the following disclaimer.
|
* notice, this list of conditions and the following disclaimer.
|
||||||
*
|
*
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
* notice, this list of conditions and the following disclaimer in
|
* notice, this list of conditions and the following disclaimer in
|
||||||
* the documentation and/or other materials provided with the
|
* the documentation and/or other materials provided with the
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* 3. The end-user documentation included with the redistribution,
|
* 3. The end-user documentation included with the redistribution,
|
||||||
* if any, must include the following acknowledgment:
|
* if any, must include the following acknowledgment:
|
||||||
* "This product includes software developed by the
|
* "This product includes software developed by the
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
* Alternately, this acknowledgment may appear in the software itself,
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Lucene" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
* SUCH DAMAGE.
|
* SUCH DAMAGE.
|
||||||
* ====================================================================
|
* ====================================================================
|
||||||
*
|
*
|
||||||
* This software consists of voluntary contributions made by many
|
* This software consists of voluntary contributions made by many
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
* information on the Apache Software Foundation, please see
|
* information on the Apache Software Foundation, please see
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package de.lanlab.larm.fetcher;
|
package de.lanlab.larm.fetcher;
|
||||||
|
|
||||||
import java.net.*;
|
import java.net.*;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import de.lanlab.larm.util.URLUtils;
|
import de.lanlab.larm.util.URLUtils;
|
||||||
|
import de.lanlab.larm.net.URLNormalizer;
|
||||||
|
import de.lanlab.larm.net.HostManager;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* represents a URL which is passed around in the messageHandler
|
* represents a URL which is passed around in the messageHandler
|
||||||
* @version $Id$
|
*
|
||||||
|
* @author Administrator
|
||||||
|
* @created 14. Juni 2002
|
||||||
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public class URLMessage implements Message, Serializable
|
public class URLMessage implements Message, Serializable
|
||||||
{
|
{
|
||||||
|
@ -68,14 +73,51 @@ public class URLMessage implements Message, Serializable
|
||||||
* the URL
|
* the URL
|
||||||
*/
|
*/
|
||||||
protected URL url;
|
protected URL url;
|
||||||
protected String urlString;
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* externalized form of the URL, cached to prevent multiple calls to url.toExternalForm()
|
||||||
|
*/
|
||||||
|
protected volatile String urlString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* referer or null
|
||||||
|
*/
|
||||||
protected URL referer;
|
protected URL referer;
|
||||||
protected String refererString;
|
|
||||||
|
/**
|
||||||
|
* externalized referer URL, to prevent multiple calls to url.toExternalForm()
|
||||||
|
*/
|
||||||
|
protected volatile String refererString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* normalized referer URL in external form, as produced by {@link de.lanlab.larm.net.URLNormalizer}; null if there is no referer
|
||||||
|
*/
|
||||||
|
protected volatile String refererNormalizedString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer}
|
||||||
|
* (lower case, index.* removed, all characters except alphanumeric ones escaped)
|
||||||
|
*/
|
||||||
|
protected String normalizedURLString;
|
||||||
|
|
||||||
|
|
||||||
boolean isFrame;
|
boolean isFrame;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* anchor text, as in <a href="...">Anchor</a>
|
||||||
|
*/
|
||||||
protected String anchor;
|
protected String anchor;
|
||||||
|
|
||||||
public URLMessage(URL url, URL referer, boolean isFrame, String anchor)
|
|
||||||
|
/**
|
||||||
|
* Constructor for the URLMessage object
|
||||||
|
*
|
||||||
|
* @param url the URL this message carries
|
||||||
|
* @param referer the referring URL, or null if there is none
|
||||||
|
* @param isFrame Description of the Parameter
|
||||||
|
* @param anchor anchor text of the link; null is stored as the empty string
|
||||||
|
*/
|
||||||
|
public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager)
|
||||||
{
|
{
|
||||||
//super();
|
//super();
|
||||||
this.url = url;
|
this.url = url;
|
||||||
|
@ -83,69 +125,144 @@ public class URLMessage implements Message, Serializable
|
||||||
|
|
||||||
this.referer = referer;
|
this.referer = referer;
|
||||||
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
|
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
|
||||||
|
this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null;
|
||||||
this.isFrame = isFrame;
|
this.isFrame = isFrame;
|
||||||
this.anchor = anchor != null ? anchor : "";
|
this.anchor = anchor != null ? anchor : "";
|
||||||
|
this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager));
|
||||||
|
//this.normalizedURLString = URLNormalizer.
|
||||||
//System.out.println("" + refererString + " -> " + urlString);
|
//System.out.println("" + refererString + " -> " + urlString);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getNormalizedURLString()
|
||||||
|
{
|
||||||
|
return this.normalizedURLString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the url attribute of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The url value
|
||||||
|
*/
|
||||||
public URL getUrl()
|
public URL getUrl()
|
||||||
{
|
{
|
||||||
return this.url;
|
return this.url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the referer attribute of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The referer value
|
||||||
|
*/
|
||||||
public URL getReferer()
|
public URL getReferer()
|
||||||
{
|
{
|
||||||
return this.referer;
|
return this.referer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the externalized URL string of this message
|
||||||
|
*
|
||||||
|
* @return the externalized URL string
|
||||||
|
*/
|
||||||
public String toString()
|
public String toString()
|
||||||
{
|
{
|
||||||
return urlString;
|
return urlString;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the externalized URL string of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The externalized URL string
|
||||||
|
*/
|
||||||
public String getURLString()
|
public String getURLString()
|
||||||
{
|
{
|
||||||
return urlString;
|
return urlString;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the refererString attribute of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The refererString value
|
||||||
|
*/
|
||||||
public String getRefererString()
|
public String getRefererString()
|
||||||
{
|
{
|
||||||
return refererString;
|
return refererString;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the anchor attribute of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The anchor value
|
||||||
|
*/
|
||||||
public String getAnchor()
|
public String getAnchor()
|
||||||
{
|
{
|
||||||
return anchor;
|
return anchor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the hash code of the underlying URL
|
||||||
|
*
|
||||||
|
* @return the value of url.hashCode()
|
||||||
|
*/
|
||||||
public int hashCode()
|
public int hashCode()
|
||||||
{
|
{
|
||||||
return url.hashCode();
|
return url.hashCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeObject(java.io.ObjectOutputStream out) throws IOException
|
|
||||||
|
/**
|
||||||
|
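* Serializes the URL, referer, frame flag, anchor text and the normalized referer and URL strings. NOTE(review): refererNormalizedString is null when there is no referer, and writeUTF does not accept null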
|
||||||
|
*
|
||||||
|
* @param out Description of the Parameter
|
||||||
|
* @exception IOException Description of the Exception
|
||||||
|
*/
|
||||||
|
private void writeObject(java.io.ObjectOutputStream out)
|
||||||
|
throws IOException
|
||||||
{
|
{
|
||||||
out.writeObject(url);
|
out.writeObject(url);
|
||||||
out.writeObject(referer);
|
out.writeObject(referer);
|
||||||
out.writeBoolean(isFrame);
|
out.writeBoolean(isFrame);
|
||||||
out.writeUTF(anchor);
|
out.writeUTF(anchor);
|
||||||
|
out.writeUTF(refererNormalizedString);
|
||||||
|
out.writeUTF(normalizedURLString);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException
|
|
||||||
|
/**
|
||||||
|
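* Restores the fields written by writeObject and rebuilds urlString and refererString. NOTE(review): referer.toExternalForm() is called without a null check although the constructor allows a null referer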
|
||||||
|
*
|
||||||
|
* @param in Description of the Parameter
|
||||||
|
* @exception IOException Description of the Exception
|
||||||
|
* @exception ClassNotFoundException Description of the Exception
|
||||||
|
*/
|
||||||
|
private void readObject(java.io.ObjectInputStream in)
|
||||||
|
throws IOException, ClassNotFoundException
|
||||||
{
|
{
|
||||||
url = (URL)in.readObject();
|
url = (URL) in.readObject();
|
||||||
referer = (URL)in.readObject();
|
referer = (URL) in.readObject();
|
||||||
urlString = url.toExternalForm();
|
urlString = url.toExternalForm();
|
||||||
refererString = referer.toExternalForm();
|
refererString = referer.toExternalForm();
|
||||||
isFrame = in.readBoolean();
|
isFrame = in.readBoolean();
|
||||||
anchor = in.readUTF();
|
anchor = in.readUTF();
|
||||||
|
refererNormalizedString = in.readUTF();
|
||||||
|
normalizedURLString = in.readUTF();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the info attribute of the URLMessage object
|
||||||
|
*
|
||||||
|
* @return The info value
|
||||||
|
*/
|
||||||
public String getInfo()
|
public String getInfo()
|
||||||
{
|
{
|
||||||
return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
|
return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -123,7 +123,7 @@ public class URLVisitedFilter extends Filter implements MessageListener
|
||||||
{
|
{
|
||||||
URLMessage urlMessage = ((URLMessage) message);
|
URLMessage urlMessage = ((URLMessage) message);
|
||||||
URL url = urlMessage.getUrl();
|
URL url = urlMessage.getUrl();
|
||||||
String urlString = urlMessage.getURLString();
|
String urlString = urlMessage.getNormalizedURLString();
|
||||||
if (urlHash.contains(urlString))
|
if (urlHash.contains(urlString))
|
||||||
{
|
{
|
||||||
//System.out.println("URLVisitedFilter: " + urlString + " already present.");
|
//System.out.println("URLVisitedFilter: " + urlString + " already present.");
|
||||||
|
|
Loading…
Reference in New Issue