mirror of https://github.com/apache/lucene.git
moved HostInfo/HostManager to larm.net package; added URLNormalizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5b90c10cb5
commit
8e18fa1cb0
|
@ -0,0 +1,298 @@
|
||||||
|
/*
|
||||||
|
* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
package de.lanlab.larm.net;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.net.*;
|
||||||
|
import de.lanlab.larm.util.CachingQueue;
|
||||||
|
import de.lanlab.larm.util.Queue;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import de.lanlab.larm.fetcher.Message;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* contains information about a host. If a host doesn't respond too often, it's
|
||||||
|
* excluded from the crawl. This class is used by the HostManager
|
||||||
|
*
|
||||||
|
* @author Clemens Marschner
|
||||||
|
* @created 16. Februar 2002
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class HostInfo
|
||||||
|
{
|
||||||
|
final static String[] emptyKeepOutDirectories = new String[0];
|
||||||
|
|
||||||
|
int id;
|
||||||
|
|
||||||
|
int healthyCount = 5;
|
||||||
|
|
||||||
|
// five strikes, and you're out
|
||||||
|
boolean isReachable = true;
|
||||||
|
|
||||||
|
boolean robotTxtChecked = false;
|
||||||
|
|
||||||
|
String[] disallows;
|
||||||
|
|
||||||
|
// robot exclusion
|
||||||
|
boolean isLoadingRobotsTxt = false;
|
||||||
|
|
||||||
|
Queue queuedRequests = null;
|
||||||
|
|
||||||
|
// robot exclusion
|
||||||
|
String hostName;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*/
|
||||||
|
public void removeQueue()
|
||||||
|
{
|
||||||
|
queuedRequests = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the id attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @return The id value
|
||||||
|
*/
|
||||||
|
public int getId()
|
||||||
|
{
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*
|
||||||
|
* @param message Description of the Parameter
|
||||||
|
*/
|
||||||
|
public void insertIntoQueue(Message message)
|
||||||
|
{
|
||||||
|
queuedRequests.insert(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the hostName attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @return The hostName value
|
||||||
|
*/
|
||||||
|
public String getHostName()
|
||||||
|
{
|
||||||
|
return hostName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the queueSize. No error checking is done when the queue is null
|
||||||
|
*
|
||||||
|
* @return The queueSize value
|
||||||
|
*/
|
||||||
|
public int getQueueSize()
|
||||||
|
{
|
||||||
|
return queuedRequests.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* gets last entry from queue. No error checking is done when the queue is null
|
||||||
|
*
|
||||||
|
* @return Description of the Return Value
|
||||||
|
*/
|
||||||
|
public Message removeFromQueue()
|
||||||
|
{
|
||||||
|
return (Message) queuedRequests.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//LinkedList synonyms = new LinkedList();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for the HostInfo object
|
||||||
|
*
|
||||||
|
* @param hostName Description of the Parameter
|
||||||
|
* @param id Description of the Parameter
|
||||||
|
*/
|
||||||
|
public HostInfo(String hostName, int id)
|
||||||
|
{
|
||||||
|
this.id = id;
|
||||||
|
this.disallows = HostInfo.emptyKeepOutDirectories;
|
||||||
|
this.hostName = hostName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* is this host reachable and responding?
|
||||||
|
*
|
||||||
|
* @return The healthy value
|
||||||
|
*/
|
||||||
|
public boolean isHealthy()
|
||||||
|
{
|
||||||
|
return (healthyCount > 0) && isReachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* signals that the host returned with a bad request of whatever type
|
||||||
|
*/
|
||||||
|
public void badRequest()
|
||||||
|
{
|
||||||
|
healthyCount--;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the reachable attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @param reachable The new reachable value
|
||||||
|
*/
|
||||||
|
public void setReachable(boolean reachable)
|
||||||
|
{
|
||||||
|
isReachable = reachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the reachable attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @return The reachable value
|
||||||
|
*/
|
||||||
|
public boolean isReachable()
|
||||||
|
{
|
||||||
|
return isReachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the robotTxtChecked attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @return The robotTxtChecked value
|
||||||
|
*/
|
||||||
|
public boolean isRobotTxtChecked()
|
||||||
|
{
|
||||||
|
return robotTxtChecked;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* must be synchronized externally
|
||||||
|
*
|
||||||
|
* @return The loadingRobotsTxt value
|
||||||
|
*/
|
||||||
|
public boolean isLoadingRobotsTxt()
|
||||||
|
{
|
||||||
|
return this.isLoadingRobotsTxt;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the loadingRobotsTxt attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @param isLoading The new loadingRobotsTxt value
|
||||||
|
*/
|
||||||
|
public void setLoadingRobotsTxt(boolean isLoading)
|
||||||
|
{
|
||||||
|
this.isLoadingRobotsTxt = isLoading;
|
||||||
|
if (isLoading)
|
||||||
|
{
|
||||||
|
this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the robotsChecked attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @param isChecked The new robotsChecked value
|
||||||
|
* @param disallows The new robotsChecked value
|
||||||
|
*/
|
||||||
|
public void setRobotsChecked(boolean isChecked, String[] disallows)
|
||||||
|
{
|
||||||
|
this.robotTxtChecked = isChecked;
|
||||||
|
if (disallows != null)
|
||||||
|
{
|
||||||
|
this.disallows = disallows;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this.disallows = emptyKeepOutDirectories;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the allowed attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @param path Description of the Parameter
|
||||||
|
* @return The allowed value
|
||||||
|
*/
|
||||||
|
public synchronized boolean isAllowed(String path)
|
||||||
|
{
|
||||||
|
// assume keepOutDirectories is pretty short
|
||||||
|
// assert disallows != null
|
||||||
|
int length = disallows.length;
|
||||||
|
for (int i = 0; i < length; i++)
|
||||||
|
{
|
||||||
|
if (path.startsWith(disallows[i]))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,154 @@
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package de.lanlab.larm.net;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Class
|
||||||
|
*
|
||||||
|
* @author Administrator
|
||||||
|
* @created 16. Februar 2002
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class HostManager
|
||||||
|
{
|
||||||
|
HashMap hosts;
|
||||||
|
static int hostCount = 0;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for the HostInfo object
|
||||||
|
*
|
||||||
|
* @param initialSize Description of the Parameter
|
||||||
|
*/
|
||||||
|
public HostManager(int initialCapacity)
|
||||||
|
{
|
||||||
|
hosts = new HashMap(initialCapacity);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*
|
||||||
|
* @param hostName Description of the Parameter
|
||||||
|
* @return Description of the Return Value
|
||||||
|
*/
|
||||||
|
public HostInfo put(String hostName)
|
||||||
|
{
|
||||||
|
if (!hosts.containsKey(hostName))
|
||||||
|
{
|
||||||
|
int hostID;
|
||||||
|
synchronized (this)
|
||||||
|
{
|
||||||
|
hostID = hostCount++;
|
||||||
|
}
|
||||||
|
HostInfo hi = new HostInfo(hostName,hostID);
|
||||||
|
hosts.put(hostName, hi);
|
||||||
|
//System.out.println("hostManager: + " + hostName);
|
||||||
|
if(!hostName.equals(hostName.toLowerCase()))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
throw new Exception();
|
||||||
|
}
|
||||||
|
catch(Exception e)
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return hi;
|
||||||
|
}
|
||||||
|
return (HostInfo)hosts.get(hostName);
|
||||||
|
/*else
|
||||||
|
{
|
||||||
|
hostID = hosts.get()
|
||||||
|
}
|
||||||
|
// assert hostID != -1;
|
||||||
|
return hostID;*/
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the hostID attribute of the HostInfo object
|
||||||
|
*
|
||||||
|
* @param hostName Description of the Parameter
|
||||||
|
* @return The hostID value
|
||||||
|
*/
|
||||||
|
public HostInfo getHostInfo(String hostName)
|
||||||
|
{
|
||||||
|
HostInfo hi = (HostInfo)hosts.get(hostName);
|
||||||
|
if(hi == null)
|
||||||
|
{
|
||||||
|
return put(hostName);
|
||||||
|
}
|
||||||
|
return hi;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSize()
|
||||||
|
{
|
||||||
|
return hosts.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public HostInfo addSynonym(String hostName, String synonym)
|
||||||
|
{
|
||||||
|
HostInfo info = getHostInfo(hostName);
|
||||||
|
hosts.put(synonym, info);
|
||||||
|
return info;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,425 @@
|
||||||
|
package de.lanlab.larm.net;
|
||||||
|
/*
|
||||||
|
* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
import java.io.*;
|
||||||
|
import java.net.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Class
|
||||||
|
*
|
||||||
|
* @author Administrator
|
||||||
|
* @created 14. Juni 2002
|
||||||
|
*/
|
||||||
|
public class URLNormalizer
|
||||||
|
{
|
||||||
|
final static int NP_SLASH = 1;
|
||||||
|
final static int NP_CHAR = 2;
|
||||||
|
final static int NP_PERCENT = 3;
|
||||||
|
final static int NP_POINT = 4;
|
||||||
|
final static int NP_HEX = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* contains hex codes for characters in lowercase uses char arrays instead
|
||||||
|
* of strings for faster processing
|
||||||
|
*/
|
||||||
|
protected static char[][] charMap = {
|
||||||
|
{'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'},
|
||||||
|
{'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'},
|
||||||
|
{'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
|
||||||
|
{'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%', '3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3', 'F'},
|
||||||
|
{'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
|
||||||
|
{'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'},
|
||||||
|
{'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
|
||||||
|
{'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'},
|
||||||
|
{'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8', '4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'}, {'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%', '8', 'F'},
|
||||||
|
{'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9', '4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'}, {'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%', '9', 'F'},
|
||||||
|
{'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A', '4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'}, {'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%', 'A', 'F'},
|
||||||
|
{'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B', '4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'}, {'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%', 'B', 'F'},
|
||||||
|
{'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
|
||||||
|
{'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'D', 'F'},
|
||||||
|
{'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
|
||||||
|
{'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'F', 'F'},
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*
|
||||||
|
* @param path Description of the Parameter
|
||||||
|
* @return Description of the Return Value
|
||||||
|
* @exception IOException Description of the Exception
|
||||||
|
*/
|
||||||
|
protected static String normalizePath(String path)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
// rule 1: if the path is empty, return "/"
|
||||||
|
if (path.length() == 0)
|
||||||
|
{
|
||||||
|
return "/";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finite State Machine to convert characters to lowercase, remove "//" and "/./"
|
||||||
|
// and make sure that all characters are escaped in a uniform way, i.e.
|
||||||
|
// {" ", "+", "%20"} -> "%20"
|
||||||
|
|
||||||
|
StringBuffer w = new StringBuffer((int) (path.length() * 1.5));
|
||||||
|
|
||||||
|
int status = NP_CHAR;
|
||||||
|
|
||||||
|
int pos = 0;
|
||||||
|
int length = path.length();
|
||||||
|
char savedChar = '?';
|
||||||
|
int hexChar = '?';
|
||||||
|
int pathPos = -1; // position of last "/"
|
||||||
|
int questionPos = -1; // assert length >0
|
||||||
|
boolean isInQuery = false; // question mark reached?
|
||||||
|
|
||||||
|
while (pos < length)
|
||||||
|
{
|
||||||
|
char c = path.charAt(pos++);
|
||||||
|
try
|
||||||
|
{
|
||||||
|
switch (status)
|
||||||
|
{
|
||||||
|
case NP_SLASH:
|
||||||
|
if (c == '/')
|
||||||
|
{
|
||||||
|
// ignore subsequent slashes
|
||||||
|
}
|
||||||
|
else if (c == '.')
|
||||||
|
{
|
||||||
|
status = NP_POINT;
|
||||||
|
}
|
||||||
|
else if (c == '%')
|
||||||
|
{
|
||||||
|
status = NP_PERCENT;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pos--;
|
||||||
|
status = NP_CHAR;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NP_POINT:
|
||||||
|
if (c == '/')
|
||||||
|
{
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
else if (c == '.')
|
||||||
|
{
|
||||||
|
// ignore; this shouldn't happen
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w.append('.');
|
||||||
|
pos--;
|
||||||
|
status = NP_SLASH;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case NP_PERCENT:
|
||||||
|
if (c >= '0' && c <= '9')
|
||||||
|
{
|
||||||
|
hexChar = (c - '0') << 4;
|
||||||
|
}
|
||||||
|
else if (c >= 'a' && c <= 'f')
|
||||||
|
{
|
||||||
|
hexChar = (c - 'a' + 10) << 4;
|
||||||
|
}
|
||||||
|
else if (c >= 'A' && c <= 'F')
|
||||||
|
{
|
||||||
|
hexChar = (c - 'A' + 10) << 4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w.append(charMap['%']);
|
||||||
|
w.append(charMap[c]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
savedChar = c;
|
||||||
|
status = NP_HEX;
|
||||||
|
break;
|
||||||
|
case NP_HEX:
|
||||||
|
if (c >= '0' && c <= '9')
|
||||||
|
{
|
||||||
|
hexChar |= (c - '0');
|
||||||
|
}
|
||||||
|
else if (c >= 'a' && c <= 'f')
|
||||||
|
{
|
||||||
|
hexChar |= (c - 'a' + 10);
|
||||||
|
}
|
||||||
|
else if (c >= 'A' && c <= 'F')
|
||||||
|
{
|
||||||
|
hexChar |= (c - 'A' + 10);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w.append(charMap['%']);
|
||||||
|
w.append(charMap[savedChar]);
|
||||||
|
w.append(charMap[c]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
w.append(charMap[hexChar]);
|
||||||
|
status = NP_CHAR;
|
||||||
|
break;
|
||||||
|
case NP_CHAR:
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case '%':
|
||||||
|
status = NP_PERCENT;
|
||||||
|
break;
|
||||||
|
case '/':
|
||||||
|
if(!isInQuery)
|
||||||
|
{
|
||||||
|
w.append(c);
|
||||||
|
pathPos = w.length(); // points to the char. after "/"
|
||||||
|
status = NP_SLASH;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w.append(charMap[c]);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case '?':
|
||||||
|
if(!isInQuery)
|
||||||
|
{
|
||||||
|
if(pathPos == -1)
|
||||||
|
{
|
||||||
|
w.append('/');
|
||||||
|
pathPos = w.length();
|
||||||
|
}
|
||||||
|
questionPos = w.length(); // points to the char at "?"
|
||||||
|
isInQuery = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w.append(charMap[c]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '&':
|
||||||
|
case ';':
|
||||||
|
case '@':
|
||||||
|
//case ':':
|
||||||
|
case '=':
|
||||||
|
w.append(c);
|
||||||
|
break;
|
||||||
|
case '+':
|
||||||
|
w.append("%20");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
w.append(charMap[c]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
catch (ArrayIndexOutOfBoundsException e)
|
||||||
|
{
|
||||||
|
// we encountered a unicode character >= 0x00ff
|
||||||
|
// write UTF-8 to distinguish it from other characters
|
||||||
|
// note that this does NOT lead to a pure UTF-8 URL since we
|
||||||
|
// write 0x80 <= c <= 0xff as one-byte strings
|
||||||
|
/*
|
||||||
|
* if (ch <= 0x007f) { // other ASCII
|
||||||
|
* sbuf.append(hex[ch]);
|
||||||
|
* } else
|
||||||
|
*/
|
||||||
|
// note that we ignore the case that we receive "%" + unicode + c
|
||||||
|
// (status = NP_HEX + Exception when writing savedchar); in that case
|
||||||
|
// only the second character is written. we consider this to be very
|
||||||
|
// unlikely
|
||||||
|
|
||||||
|
// see http://www.w3.org/International/O-URL-code.html
|
||||||
|
if (c <= 0x07FF)
|
||||||
|
{
|
||||||
|
// non-ASCII <= 0x7FF
|
||||||
|
w.append(charMap[0xc0 | (c >> 6)]);
|
||||||
|
w.append(charMap[0x80 | (c & 0x3F)]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// 0x7FF < c <= 0xFFFF
|
||||||
|
w.append(charMap[0xe0 | (c >> 12)]);
|
||||||
|
w.append(charMap[0x80 | ((c >> 6) & 0x3F)]);
|
||||||
|
w.append(charMap[0x80 | (c & 0x3F)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// rule 3: delete index.* or default.*
|
||||||
|
|
||||||
|
if(questionPos == -1) // no query
|
||||||
|
{
|
||||||
|
questionPos = w.length();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if(questionPos == w.length()-1)
|
||||||
|
{
|
||||||
|
// empty query. assert questionPos > 0
|
||||||
|
w.deleteCharAt(questionPos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(pathPos == -1) // no query
|
||||||
|
{
|
||||||
|
pathPos = 0;
|
||||||
|
}
|
||||||
|
if(questionPos > pathPos)
|
||||||
|
{
|
||||||
|
String file = w.substring(pathPos, questionPos);
|
||||||
|
{
|
||||||
|
//System.out.println("file: " + file);
|
||||||
|
if(file.startsWith("index.") || file.startsWith("default."))
|
||||||
|
{
|
||||||
|
w.delete(pathPos, questionPos); // delete default page to avoid ambiguities
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*
|
||||||
|
* @param host Description of the Parameter
|
||||||
|
* @return Description of the Return Value
|
||||||
|
*/
|
||||||
|
protected static String normalizeHost(HostManager hostManager, String host)
|
||||||
|
{
|
||||||
|
return hostManager.getHostInfo(host.toLowerCase()).getHostName();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
HostManager hostManager;
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for the URLNormalizer object
|
||||||
|
*
|
||||||
|
* @param hostManager Description of the Parameter
|
||||||
|
*/
|
||||||
|
/* public URLNormalizer(HostManager hostManager)
|
||||||
|
{
|
||||||
|
this.hostManager = hostManager;
|
||||||
|
}*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Description of the Method
|
||||||
|
*
|
||||||
|
* @param u Description of the Parameter
|
||||||
|
* @return Description of the Return Value
|
||||||
|
* @exception IOException Description of the Exception
|
||||||
|
* @exception MalformedURLException Description of the Exception
|
||||||
|
*/
|
||||||
|
public static URL normalize(URL u, HostManager hostManager)
|
||||||
|
{
|
||||||
|
if (u.getProtocol().equals("http"))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
int port = u.getPort();
|
||||||
|
/*URL url =*/
|
||||||
|
return new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
|
||||||
|
/*if(!u.equals(url))
|
||||||
|
{
|
||||||
|
System.out.println(u.toExternalForm() + " -> " + url.toExternalForm());
|
||||||
|
}
|
||||||
|
return url;*/
|
||||||
|
}
|
||||||
|
catch(MalformedURLException e)
|
||||||
|
{
|
||||||
|
System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||||
|
throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||||
|
}
|
||||||
|
catch(IOException e)
|
||||||
|
{
|
||||||
|
System.out.println("assertion failed: IOException in URLNormalizer.normalize()");
|
||||||
|
throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||||
|
}
|
||||||
|
|
||||||
|
//return url
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return u;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception
|
||||||
|
{
|
||||||
|
HostManager hm = new HostManager(10);
|
||||||
|
hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm));
|
||||||
|
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm));
|
||||||
|
URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
|
||||||
|
System.out.println("host: " + u.getHost());
|
||||||
|
System.out.println("port: " + u.getPort());
|
||||||
|
System.out.println(URLNormalizer.normalize(u, hm));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue