mirror of https://github.com/apache/lucene.git
moved HostInfo/HostManager to larm.net package; added URLNormalizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150782 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5b90c10cb5
commit
8e18fa1cb0
|
@ -0,0 +1,298 @@
|
|||
/*
|
||||
* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
package de.lanlab.larm.net;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.net.*;
|
||||
import de.lanlab.larm.util.CachingQueue;
|
||||
import de.lanlab.larm.util.Queue;
|
||||
import java.util.LinkedList;
|
||||
import de.lanlab.larm.fetcher.Message;
|
||||
|
||||
/**
|
||||
* contains information about a host. If a host doesn't respond too often, it's
|
||||
* excluded from the crawl. This class is used by the HostManager
|
||||
*
|
||||
* @author Clemens Marschner
|
||||
* @created 16. Februar 2002
|
||||
* @version $Id$
|
||||
*/
|
||||
public class HostInfo
|
||||
{
|
||||
final static String[] emptyKeepOutDirectories = new String[0];
|
||||
|
||||
int id;
|
||||
|
||||
int healthyCount = 5;
|
||||
|
||||
// five strikes, and you're out
|
||||
boolean isReachable = true;
|
||||
|
||||
boolean robotTxtChecked = false;
|
||||
|
||||
String[] disallows;
|
||||
|
||||
// robot exclusion
|
||||
boolean isLoadingRobotsTxt = false;
|
||||
|
||||
Queue queuedRequests = null;
|
||||
|
||||
// robot exclusion
|
||||
String hostName;
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*/
|
||||
public void removeQueue()
|
||||
{
|
||||
queuedRequests = null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the id attribute of the HostInfo object
|
||||
*
|
||||
* @return The id value
|
||||
*/
|
||||
public int getId()
|
||||
{
|
||||
return id;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*
|
||||
* @param message Description of the Parameter
|
||||
*/
|
||||
public void insertIntoQueue(Message message)
|
||||
{
|
||||
queuedRequests.insert(message);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the hostName attribute of the HostInfo object
|
||||
*
|
||||
* @return The hostName value
|
||||
*/
|
||||
public String getHostName()
|
||||
{
|
||||
return hostName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the queueSize. No error checking is done when the queue is null
|
||||
*
|
||||
* @return The queueSize value
|
||||
*/
|
||||
public int getQueueSize()
|
||||
{
|
||||
return queuedRequests.size();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* gets last entry from queue. No error checking is done when the queue is null
|
||||
*
|
||||
* @return Description of the Return Value
|
||||
*/
|
||||
public Message removeFromQueue()
|
||||
{
|
||||
return (Message) queuedRequests.remove();
|
||||
}
|
||||
|
||||
|
||||
//LinkedList synonyms = new LinkedList();
|
||||
|
||||
/**
|
||||
* Constructor for the HostInfo object
|
||||
*
|
||||
* @param hostName Description of the Parameter
|
||||
* @param id Description of the Parameter
|
||||
*/
|
||||
public HostInfo(String hostName, int id)
|
||||
{
|
||||
this.id = id;
|
||||
this.disallows = HostInfo.emptyKeepOutDirectories;
|
||||
this.hostName = hostName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* is this host reachable and responding?
|
||||
*
|
||||
* @return The healthy value
|
||||
*/
|
||||
public boolean isHealthy()
|
||||
{
|
||||
return (healthyCount > 0) && isReachable;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* signals that the host returned with a bad request of whatever type
|
||||
*/
|
||||
public void badRequest()
|
||||
{
|
||||
healthyCount--;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the reachable attribute of the HostInfo object
|
||||
*
|
||||
* @param reachable The new reachable value
|
||||
*/
|
||||
public void setReachable(boolean reachable)
|
||||
{
|
||||
isReachable = reachable;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the reachable attribute of the HostInfo object
|
||||
*
|
||||
* @return The reachable value
|
||||
*/
|
||||
public boolean isReachable()
|
||||
{
|
||||
return isReachable;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the robotTxtChecked attribute of the HostInfo object
|
||||
*
|
||||
* @return The robotTxtChecked value
|
||||
*/
|
||||
public boolean isRobotTxtChecked()
|
||||
{
|
||||
return robotTxtChecked;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* must be synchronized externally
|
||||
*
|
||||
* @return The loadingRobotsTxt value
|
||||
*/
|
||||
public boolean isLoadingRobotsTxt()
|
||||
{
|
||||
return this.isLoadingRobotsTxt;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the loadingRobotsTxt attribute of the HostInfo object
|
||||
*
|
||||
* @param isLoading The new loadingRobotsTxt value
|
||||
*/
|
||||
public void setLoadingRobotsTxt(boolean isLoading)
|
||||
{
|
||||
this.isLoadingRobotsTxt = isLoading;
|
||||
if (isLoading)
|
||||
{
|
||||
this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the robotsChecked attribute of the HostInfo object
|
||||
*
|
||||
* @param isChecked The new robotsChecked value
|
||||
* @param disallows The new robotsChecked value
|
||||
*/
|
||||
public void setRobotsChecked(boolean isChecked, String[] disallows)
|
||||
{
|
||||
this.robotTxtChecked = isChecked;
|
||||
if (disallows != null)
|
||||
{
|
||||
this.disallows = disallows;
|
||||
}
|
||||
else
|
||||
{
|
||||
this.disallows = emptyKeepOutDirectories;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the allowed attribute of the HostInfo object
|
||||
*
|
||||
* @param path Description of the Parameter
|
||||
* @return The allowed value
|
||||
*/
|
||||
public synchronized boolean isAllowed(String path)
|
||||
{
|
||||
// assume keepOutDirectories is pretty short
|
||||
// assert disallows != null
|
||||
int length = disallows.length;
|
||||
for (int i = 0; i < length; i++)
|
||||
{
|
||||
if (path.startsWith(disallows[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
package de.lanlab.larm.net;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Description of the Class
|
||||
*
|
||||
* @author Administrator
|
||||
* @created 16. Februar 2002
|
||||
* @version $Id$
|
||||
*/
|
||||
public class HostManager
|
||||
{
|
||||
HashMap hosts;
|
||||
static int hostCount = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for the HostInfo object
|
||||
*
|
||||
* @param initialSize Description of the Parameter
|
||||
*/
|
||||
public HostManager(int initialCapacity)
|
||||
{
|
||||
hosts = new HashMap(initialCapacity);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*
|
||||
* @param hostName Description of the Parameter
|
||||
* @return Description of the Return Value
|
||||
*/
|
||||
public HostInfo put(String hostName)
|
||||
{
|
||||
if (!hosts.containsKey(hostName))
|
||||
{
|
||||
int hostID;
|
||||
synchronized (this)
|
||||
{
|
||||
hostID = hostCount++;
|
||||
}
|
||||
HostInfo hi = new HostInfo(hostName,hostID);
|
||||
hosts.put(hostName, hi);
|
||||
//System.out.println("hostManager: + " + hostName);
|
||||
if(!hostName.equals(hostName.toLowerCase()))
|
||||
{
|
||||
try
|
||||
{
|
||||
throw new Exception();
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
return (HostInfo)hosts.get(hostName);
|
||||
/*else
|
||||
{
|
||||
hostID = hosts.get()
|
||||
}
|
||||
// assert hostID != -1;
|
||||
return hostID;*/
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the hostID attribute of the HostInfo object
|
||||
*
|
||||
* @param hostName Description of the Parameter
|
||||
* @return The hostID value
|
||||
*/
|
||||
public HostInfo getHostInfo(String hostName)
|
||||
{
|
||||
HostInfo hi = (HostInfo)hosts.get(hostName);
|
||||
if(hi == null)
|
||||
{
|
||||
return put(hostName);
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
|
||||
public int getSize()
|
||||
{
|
||||
return hosts.size();
|
||||
}
|
||||
|
||||
public HostInfo addSynonym(String hostName, String synonym)
|
||||
{
|
||||
HostInfo info = getHostInfo(hostName);
|
||||
hosts.put(synonym, info);
|
||||
return info;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,425 @@
|
|||
package de.lanlab.larm.net;
|
||||
/*
|
||||
* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Class
|
||||
*
|
||||
* @author Administrator
|
||||
* @created 14. Juni 2002
|
||||
*/
|
||||
public class URLNormalizer
|
||||
{
|
||||
final static int NP_SLASH = 1;
|
||||
final static int NP_CHAR = 2;
|
||||
final static int NP_PERCENT = 3;
|
||||
final static int NP_POINT = 4;
|
||||
final static int NP_HEX = 5;
|
||||
|
||||
/**
|
||||
* contains hex codes for characters in lowercase uses char arrays instead
|
||||
* of strings for faster processing
|
||||
*/
|
||||
protected static char[][] charMap = {
|
||||
{'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'},
|
||||
{'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'},
|
||||
{'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
|
||||
{'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%', '3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3', 'F'},
|
||||
{'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
|
||||
{'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'},
|
||||
{'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
|
||||
{'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'},
|
||||
{'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8', '4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'}, {'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%', '8', 'F'},
|
||||
{'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9', '4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'}, {'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%', '9', 'F'},
|
||||
{'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A', '4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'}, {'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%', 'A', 'F'},
|
||||
{'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B', '4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'}, {'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%', 'B', 'F'},
|
||||
{'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
|
||||
{'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'D', 'F'},
|
||||
{'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
|
||||
{'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'F', 'F'},
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*
|
||||
* @param path Description of the Parameter
|
||||
* @return Description of the Return Value
|
||||
* @exception IOException Description of the Exception
|
||||
*/
|
||||
protected static String normalizePath(String path)
|
||||
throws IOException
|
||||
{
|
||||
// rule 1: if the path is empty, return "/"
|
||||
if (path.length() == 0)
|
||||
{
|
||||
return "/";
|
||||
}
|
||||
|
||||
// Finite State Machine to convert characters to lowercase, remove "//" and "/./"
|
||||
// and make sure that all characters are escaped in a uniform way, i.e.
|
||||
// {" ", "+", "%20"} -> "%20"
|
||||
|
||||
StringBuffer w = new StringBuffer((int) (path.length() * 1.5));
|
||||
|
||||
int status = NP_CHAR;
|
||||
|
||||
int pos = 0;
|
||||
int length = path.length();
|
||||
char savedChar = '?';
|
||||
int hexChar = '?';
|
||||
int pathPos = -1; // position of last "/"
|
||||
int questionPos = -1; // assert length >0
|
||||
boolean isInQuery = false; // question mark reached?
|
||||
|
||||
while (pos < length)
|
||||
{
|
||||
char c = path.charAt(pos++);
|
||||
try
|
||||
{
|
||||
switch (status)
|
||||
{
|
||||
case NP_SLASH:
|
||||
if (c == '/')
|
||||
{
|
||||
// ignore subsequent slashes
|
||||
}
|
||||
else if (c == '.')
|
||||
{
|
||||
status = NP_POINT;
|
||||
}
|
||||
else if (c == '%')
|
||||
{
|
||||
status = NP_PERCENT;
|
||||
}
|
||||
else
|
||||
{
|
||||
pos--;
|
||||
status = NP_CHAR;
|
||||
}
|
||||
break;
|
||||
case NP_POINT:
|
||||
if (c == '/')
|
||||
{
|
||||
// ignore
|
||||
}
|
||||
else if (c == '.')
|
||||
{
|
||||
// ignore; this shouldn't happen
|
||||
}
|
||||
else
|
||||
{
|
||||
w.append('.');
|
||||
pos--;
|
||||
status = NP_SLASH;
|
||||
}
|
||||
break;
|
||||
case NP_PERCENT:
|
||||
if (c >= '0' && c <= '9')
|
||||
{
|
||||
hexChar = (c - '0') << 4;
|
||||
}
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
{
|
||||
hexChar = (c - 'a' + 10) << 4;
|
||||
}
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
{
|
||||
hexChar = (c - 'A' + 10) << 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
w.append(charMap['%']);
|
||||
w.append(charMap[c]);
|
||||
break;
|
||||
}
|
||||
savedChar = c;
|
||||
status = NP_HEX;
|
||||
break;
|
||||
case NP_HEX:
|
||||
if (c >= '0' && c <= '9')
|
||||
{
|
||||
hexChar |= (c - '0');
|
||||
}
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
{
|
||||
hexChar |= (c - 'a' + 10);
|
||||
}
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
{
|
||||
hexChar |= (c - 'A' + 10);
|
||||
}
|
||||
else
|
||||
{
|
||||
w.append(charMap['%']);
|
||||
w.append(charMap[savedChar]);
|
||||
w.append(charMap[c]);
|
||||
break;
|
||||
}
|
||||
w.append(charMap[hexChar]);
|
||||
status = NP_CHAR;
|
||||
break;
|
||||
case NP_CHAR:
|
||||
switch (c)
|
||||
{
|
||||
case '%':
|
||||
status = NP_PERCENT;
|
||||
break;
|
||||
case '/':
|
||||
if(!isInQuery)
|
||||
{
|
||||
w.append(c);
|
||||
pathPos = w.length(); // points to the char. after "/"
|
||||
status = NP_SLASH;
|
||||
}
|
||||
else
|
||||
{
|
||||
w.append(charMap[c]);
|
||||
}
|
||||
break;
|
||||
case '?':
|
||||
if(!isInQuery)
|
||||
{
|
||||
if(pathPos == -1)
|
||||
{
|
||||
w.append('/');
|
||||
pathPos = w.length();
|
||||
}
|
||||
questionPos = w.length(); // points to the char at "?"
|
||||
isInQuery = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
w.append(charMap[c]);
|
||||
break;
|
||||
}
|
||||
case '&':
|
||||
case ';':
|
||||
case '@':
|
||||
//case ':':
|
||||
case '=':
|
||||
w.append(c);
|
||||
break;
|
||||
case '+':
|
||||
w.append("%20");
|
||||
break;
|
||||
default:
|
||||
w.append(charMap[c]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (ArrayIndexOutOfBoundsException e)
|
||||
{
|
||||
// we encountered a unicode character >= 0x00ff
|
||||
// write UTF-8 to distinguish it from other characters
|
||||
// note that this does NOT lead to a pure UTF-8 URL since we
|
||||
// write 0x80 <= c <= 0xff as one-byte strings
|
||||
/*
|
||||
* if (ch <= 0x007f) { // other ASCII
|
||||
* sbuf.append(hex[ch]);
|
||||
* } else
|
||||
*/
|
||||
// note that we ignore the case that we receive "%" + unicode + c
|
||||
// (status = NP_HEX + Exception when writing savedchar); in that case
|
||||
// only the second character is written. we consider this to be very
|
||||
// unlikely
|
||||
|
||||
// see http://www.w3.org/International/O-URL-code.html
|
||||
if (c <= 0x07FF)
|
||||
{
|
||||
// non-ASCII <= 0x7FF
|
||||
w.append(charMap[0xc0 | (c >> 6)]);
|
||||
w.append(charMap[0x80 | (c & 0x3F)]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// 0x7FF < c <= 0xFFFF
|
||||
w.append(charMap[0xe0 | (c >> 12)]);
|
||||
w.append(charMap[0x80 | ((c >> 6) & 0x3F)]);
|
||||
w.append(charMap[0x80 | (c & 0x3F)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// rule 3: delete index.* or default.*
|
||||
|
||||
if(questionPos == -1) // no query
|
||||
{
|
||||
questionPos = w.length();
|
||||
}
|
||||
else
|
||||
{
|
||||
if(questionPos == w.length()-1)
|
||||
{
|
||||
// empty query. assert questionPos > 0
|
||||
w.deleteCharAt(questionPos);
|
||||
}
|
||||
}
|
||||
if(pathPos == -1) // no query
|
||||
{
|
||||
pathPos = 0;
|
||||
}
|
||||
if(questionPos > pathPos)
|
||||
{
|
||||
String file = w.substring(pathPos, questionPos);
|
||||
{
|
||||
//System.out.println("file: " + file);
|
||||
if(file.startsWith("index.") || file.startsWith("default."))
|
||||
{
|
||||
w.delete(pathPos, questionPos); // delete default page to avoid ambiguities
|
||||
}
|
||||
}
|
||||
}
|
||||
return w.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*
|
||||
* @param host Description of the Parameter
|
||||
* @return Description of the Return Value
|
||||
*/
|
||||
protected static String normalizeHost(HostManager hostManager, String host)
|
||||
{
|
||||
return hostManager.getHostInfo(host.toLowerCase()).getHostName();
|
||||
}
|
||||
|
||||
/*
|
||||
HostManager hostManager;
|
||||
*/
|
||||
|
||||
/**
|
||||
* Constructor for the URLNormalizer object
|
||||
*
|
||||
* @param hostManager Description of the Parameter
|
||||
*/
|
||||
/* public URLNormalizer(HostManager hostManager)
|
||||
{
|
||||
this.hostManager = hostManager;
|
||||
}*/
|
||||
|
||||
|
||||
/**
|
||||
* Description of the Method
|
||||
*
|
||||
* @param u Description of the Parameter
|
||||
* @return Description of the Return Value
|
||||
* @exception IOException Description of the Exception
|
||||
* @exception MalformedURLException Description of the Exception
|
||||
*/
|
||||
public static URL normalize(URL u, HostManager hostManager)
|
||||
{
|
||||
if (u.getProtocol().equals("http"))
|
||||
{
|
||||
try
|
||||
{
|
||||
int port = u.getPort();
|
||||
/*URL url =*/
|
||||
return new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
|
||||
/*if(!u.equals(url))
|
||||
{
|
||||
System.out.println(u.toExternalForm() + " -> " + url.toExternalForm());
|
||||
}
|
||||
return url;*/
|
||||
}
|
||||
catch(MalformedURLException e)
|
||||
{
|
||||
System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||
throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||
}
|
||||
catch(IOException e)
|
||||
{
|
||||
System.out.println("assertion failed: IOException in URLNormalizer.normalize()");
|
||||
throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
|
||||
}
|
||||
|
||||
//return url
|
||||
}
|
||||
else
|
||||
{
|
||||
return u;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception
|
||||
{
|
||||
HostManager hm = new HostManager(10);
|
||||
hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm));
|
||||
System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm));
|
||||
URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
|
||||
System.out.println("host: " + u.getHost());
|
||||
System.out.println("port: " + u.getPort());
|
||||
System.out.println(URLNormalizer.normalize(u, hm));
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue