diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java
new file mode 100644
index 00000000000..c5b9ab82607
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java
@@ -0,0 +1,298 @@
+/*
+ * ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ */
+package de.lanlab.larm.net;
+
+import java.util.HashMap;
+import java.net.*;
+import de.lanlab.larm.util.CachingQueue;
+import de.lanlab.larm.util.Queue;
+import java.util.LinkedList;
+import de.lanlab.larm.fetcher.Message;
+
+/**
+ * contains information about a host. If a host doesn't respond too often, it's
+ * excluded from the crawl. This class is used by the HostManager
+ *
+ * @author Clemens Marschner
+ * @created 16. Februar 2002
+ * @version $Id$
+ */
+public class HostInfo
+{
+ final static String[] emptyKeepOutDirectories = new String[0];
+
+ int id;
+
+ int healthyCount = 5;
+
+ // five strikes, and you're out
+ boolean isReachable = true;
+
+ boolean robotTxtChecked = false;
+
+ String[] disallows;
+
+ // robot exclusion
+ boolean isLoadingRobotsTxt = false;
+
+ Queue queuedRequests = null;
+
+ // robot exclusion
+ String hostName;
+
+
+ /**
+ * Description of the Method
+ */
+ public void removeQueue()
+ {
+ queuedRequests = null;
+ }
+
+
+ /**
+ * Gets the id attribute of the HostInfo object
+ *
+ * @return The id value
+ */
+ public int getId()
+ {
+ return id;
+ }
+
+
+ /**
+ * Description of the Method
+ *
+ * @param message Description of the Parameter
+ */
+ public void insertIntoQueue(Message message)
+ {
+ queuedRequests.insert(message);
+ }
+
+
+ /**
+ * Gets the hostName attribute of the HostInfo object
+ *
+ * @return The hostName value
+ */
+ public String getHostName()
+ {
+ return hostName;
+ }
+
+
+ /**
+ * Gets the queueSize. No error checking is done when the queue is null
+ *
+ * @return The queueSize value
+ */
+ public int getQueueSize()
+ {
+ return queuedRequests.size();
+ }
+
+
+ /**
+ * gets last entry from queue. No error checking is done when the queue is null
+ *
+ * @return Description of the Return Value
+ */
+ public Message removeFromQueue()
+ {
+ return (Message) queuedRequests.remove();
+ }
+
+
+ //LinkedList synonyms = new LinkedList();
+
+ /**
+ * Constructor for the HostInfo object
+ *
+ * @param hostName Description of the Parameter
+ * @param id Description of the Parameter
+ */
+ public HostInfo(String hostName, int id)
+ {
+ this.id = id;
+ this.disallows = HostInfo.emptyKeepOutDirectories;
+ this.hostName = hostName;
+ }
+
+
+ /**
+ * is this host reachable and responding?
+ *
+ * @return The healthy value
+ */
+ public boolean isHealthy()
+ {
+ return (healthyCount > 0) && isReachable;
+ }
+
+
+ /**
+ * signals that the host returned with a bad request of whatever type
+ */
+ public void badRequest()
+ {
+ healthyCount--;
+ }
+
+
+ /**
+ * Sets the reachable attribute of the HostInfo object
+ *
+ * @param reachable The new reachable value
+ */
+ public void setReachable(boolean reachable)
+ {
+ isReachable = reachable;
+ }
+
+
+ /**
+ * Gets the reachable attribute of the HostInfo object
+ *
+ * @return The reachable value
+ */
+ public boolean isReachable()
+ {
+ return isReachable;
+ }
+
+
+ /**
+ * Gets the robotTxtChecked attribute of the HostInfo object
+ *
+ * @return The robotTxtChecked value
+ */
+ public boolean isRobotTxtChecked()
+ {
+ return robotTxtChecked;
+ }
+
+
+ /**
+ * must be synchronized externally
+ *
+ * @return The loadingRobotsTxt value
+ */
+ public boolean isLoadingRobotsTxt()
+ {
+ return this.isLoadingRobotsTxt;
+ }
+
+
+ /**
+ * Sets the loadingRobotsTxt attribute of the HostInfo object
+ *
+ * @param isLoading The new loadingRobotsTxt value
+ */
+ public void setLoadingRobotsTxt(boolean isLoading)
+ {
+ this.isLoadingRobotsTxt = isLoading;
+ if (isLoading)
+ {
+ this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
+ }
+
+ }
+
+
+ /**
+ * Sets the robotsChecked attribute of the HostInfo object
+ *
+ * @param isChecked The new robotsChecked value
+ * @param disallows The new robotsChecked value
+ */
+ public void setRobotsChecked(boolean isChecked, String[] disallows)
+ {
+ this.robotTxtChecked = isChecked;
+ if (disallows != null)
+ {
+ this.disallows = disallows;
+ }
+ else
+ {
+ this.disallows = emptyKeepOutDirectories;
+ }
+
+ }
+
+
+ /**
+ * Gets the allowed attribute of the HostInfo object
+ *
+ * @param path Description of the Parameter
+ * @return The allowed value
+ */
+ public synchronized boolean isAllowed(String path)
+ {
+ // assume keepOutDirectories is pretty short
+ // assert disallows != null
+ int length = disallows.length;
+ for (int i = 0; i < length; i++)
+ {
+ if (path.startsWith(disallows[i]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java
new file mode 100644
index 00000000000..2f6ace62ac5
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java
@@ -0,0 +1,154 @@
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ */
+
+package de.lanlab.larm.net;
+
+import java.util.HashMap;
+
+/**
+ * Description of the Class
+ *
+ * @author Administrator
+ * @created 16. Februar 2002
+ * @version $Id$
+ */
+public class HostManager
+{
+ HashMap hosts;
+ static int hostCount = 0;
+
+
+ /**
+ * Constructor for the HostInfo object
+ *
+ * @param initialSize Description of the Parameter
+ */
+ public HostManager(int initialCapacity)
+ {
+ hosts = new HashMap(initialCapacity);
+ }
+
+
+ /**
+ * Description of the Method
+ *
+ * @param hostName Description of the Parameter
+ * @return Description of the Return Value
+ */
+ public HostInfo put(String hostName)
+ {
+ if (!hosts.containsKey(hostName))
+ {
+ int hostID;
+ synchronized (this)
+ {
+ hostID = hostCount++;
+ }
+ HostInfo hi = new HostInfo(hostName,hostID);
+ hosts.put(hostName, hi);
+ //System.out.println("hostManager: + " + hostName);
+ if(!hostName.equals(hostName.toLowerCase()))
+ {
+ try
+ {
+ throw new Exception();
+ }
+ catch(Exception e)
+ {
+ e.printStackTrace();
+ }
+ }
+ return hi;
+ }
+ return (HostInfo)hosts.get(hostName);
+ /*else
+ {
+ hostID = hosts.get()
+ }
+ // assert hostID != -1;
+ return hostID;*/
+
+ }
+
+
+ /**
+ * Gets the hostID attribute of the HostInfo object
+ *
+ * @param hostName Description of the Parameter
+ * @return The hostID value
+ */
+ public HostInfo getHostInfo(String hostName)
+ {
+ HostInfo hi = (HostInfo)hosts.get(hostName);
+ if(hi == null)
+ {
+ return put(hostName);
+ }
+ return hi;
+ }
+
+ public int getSize()
+ {
+ return hosts.size();
+ }
+
+ public HostInfo addSynonym(String hostName, String synonym)
+ {
+ HostInfo info = getHostInfo(hostName);
+ hosts.put(synonym, info);
+ return info;
+ }
+
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java
new file mode 100644
index 00000000000..15dc50d5285
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java
@@ -0,0 +1,425 @@
+package de.lanlab.larm.net;
+/*
+ * ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ */
+import java.io.*;
+import java.net.*;
+
+
+/**
+ * Description of the Class
+ *
+ * @author Administrator
+ * @created 14. Juni 2002
+ */
+public class URLNormalizer
+{
+ final static int NP_SLASH = 1;
+ final static int NP_CHAR = 2;
+ final static int NP_PERCENT = 3;
+ final static int NP_POINT = 4;
+ final static int NP_HEX = 5;
+
+ /**
+ * contains hex codes for characters in lowercase uses char arrays instead
+ * of strings for faster processing
+ */
+ protected static char[][] charMap = {
+ {'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'},
+ {'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'},
+ {'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'},
+ {'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%', '3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3', 'F'},
+ {'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
+ {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'},
+ {'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'},
+ {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'},
+ {'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8', '4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'}, {'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%', '8', 'F'},
+ {'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9', '4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'}, {'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%', '9', 'F'},
+ {'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A', '4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'}, {'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%', 'A', 'F'},
+ {'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B', '4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'}, {'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%', 'B', 'F'},
+ {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
+ {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'D', 'F'},
+ {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'},
+ {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'F', 'F'},
+ };
+
+
+ /**
+ * Description of the Method
+ *
+ * @param path Description of the Parameter
+ * @return Description of the Return Value
+ * @exception IOException Description of the Exception
+ */
+ protected static String normalizePath(String path)
+ throws IOException
+ {
+ // rule 1: if the path is empty, return "/"
+ if (path.length() == 0)
+ {
+ return "/";
+ }
+
+ // Finite State Machine to convert characters to lowercase, remove "//" and "/./"
+ // and make sure that all characters are escaped in a uniform way, i.e.
+ // {" ", "+", "%20"} -> "%20"
+
+ StringBuffer w = new StringBuffer((int) (path.length() * 1.5));
+
+ int status = NP_CHAR;
+
+ int pos = 0;
+ int length = path.length();
+ char savedChar = '?';
+ int hexChar = '?';
+ int pathPos = -1; // position of last "/"
+ int questionPos = -1; // assert length >0
+ boolean isInQuery = false; // question mark reached?
+
+ while (pos < length)
+ {
+ char c = path.charAt(pos++);
+ try
+ {
+ switch (status)
+ {
+ case NP_SLASH:
+ if (c == '/')
+ {
+ // ignore subsequent slashes
+ }
+ else if (c == '.')
+ {
+ status = NP_POINT;
+ }
+ else if (c == '%')
+ {
+ status = NP_PERCENT;
+ }
+ else
+ {
+ pos--;
+ status = NP_CHAR;
+ }
+ break;
+ case NP_POINT:
+ if (c == '/')
+ {
+ // ignore
+ }
+ else if (c == '.')
+ {
+ // ignore; this shouldn't happen
+ }
+ else
+ {
+ w.append('.');
+ pos--;
+ status = NP_SLASH;
+ }
+ break;
+ case NP_PERCENT:
+ if (c >= '0' && c <= '9')
+ {
+ hexChar = (c - '0') << 4;
+ }
+ else if (c >= 'a' && c <= 'f')
+ {
+ hexChar = (c - 'a' + 10) << 4;
+ }
+ else if (c >= 'A' && c <= 'F')
+ {
+ hexChar = (c - 'A' + 10) << 4;
+ }
+ else
+ {
+ w.append(charMap['%']);
+ w.append(charMap[c]);
+ break;
+ }
+ savedChar = c;
+ status = NP_HEX;
+ break;
+ case NP_HEX:
+ if (c >= '0' && c <= '9')
+ {
+ hexChar |= (c - '0');
+ }
+ else if (c >= 'a' && c <= 'f')
+ {
+ hexChar |= (c - 'a' + 10);
+ }
+ else if (c >= 'A' && c <= 'F')
+ {
+ hexChar |= (c - 'A' + 10);
+ }
+ else
+ {
+ w.append(charMap['%']);
+ w.append(charMap[savedChar]);
+ w.append(charMap[c]);
+ break;
+ }
+ w.append(charMap[hexChar]);
+ status = NP_CHAR;
+ break;
+ case NP_CHAR:
+ switch (c)
+ {
+ case '%':
+ status = NP_PERCENT;
+ break;
+ case '/':
+ if(!isInQuery)
+ {
+ w.append(c);
+ pathPos = w.length(); // points to the char. after "/"
+ status = NP_SLASH;
+ }
+ else
+ {
+ w.append(charMap[c]);
+ }
+ break;
+ case '?':
+ if(!isInQuery)
+ {
+ if(pathPos == -1)
+ {
+ w.append('/');
+ pathPos = w.length();
+ }
+ questionPos = w.length(); // points to the char at "?"
+ isInQuery = true;
+ }
+ else
+ {
+ w.append(charMap[c]);
+ break;
+ }
+ case '&':
+ case ';':
+ case '@':
+ //case ':':
+ case '=':
+ w.append(c);
+ break;
+ case '+':
+ w.append("%20");
+ break;
+ default:
+ w.append(charMap[c]);
+ break;
+ }
+ }
+
+ }
+ catch (ArrayIndexOutOfBoundsException e)
+ {
+ // we encountered a unicode character >= 0x00ff
+ // write UTF-8 to distinguish it from other characters
+ // note that this does NOT lead to a pure UTF-8 URL since we
+ // write 0x80 <= c <= 0xff as one-byte strings
+ /*
+ * if (ch <= 0x007f) { // other ASCII
+ * sbuf.append(hex[ch]);
+ * } else
+ */
+ // note that we ignore the case that we receive "%" + unicode + c
+ // (status = NP_HEX + Exception when writing savedchar); in that case
+ // only the second character is written. we consider this to be very
+ // unlikely
+
+ // see http://www.w3.org/International/O-URL-code.html
+ if (c <= 0x07FF)
+ {
+ // non-ASCII <= 0x7FF
+ w.append(charMap[0xc0 | (c >> 6)]);
+ w.append(charMap[0x80 | (c & 0x3F)]);
+ }
+ else
+ {
+ // 0x7FF < c <= 0xFFFF
+ w.append(charMap[0xe0 | (c >> 12)]);
+ w.append(charMap[0x80 | ((c >> 6) & 0x3F)]);
+ w.append(charMap[0x80 | (c & 0x3F)]);
+ }
+ }
+ }
+
+ // rule 3: delete index.* or default.*
+
+ if(questionPos == -1) // no query
+ {
+ questionPos = w.length();
+ }
+ else
+ {
+ if(questionPos == w.length()-1)
+ {
+ // empty query. assert questionPos > 0
+ w.deleteCharAt(questionPos);
+ }
+ }
+ if(pathPos == -1) // no query
+ {
+ pathPos = 0;
+ }
+ if(questionPos > pathPos)
+ {
+ String file = w.substring(pathPos, questionPos);
+ {
+ //System.out.println("file: " + file);
+ if(file.startsWith("index.") || file.startsWith("default."))
+ {
+ w.delete(pathPos, questionPos); // delete default page to avoid ambiguities
+ }
+ }
+ }
+ return w.toString();
+ }
+
+
+ /**
+ * Description of the Method
+ *
+ * @param host Description of the Parameter
+ * @return Description of the Return Value
+ */
+ protected static String normalizeHost(HostManager hostManager, String host)
+ {
+ return hostManager.getHostInfo(host.toLowerCase()).getHostName();
+ }
+
+/*
+ HostManager hostManager;
+*/
+
+ /**
+ * Constructor for the URLNormalizer object
+ *
+ * @param hostManager Description of the Parameter
+ */
+ /* public URLNormalizer(HostManager hostManager)
+ {
+ this.hostManager = hostManager;
+ }*/
+
+
+ /**
+ * Description of the Method
+ *
+ * @param u Description of the Parameter
+ * @return Description of the Return Value
+ * @exception IOException Description of the Exception
+ * @exception MalformedURLException Description of the Exception
+ */
+ public static URL normalize(URL u, HostManager hostManager)
+ {
+ if (u.getProtocol().equals("http"))
+ {
+ try
+ {
+ int port = u.getPort();
+ /*URL url =*/
+ return new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile()));
+ /*if(!u.equals(url))
+ {
+ System.out.println(u.toExternalForm() + " -> " + url.toExternalForm());
+ }
+ return url;*/
+ }
+ catch(MalformedURLException e)
+ {
+ System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()");
+ throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
+ }
+ catch(IOException e)
+ {
+ System.out.println("assertion failed: IOException in URLNormalizer.normalize()");
+ throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()");
+ }
+
+ //return url
+ }
+ else
+ {
+ return u;
+ }
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ HostManager hm = new HostManager(10);
+ hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de");
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm));
+ System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm));
+ URL u = new URL("http://www.lmu.de/abcde$1?id=abc");
+ System.out.println("host: " + u.getHost());
+ System.out.println("port: " + u.getPort());
+ System.out.println(URLNormalizer.normalize(u, hm));
+
+
+
+ }
+}