diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java new file mode 100644 index 00000000000..c5b9ab82607 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostInfo.java @@ -0,0 +1,298 @@ +/* + * ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ +package de.lanlab.larm.net; + +import java.util.HashMap; +import java.net.*; +import de.lanlab.larm.util.CachingQueue; +import de.lanlab.larm.util.Queue; +import java.util.LinkedList; +import de.lanlab.larm.fetcher.Message; + +/** + * contains information about a host. If a host doesn't respond too often, it's + * excluded from the crawl. This class is used by the HostManager + * + * @author Clemens Marschner + * @created 16. Februar 2002 + * @version $Id$ + */ +public class HostInfo +{ + final static String[] emptyKeepOutDirectories = new String[0]; + + int id; + + int healthyCount = 5; + + // five strikes, and you're out + boolean isReachable = true; + + boolean robotTxtChecked = false; + + String[] disallows; + + // robot exclusion + boolean isLoadingRobotsTxt = false; + + Queue queuedRequests = null; + + // robot exclusion + String hostName; + + + /** + * Description of the Method + */ + public void removeQueue() + { + queuedRequests = null; + } + + + /** + * Gets the id attribute of the HostInfo object + * + * @return The id value + */ + public int getId() + { + return id; + } + + + /** + * Description of the Method + * + * @param message Description of the Parameter + */ + public void insertIntoQueue(Message message) + { + queuedRequests.insert(message); + } + + + /** + * Gets the hostName attribute of the HostInfo object + * + * @return The hostName value + */ + public String getHostName() + { + return hostName; + } + + + /** + * Gets the queueSize. No error checking is done when the queue is null + * + * @return The queueSize value + */ + public int getQueueSize() + { + return queuedRequests.size(); + } + + + /** + * gets last entry from queue. No error checking is done when the queue is null + * + * @return Description of the Return Value + */ + public Message removeFromQueue() + { + return (Message) queuedRequests.remove(); + } + + + //LinkedList synonyms = new LinkedList(); + + /** + * Constructor for the HostInfo object + * + * @param hostName Description of the Parameter + * @param id Description of the Parameter + */ + public HostInfo(String hostName, int id) + { + this.id = id; + this.disallows = HostInfo.emptyKeepOutDirectories; + this.hostName = hostName; + } + + + /** + * is this host reachable and responding? + * + * @return The healthy value + */ + public boolean isHealthy() + { + return (healthyCount > 0) && isReachable; + } + + + /** + * signals that the host returned with a bad request of whatever type + */ + public void badRequest() + { + healthyCount--; + } + + + /** + * Sets the reachable attribute of the HostInfo object + * + * @param reachable The new reachable value + */ + public void setReachable(boolean reachable) + { + isReachable = reachable; + } + + + /** + * Gets the reachable attribute of the HostInfo object + * + * @return The reachable value + */ + public boolean isReachable() + { + return isReachable; + } + + + /** + * Gets the robotTxtChecked attribute of the HostInfo object + * + * @return The robotTxtChecked value + */ + public boolean isRobotTxtChecked() + { + return robotTxtChecked; + } + + + /** + * must be synchronized externally + * + * @return The loadingRobotsTxt value + */ + public boolean isLoadingRobotsTxt() + { + return this.isLoadingRobotsTxt; + } + + + /** + * Sets the loadingRobotsTxt attribute of the HostInfo object + * + * @param isLoading The new loadingRobotsTxt value + */ + public void setLoadingRobotsTxt(boolean isLoading) + { + this.isLoadingRobotsTxt = isLoading; + if (isLoading) + { + this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100); + } + + } + + + /** + * Sets the robotsChecked attribute of the HostInfo object + * + * @param isChecked The new robotsChecked value + * @param disallows The new robotsChecked value + */ + public void setRobotsChecked(boolean isChecked, String[] disallows) + { + this.robotTxtChecked = isChecked; + if (disallows != null) + { + this.disallows = disallows; + } + else + { + this.disallows = emptyKeepOutDirectories; + } + + } + + + /** + * Gets the allowed attribute of the HostInfo object + * + * @param path Description of the Parameter + * @return The allowed value + */ + public synchronized boolean isAllowed(String path) + { + // assume keepOutDirectories is pretty short + // assert disallows != null + int length = disallows.length; + for (int i = 0; i < length; i++) + { + if (path.startsWith(disallows[i])) + { + return false; + } + } + return true; + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java new file mode 100644 index 00000000000..2f6ace62ac5 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HostManager.java @@ -0,0 +1,154 @@ +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +package de.lanlab.larm.net; + +import java.util.HashMap; + +/** + * Description of the Class + * + * @author Administrator + * @created 16. Februar 2002 + * @version $Id$ + */ +public class HostManager +{ + HashMap hosts; + static int hostCount = 0; + + + /** + * Constructor for the HostInfo object + * + * @param initialSize Description of the Parameter + */ + public HostManager(int initialCapacity) + { + hosts = new HashMap(initialCapacity); + } + + + /** + * Description of the Method + * + * @param hostName Description of the Parameter + * @return Description of the Return Value + */ + public HostInfo put(String hostName) + { + if (!hosts.containsKey(hostName)) + { + int hostID; + synchronized (this) + { + hostID = hostCount++; + } + HostInfo hi = new HostInfo(hostName,hostID); + hosts.put(hostName, hi); + //System.out.println("hostManager: + " + hostName); + if(!hostName.equals(hostName.toLowerCase())) + { + try + { + throw new Exception(); + } + catch(Exception e) + { + e.printStackTrace(); + } + } + return hi; + } + return (HostInfo)hosts.get(hostName); + /*else + { + hostID = hosts.get() + } + // assert hostID != -1; + return hostID;*/ + + } + + + /** + * Gets the hostID attribute of the HostInfo object + * + * @param hostName Description of the Parameter + * @return The hostID value + */ + public HostInfo getHostInfo(String hostName) + { + HostInfo hi = (HostInfo)hosts.get(hostName); + if(hi == null) + { + return put(hostName); + } + return hi; + } + + public int getSize() + { + return hosts.size(); + } + + public HostInfo addSynonym(String hostName, String synonym) + { + HostInfo info = getHostInfo(hostName); + hosts.put(synonym, info); + return info; + } + + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java new file mode 100644 index 00000000000..15dc50d5285 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/URLNormalizer.java @@ -0,0 +1,425 @@ +package de.lanlab.larm.net; +/* + * ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ +import java.io.*; +import java.net.*; + + +/** + * Description of the Class + * + * @author Administrator + * @created 14. Juni 2002 + */ +public class URLNormalizer +{ + final static int NP_SLASH = 1; + final static int NP_CHAR = 2; + final static int NP_PERCENT = 3; + final static int NP_POINT = 4; + final static int NP_HEX = 5; + + /** + * contains hex codes for characters in lowercase uses char arrays instead + * of strings for faster processing + */ + protected static char[][] charMap = { + {'%', '0', '0'}, {'%', '0', '1'}, {'%', '0', '2'}, {'%', '0', '3'}, {'%', '0', '4'}, {'%', '0', '5'}, {'%', '0', '6'}, {'%', '0', '7'}, {'%', '0', '8'}, {'%', '0', '9'}, {'%', '0', 'A'}, {'%', '0', 'B'}, {'%', '0', 'C'}, {'%', '0', 'D'}, {'%', '0', 'E'}, {'%', '0', 'F'}, + {'%', '1', '0'}, {'%', '1', '1'}, {'%', '1', '2'}, {'%', '1', '3'}, {'%', '1', '4'}, {'%', '1', '5'}, {'%', '1', '6'}, {'%', '1', '7'}, {'%', '1', '8'}, {'%', '1', '9'}, {'%', '1', 'A'}, {'%', '1', 'B'}, {'%', '1', 'C'}, {'%', '1', 'D'}, {'%', '1', 'E'}, {'%', '1', 'F'}, + {'%', '2', '0'}, {'%', '2', '1'}, {'%', '2', '2'}, {'%', '2', '3'}, {'$'}, {'%', '2', '5'}, {'%', '2', '6'}, {'%', '2', '7'}, {'%', '2', '8'}, {'%', '2', '9'}, {'%', '2', 'A'}, {'%', '2', 'B'}, {'%', '2', 'C'}, {'-'}, {'.'}, {'%', '2', 'F'}, + {'0'}, {'1'}, {'2'}, {'3'}, {'4'}, {'5'}, {'6'}, {'7'}, {'8'}, {'9'}, {'%', '3', 'A'}, {'%', '3', 'B'}, {'%', '3', 'C'}, {'%', '3', 'D'}, {'%', '3', 'E'}, {'%', '3', 'F'}, + {'%', '4', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'}, + {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '5', 'B'}, {'%', '5', 'C'}, {'%', '5', 'D'}, {'%', '5', 'E'}, {'_'}, + {'%', '6', '0'}, {'a'}, {'b'}, {'c'}, {'d'}, {'e'}, {'f'}, {'g'}, {'h'}, {'i'}, {'j'}, {'k'}, {'l'}, {'m'}, {'n'}, {'o'}, + {'p'}, {'q'}, {'r'}, {'s'}, {'t'}, {'u'}, {'v'}, {'w'}, {'x'}, {'y'}, {'z'}, {'%', '7', 'B'}, {'%', '7', 'C'}, {'%', '7', 'D'}, {'%', '7', 'E'}, {'%', '7', 'F'}, + {'%', '8', '0'}, {'%', '8', '1'}, {'%', '8', '2'}, {'%', '8', '3'}, {'%', '8', '4'}, {'%', '8', '5'}, {'%', '8', '6'}, {'%', '8', '7'}, {'%', '8', '8'}, {'%', '8', '9'}, {'%', '8', 'A'}, {'%', '8', 'B'}, {'%', '8', 'C'}, {'%', '8', 'D'}, {'%', '8', 'E'}, {'%', '8', 'F'}, + {'%', '9', '0'}, {'%', '9', '1'}, {'%', '9', '2'}, {'%', '9', '3'}, {'%', '9', '4'}, {'%', '9', '5'}, {'%', '9', '6'}, {'%', '9', '7'}, {'%', '9', '8'}, {'%', '9', '9'}, {'%', '9', 'A'}, {'%', '9', 'B'}, {'%', '9', 'C'}, {'%', '9', 'D'}, {'%', '9', 'E'}, {'%', '9', 'F'}, + {'%', 'A', '0'}, {'%', 'A', '1'}, {'%', 'A', '2'}, {'%', 'A', '3'}, {'%', 'A', '4'}, {'%', 'A', '5'}, {'%', 'A', '6'}, {'%', 'A', '7'}, {'%', 'A', '8'}, {'%', 'A', '9'}, {'%', 'A', 'A'}, {'%', 'A', 'B'}, {'%', 'A', 'C'}, {'%', 'A', 'D'}, {'%', 'A', 'E'}, {'%', 'A', 'F'}, + {'%', 'B', '0'}, {'%', 'B', '1'}, {'%', 'B', '2'}, {'%', 'B', '3'}, {'%', 'B', '4'}, {'%', 'B', '5'}, {'%', 'B', '6'}, {'%', 'B', '7'}, {'%', 'B', '8'}, {'%', 'B', '9'}, {'%', 'B', 'A'}, {'%', 'B', 'B'}, {'%', 'B', 'C'}, {'%', 'B', 'D'}, {'%', 'B', 'E'}, {'%', 'B', 'F'}, + {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'}, + {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'D', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'D', 'F'}, + {'%', 'E', '0'}, {'%', 'E', '1'}, {'%', 'E', '2'}, {'%', 'E', '3'}, {'%', 'E', '4'}, {'%', 'E', '5'}, {'%', 'E', '6'}, {'%', 'E', '7'}, {'%', 'E', '8'}, {'%', 'E', '9'}, {'%', 'E', 'A'}, {'%', 'E', 'B'}, {'%', 'E', 'C'}, {'%', 'E', 'D'}, {'%', 'E', 'E'}, {'%', 'E', 'F'}, + {'%', 'F', '0'}, {'%', 'F', '1'}, {'%', 'F', '2'}, {'%', 'F', '3'}, {'%', 'F', '4'}, {'%', 'F', '5'}, {'%', 'F', '6'}, {'%', 'F', '7'}, {'%', 'F', '8'}, {'%', 'F', '9'}, {'%', 'F', 'A'}, {'%', 'F', 'B'}, {'%', 'F', 'C'}, {'%', 'F', 'D'}, {'%', 'F', 'E'}, {'%', 'F', 'F'}, + }; + + + /** + * Description of the Method + * + * @param path Description of the Parameter + * @return Description of the Return Value + * @exception IOException Description of the Exception + */ + protected static String normalizePath(String path) + throws IOException + { + // rule 1: if the path is empty, return "/" + if (path.length() == 0) + { + return "/"; + } + + // Finite State Machine to convert characters to lowercase, remove "//" and "/./" + // and make sure that all characters are escaped in a uniform way, i.e. + // {" ", "+", "%20"} -> "%20" + + StringBuffer w = new StringBuffer((int) (path.length() * 1.5)); + + int status = NP_CHAR; + + int pos = 0; + int length = path.length(); + char savedChar = '?'; + int hexChar = '?'; + int pathPos = -1; // position of last "/" + int questionPos = -1; // assert length >0 + boolean isInQuery = false; // question mark reached? + + while (pos < length) + { + char c = path.charAt(pos++); + try + { + switch (status) + { + case NP_SLASH: + if (c == '/') + { + // ignore subsequent slashes + } + else if (c == '.') + { + status = NP_POINT; + } + else if (c == '%') + { + status = NP_PERCENT; + } + else + { + pos--; + status = NP_CHAR; + } + break; + case NP_POINT: + if (c == '/') + { + // ignore + } + else if (c == '.') + { + // ignore; this shouldn't happen + } + else + { + w.append('.'); + pos--; + status = NP_SLASH; + } + break; + case NP_PERCENT: + if (c >= '0' && c <= '9') + { + hexChar = (c - '0') << 4; + } + else if (c >= 'a' && c <= 'f') + { + hexChar = (c - 'a' + 10) << 4; + } + else if (c >= 'A' && c <= 'F') + { + hexChar = (c - 'A' + 10) << 4; + } + else + { + w.append(charMap['%']); + w.append(charMap[c]); + break; + } + savedChar = c; + status = NP_HEX; + break; + case NP_HEX: + if (c >= '0' && c <= '9') + { + hexChar |= (c - '0'); + } + else if (c >= 'a' && c <= 'f') + { + hexChar |= (c - 'a' + 10); + } + else if (c >= 'A' && c <= 'F') + { + hexChar |= (c - 'A' + 10); + } + else + { + w.append(charMap['%']); + w.append(charMap[savedChar]); + w.append(charMap[c]); + break; + } + w.append(charMap[hexChar]); + status = NP_CHAR; + break; + case NP_CHAR: + switch (c) + { + case '%': + status = NP_PERCENT; + break; + case '/': + if(!isInQuery) + { + w.append(c); + pathPos = w.length(); // points to the char. after "/" + status = NP_SLASH; + } + else + { + w.append(charMap[c]); + } + break; + case '?': + if(!isInQuery) + { + if(pathPos == -1) + { + w.append('/'); + pathPos = w.length(); + } + questionPos = w.length(); // points to the char at "?" + isInQuery = true; + } + else + { + w.append(charMap[c]); + break; + } + case '&': + case ';': + case '@': + //case ':': + case '=': + w.append(c); + break; + case '+': + w.append("%20"); + break; + default: + w.append(charMap[c]); + break; + } + } + + } + catch (ArrayIndexOutOfBoundsException e) + { + // we encountered a unicode character >= 0x00ff + // write UTF-8 to distinguish it from other characters + // note that this does NOT lead to a pure UTF-8 URL since we + // write 0x80 <= c <= 0xff as one-byte strings + /* + * if (ch <= 0x007f) { // other ASCII + * sbuf.append(hex[ch]); + * } else + */ + // note that we ignore the case that we receive "%" + unicode + c + // (status = NP_HEX + Exception when writing savedchar); in that case + // only the second character is written. we consider this to be very + // unlikely + + // see http://www.w3.org/International/O-URL-code.html + if (c <= 0x07FF) + { + // non-ASCII <= 0x7FF + w.append(charMap[0xc0 | (c >> 6)]); + w.append(charMap[0x80 | (c & 0x3F)]); + } + else + { + // 0x7FF < c <= 0xFFFF + w.append(charMap[0xe0 | (c >> 12)]); + w.append(charMap[0x80 | ((c >> 6) & 0x3F)]); + w.append(charMap[0x80 | (c & 0x3F)]); + } + } + } + + // rule 3: delete index.* or default.* + + if(questionPos == -1) // no query + { + questionPos = w.length(); + } + else + { + if(questionPos == w.length()-1) + { + // empty query. assert questionPos > 0 + w.deleteCharAt(questionPos); + } + } + if(pathPos == -1) // no query + { + pathPos = 0; + } + if(questionPos > pathPos) + { + String file = w.substring(pathPos, questionPos); + { + //System.out.println("file: " + file); + if(file.startsWith("index.") || file.startsWith("default.")) + { + w.delete(pathPos, questionPos); // delete default page to avoid ambiguities + } + } + } + return w.toString(); + } + + + /** + * Description of the Method + * + * @param host Description of the Parameter + * @return Description of the Return Value + */ + protected static String normalizeHost(HostManager hostManager, String host) + { + return hostManager.getHostInfo(host.toLowerCase()).getHostName(); + } + +/* + HostManager hostManager; +*/ + + /** + * Constructor for the URLNormalizer object + * + * @param hostManager Description of the Parameter + */ + /* public URLNormalizer(HostManager hostManager) + { + this.hostManager = hostManager; + }*/ + + + /** + * Description of the Method + * + * @param u Description of the Parameter + * @return Description of the Return Value + * @exception IOException Description of the Exception + * @exception MalformedURLException Description of the Exception + */ + public static URL normalize(URL u, HostManager hostManager) + { + if (u.getProtocol().equals("http")) + { + try + { + int port = u.getPort(); + /*URL url =*/ + return new URL(u.getProtocol(), normalizeHost(hostManager, u.getHost()), port == 80 ? -1 : port, normalizePath(u.getFile())); + /*if(!u.equals(url)) + { + System.out.println(u.toExternalForm() + " -> " + url.toExternalForm()); + } + return url;*/ + } + catch(MalformedURLException e) + { + System.out.println("assertion failed: MalformedURLException in URLNormalizer.normalize()"); + throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()"); + } + catch(IOException e) + { + System.out.println("assertion failed: IOException in URLNormalizer.normalize()"); + throw new java.lang.InternalError("assertion failed: MalformedURLException in URLNormalizer.normalize()"); + } + + //return url + } + else + { + return u; + } + } + + public static void main(String[] args) throws Exception + { + HostManager hm = new HostManager(10); + hm.addSynonym("webinfo.campus.lmu.de", "webinfo.uni-muenchen.de"); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/conman/index.jsp?path=709"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://webinfo.uni-muenchen.de/view-i.cfm?url=http://abc/resp?a=c"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://webinfo.campus.lmu.de/view-i.cfm?url=http://abc/resp?a=c"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.bwl.uni-muenchen.de/default.asp?id=123"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/index.html"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/?"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de?id=abc"), hm)); + System.out.println(URLNormalizer.normalize(new URL("http://www.lmu.de/abcde$1?id=abc"), hm)); + URL u = new URL("http://www.lmu.de/abcde$1?id=abc"); + System.out.println("host: " + u.getHost()); + System.out.println("port: " + u.getPort()); + System.out.println(URLNormalizer.normalize(u, hm)); + + + + } +}