diff --git a/sandbox/contributions/webcrawler-LARM/build.sh b/sandbox/contributions/webcrawler-LARM/build.sh
new file mode 100755
index 00000000000..384c3ab9e68
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/build.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+#clean
+echo cleaning
+rm -r build
+rm -r classes
+rm -r cachingqueue
+rm -r logs
+
+#build
+echo making build directory
+mkdir build
+cd build
+echo extracting http client
+jar xvf ../lib/HTTPClient.zip >/dev/null
+cd ..
+cp -r src/* build
+mkdir classes
+echo compiling
+javac -g -d classes -sourcepath build build/HTTPClient/*.java
+javac -g -classpath ./lib/jakarta-oro-2.0.5.jar -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/clean.sh b/sandbox/contributions/webcrawler-LARM/clean.sh
new file mode 100755
index 00000000000..65c222feba1
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/clean.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+./cleanlastrun.sh
+rm -r build
+rm -r classes
+
diff --git a/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh b/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh
new file mode 100755
index 00000000000..730d2165b55
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+rm -r logs
+rm -r cachingqueue
+
diff --git a/sandbox/contributions/webcrawler-LARM/og-build.sh b/sandbox/contributions/webcrawler-LARM/og-build.sh
new file mode 100755
index 00000000000..5ce5c7dd214
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/og-build.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+#clean
+echo cleaning
+rm -r build
+rm -r classes
+rm -r cachingqueue
+rm -r logs
+
+#build
+echo making build directory
+mkdir build
+cd build
+#echo extracting http client
+#jar xvf ../lib/HTTPClient.zip >/dev/null
+cd ..
+cp -r src/* build
+mkdir classes
+echo compiling
+#javac -g -d classes -sourcepath build build/HTTPClient/*.java
+javac -g -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/run.sh b/sandbox/contributions/webcrawler-LARM/run.sh
new file mode 100755
index 00000000000..4af92d2fed6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/run.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+rm -r logs
+mkdir logs
+java -server -Xmx400m -classpath classes:lib/jakarta-oro-2.0.5.jar de.lanlab.larm.fetcher.FetcherMain -start http://www.cis.uni-muenchen.de/ -restrictto 'http://[^/]*\.uni-muenchen\.de.*' -threads 15
diff --git a/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java
new file mode 100644
index 00000000000..994caec61f6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java
@@ -0,0 +1,278 @@
+/*
+ * @(#)ContentEncodingModule.java 0.3-3 06/05/2001
+ *
+ * This file is part of the HTTPClient package
+ * Copyright (C) 1996-2001 Ronald Tschalär
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307, USA
+ *
+ * For questions, suggestions, bug-reports, enhancement-requests etc.
+ * I may be contacted at:
+ *
+ *     ronald@innovation.ch
+ *
+ * The HTTPClient's home page is located at:
+ *
+ *     http://www.innovation.ch/java/HTTPClient/
+ *
+ */
+package HTTPClient;
+
+import java.io.IOException;
+import java.util.Vector;
+import java.util.zip.InflaterInputStream;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This module handles the Content-Encoding response header. It currently
+ * handles the "gzip", "deflate", "compress" and "identity" tokens.
+ *
+ * @author Ronald Tschalär
+ * @created 29. Dezember 2001
+ * @version 0.3-3 06/05/2001
+ */
+public class ContentEncodingModule implements HTTPClientModule
+{
+    // Methods
+
+    /**
+     * Invoked by the HTTPClient. Ensures the request advertises the codings this module can decode.
+     *
+     * @param req the request whose Accept-Encoding header is read and, unless "*" has q > 0, extended
+     * @param resp not used by this module
+     * @return REQ_CONTINUE on every non-exceptional path
+     * @exception ModuleException if Accept-Encoding cannot be parsed or the q value of "*" is invalid
+     */
+    public int requestHandler(Request req, Response[] resp)
+        throws ModuleException
+    {
+        // parse Accept-Encoding header
+
+        int idx;
+        NVPair[] hdrs = req.getHeaders();
+        for (idx = 0; idx < hdrs.length; idx++)
+        {
+            if (hdrs[idx].getName().equalsIgnoreCase("Accept-Encoding"))
+            {
+                break;
+            }
+        }
+
+        Vector pae;
+        if (idx == hdrs.length)
+        {
+            hdrs = Util.resizeArray(hdrs, idx + 1);
+            req.setHeaders(hdrs);
+            pae = new Vector();
+        }
+        else
+        {
+            try
+            {
+                pae = Util.parseHeader(hdrs[idx].getValue());
+            }
+            catch (ParseException pe)
+            {
+                throw new ModuleException(pe.toString());
+            }
+        }
+
+        // done if "*;q=1.0" present
+        int qidx; // scans the params of "*"; idx must keep addressing the Accept-Encoding slot in hdrs
+        HttpHeaderElement all = Util.getElement(pae, "*");
+        if (all != null)
+        {
+            NVPair[] params = all.getParams();
+            for (qidx = 0; qidx < params.length; qidx++)
+            {
+                if (params[qidx].getName().equalsIgnoreCase("q"))
+                {
+                    break;
+                }
+            }
+
+            if (qidx == params.length)
+            {
+                // no qvalue, i.e. q=1.0
+                return REQ_CONTINUE;
+            }
+
+            if (params[qidx].getValue() == null ||
+                params[qidx].getValue().length() == 0)
+            {
+                throw new ModuleException("Invalid q value for \"*\" in " +
+                    "Accept-Encoding header: ");
+            }
+
+            try
+            {
+                if (Float.valueOf(params[qidx].getValue()).floatValue() > 0.)
+                {
+                    return REQ_CONTINUE;
+                }
+            }
+            catch (NumberFormatException nfe)
+            {
+                throw new ModuleException("Invalid q value for \"*\" in " +
+                    "Accept-Encoding header: " + nfe.getMessage());
+            }
+        }
+
+        // Add gzip, deflate and compress tokens to the Accept-Encoding header
+
+        if (!pae.contains(new HttpHeaderElement("deflate")))
+        {
+            pae.addElement(new HttpHeaderElement("deflate"));
+        }
+        if (!pae.contains(new HttpHeaderElement("gzip")))
+        {
+            pae.addElement(new HttpHeaderElement("gzip"));
+        }
+        if (!pae.contains(new HttpHeaderElement("x-gzip")))
+        {
+            pae.addElement(new HttpHeaderElement("x-gzip"));
+        }
+        if (!pae.contains(new HttpHeaderElement("compress")))
+        {
+            pae.addElement(new HttpHeaderElement("compress"));
+        }
+        if (!pae.contains(new HttpHeaderElement("x-compress")))
+        {
+            pae.addElement(new HttpHeaderElement("x-compress"));
+        }
+
+        hdrs[idx] = new NVPair("Accept-Encoding", Util.assembleHeader(pae));
+
+        return REQ_CONTINUE;
+    }
+
+
+    /**
+     * Invoked by the HTTPClient. This module does nothing in phase 1.
+     *
+     * @param resp not used
+     * @param req not used
+     */
+    public void responsePhase1Handler(Response resp, RoRequest req)
+    {
+    }
+
+
+    /**
+     * Invoked by the HTTPClient. This module does nothing in phase 2.
+     *
+     * @param resp not used
+     * @param req not used
+     * @return always RSP_CONTINUE
+     */
+    public int responsePhase2Handler(Response resp, Request req)
+    {
+        return RSP_CONTINUE;
+    }
+
+
+    /**
+     * Invoked by the HTTPClient. Undoes the outermost content coding of the response, if it is a known one.
+     *
+     * @param resp the response; its input stream and Content-Encoding/Content-length headers may be changed
+     * @param req the request; only its method is consulted (HEAD responses are skipped)
+     * @exception IOException if the decoding stream cannot be set up on the response stream
+     * @exception ModuleException if the Content-Encoding header cannot be parsed
+     */
+    public void responsePhase3Handler(Response resp, RoRequest req)
+        throws IOException, ModuleException
+    {
+        String ce = resp.getHeader("Content-Encoding");
+        if (ce == null || req.getMethod().equals("HEAD") ||
+            resp.getStatusCode() == 206)
+        {
+            return;
+        }
+
+        Vector pce;
+        try
+        {
+            pce = Util.parseHeader(ce);
+        }
+        catch (ParseException pe)
+        {
+            throw new ModuleException(pe.toString());
+        }
+
+        if (pce.size() == 0)
+        {
+            return;
+        }
+        // codings are listed in the order applied, so the last token is the one to undo (and remove) first
+        String encoding = ((HttpHeaderElement) pce.lastElement()).getName();
+        if (encoding.equalsIgnoreCase("gzip") ||
+            encoding.equalsIgnoreCase("x-gzip"))
+        {
+            Log.write(Log.MODS, "CEM: pushing gzip-input-stream");
+
+            resp.inp_stream = new GZIPInputStream(resp.inp_stream);
+            pce.removeElementAt(pce.size() - 1);
+            resp.deleteHeader("Content-length");
+        }
+        else if (encoding.equalsIgnoreCase("deflate"))
+        {
+            Log.write(Log.MODS, "CEM: pushing inflater-input-stream");
+
+            resp.inp_stream = new InflaterInputStream(resp.inp_stream);
+            pce.removeElementAt(pce.size() - 1);
+            resp.deleteHeader("Content-length");
+        }
+        else if (encoding.equalsIgnoreCase("compress") ||
+            encoding.equalsIgnoreCase("x-compress"))
+        {
+            Log.write(Log.MODS, "CEM: pushing uncompress-input-stream");
+
+            resp.inp_stream = new UncompressInputStream(resp.inp_stream);
+            pce.removeElementAt(pce.size() - 1);
+            resp.deleteHeader("Content-length");
+        }
+        else if (encoding.equalsIgnoreCase("identity"))
+        {
+            Log.write(Log.MODS, "CEM: ignoring 'identity' token");
+            pce.removeElementAt(pce.size() - 1);
+        }
+        else
+        {
+            Log.write(Log.MODS, "CEM: Unknown content encoding '" +
+                encoding + "'");
+        }
+
+        if (pce.size() > 0)
+        {
+            resp.setHeader("Content-Encoding", Util.assembleHeader(pce));
+        }
+        else
+        {
+            resp.deleteHeader("Content-Encoding");
+        }
+    }
+
+
+    /**
+     * Invoked by the HTTPClient. This module does nothing with trailers.
+     *
+     * @param resp not used
+     * @param req not used
+     */
+    public void trailerHandler(Response resp, RoRequest req)
+    {
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java
new file mode 100644
index 00000000000..ba9309cb84c
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java
@@ -0,0 +1,4489 @@
+/*
+ * @(#)HTTPConnection.java 0.3-3 06/05/2001
+ *
+ * This file is part of the HTTPClient package
+ * Copyright (C) 1996-2001 Ronald Tschalär
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307, USA
+ *
+ * For questions, suggestions, bug-reports, enhancement-requests etc.
+ * I may be contacted at: + * + * ronald@innovation.ch + * + * The HTTPClient's home page is located at: + * + * http://www.innovation.ch/java/HTTPClient/ + * + */ +package HTTPClient; + +import java.io.OutputStream; +import java.io.DataOutputStream; +import java.io.FilterOutputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InterruptedIOException; +import java.net.URL; +import java.net.Socket; +import java.net.InetAddress; +import java.net.SocketException; +import java.net.ConnectException; +import java.net.UnknownHostException; +import java.net.NoRouteToHostException; +import java.util.Vector; +import java.applet.Applet; + +/** + * This class implements http protocol requests; it contains most of HTTP/1.1 + * and ought to be unconditionally compliant. Redirections are automatically + * handled, and authorizations requests are recognized and dealt with via an + * authorization handler. Only full HTTP/1.0 and HTTP/1.1 requests are + * generated. HTTP/1.1, HTTP/1.0 and HTTP/0.9 responses are recognized.
+ *
+ * Using the HTTPClient should be quite simple. First add the import statement '
+ * import HTTPClient.*;
' to your file(s). Request can then be sent
+ * using one of the methods Head() , Get() , Post()
+ * , etc in HTTPConnection . These methods all return an
+ * instance of HTTPResponse which has methods for accessing the
+ * response headers (getHeader() , getHeaderAsInt() ,
+ * etc), various response info (getStatusCode() ,
+ * getReasonLine() , etc) and the response data (getData() ,
+ * getText() , and getInputStream() ). Following are some
+ * examples.
+ * + * If this is in an applet you can retrieve files from your server as follows: + *
+ * try + * { + * HTTPConnection con = new HTTPConnection(this); + * HTTPResponse rsp = con.Get("/my_file"); + * if (rsp.getStatusCode() >= 300) + * { + * System.err.println("Received Error: "+rsp.getReasonLine()); + * System.err.println(rsp.getText()); + * } + * else + * data = rsp.getData(); + * + * rsp = con.Get("/another_file"); + * if (rsp.getStatusCode() >= 300) + * { + * System.err.println("Received Error: "+rsp.getReasonLine()); + * System.err.println(rsp.getText()); + * } + * else + * other_data = rsp.getData(); + * } + * catch (IOException ioe) + * { + * System.err.println(ioe.toString()); + * } + * catch (ModuleException me) + * { + * System.err.println("Error handling request: " + me.getMessage()); + * } + *This will get the files "/my_file" and "/another_file" and put their + * contents into byte[]'s accessible via
getData()
. Note that you
+ * need to only create a new HTTPConnection when sending a request to
+ * a new server (different host or port); although you may create a new
+ * HTTPConnection for every request to the same server this is not
+ * recommended, as various information about the server is cached
+ * after the first request (to optimize subsequent requests) and persistent
+ * connections are used whenever possible. + * + * To POST form data you would use something like this (assuming you have two + * fields called name and e-mail , whose contents are + * stored in the variables name and email ):
+ * try + * { + * NVPair form_data[] = new NVPair[2]; + * form_data[0] = new NVPair("name", name); + * form_data[1] = new NVPair("e-mail", email); + * + * HTTPConnection con = new HTTPConnection(this); + * HTTPResponse rsp = con.Post("/cgi-bin/my_script", form_data); + * if (rsp.getStatusCode() >= 300) + * { + * System.err.println("Received Error: "+rsp.getReasonLine()); + * System.err.println(rsp.getText()); + * } + * else + * stream = rsp.getInputStream(); + * } + * catch (IOException ioe) + * { + * System.err.println(ioe.toString()); + * } + * catch (ModuleException me) + * { + * System.err.println("Error handling request: " + me.getMessage()); + * } + *Here the response data is read at leasure via an InputStream + * instead of all at once into a byte[] .
+ * + * As another example, if you have a URL you're trying to send a request to you + * would do something like the following:
+ * try + * { + * URL url = new URL("http://www.mydomain.us/test/my_file"); + * HTTPConnection con = new HTTPConnection(url); + * HTTPResponse rsp = con.Put(url.getFile(), "Hello World"); + * if (rsp.getStatusCode() >= 300) + * { + * System.err.println("Received Error: "+rsp.getReasonLine()); + * System.err.println(rsp.getText()); + * } + * else + * text = rsp.getText(); + * } + * catch (IOException ioe) + * { + * System.err.println(ioe.toString()); + * } + * catch (ModuleException me) + * { + * System.err.println("Error handling request: " + me.getMessage()); + * } + *
+ * + * There are a whole number of methods for each request type; however the + * general forms are ([...] means that the enclosed is optional): + *
+ *
+ * Note: there is a small window where a request method such as Get()
+ * may have been invoked but the request has not been built and added to the
+ * list. Any request in this window will not be aborted.
+ *
+ * @since V0.2-3
+ */
+ public void stop()
+ {
+     // Flag every request currently on the request list as aborted.
+     Request pending = (Request) RequestList.enumerate();
+     while (pending != null)
+     {
+         pending.aborted = true;
+         pending = (Request) RequestList.next();
+     }
+
+     // Then abort every stream demultiplexor on the demux list.
+     StreamDemultiplexor mux = (StreamDemultiplexor) DemuxList.enumerate();
+     while (mux != null)
+     {
+         mux.abort();
+         mux = (StreamDemultiplexor) DemuxList.next();
+     }
+ }
+
+
+ /**
+ * Sets the default http headers to be sent with each request. The actual
+ * headers sent are determined as follows: for each header specified in
+ * multiple places a value given as part of the request takes priority over
+ * any default values set by this method, which in turn takes priority over
+ * any built-in default values. A different way of looking at it is that we
+ * start off with a list of all headers specified with the request, then add
+ * any default headers set by this method which aren't already in our list,
+ * and finally add any built-in headers which aren't yet in the list. There
+ * is one exception to this rule: the "Content-length" header is always
+ * ignored; and when posting form-data any default "Content-type" is ignored
+ * in favor of the built-in "application/x-www-form-urlencoded" (however it
+ * will be overridden by any content-type header specified as part of the
+ * request).
+ * + * Typical headers you might want to set here are "Accept" and its + * "Accept-*" relatives, "Connection", "From", "User-Agent", etc. + * + * @param headers an array of header-name/value pairs (do not give the + * separating ':'). + */ + public void setDefaultHeaders(NVPair[] headers) + { + int length = (headers == null ? 0 : headers.length); + NVPair[] def_hdrs = new NVPair[length]; + + // weed out undesired headers + int sidx; + + // weed out undesired headers + int didx; + for (sidx = 0, didx = 0; sidx < length; sidx++) + { + if (headers[sidx] == null) + { + continue; + } + + String name = headers[sidx].getName().trim(); + if (name.equalsIgnoreCase("Content-length")) + { + continue; + } + + def_hdrs[didx++] = headers[sidx]; + } + + if (didx < length) + { + def_hdrs = Util.resizeArray(def_hdrs, didx); + } + + synchronized (DefaultHeaders) + { + DefaultHeaders = def_hdrs; + } + } + + + /** + * Gets the current list of default http headers. + * + * @return an array of header/value pairs. + */ + public NVPair[] getDefaultHeaders() + { + synchronized (DefaultHeaders) + { + return (NVPair[]) DefaultHeaders.clone(); + } + } + + + /** + * Returns the protocol this connection is talking. + * + * @return a string containing the (lowercased) protocol + */ + public String getProtocol() + { + switch (Protocol) + { + case HTTP: + return "http"; + case HTTPS: + return "https"; + case SHTTP: + return "shttp"; + case HTTP_NG: + return "http-ng"; + default: + throw new Error("HTTPClient Internal Error: invalid protocol " + + Protocol); + } + } + + + /** + * Returns the host this connection is talking to. + * + * @return a string containing the (lowercased) host name. + */ + public String getHost() + { + return Host; + } + + + /** + * Returns the port this connection connects to. This is always the actual + * port number, never -1. 
+ * + * @return the port number + */ + public int getPort() + { + return Port; + } + + + /** + * Returns the host of the proxy this connection is using. + * + * @return a string containing the (lowercased) host name. + */ + public String getProxyHost() + { + return Proxy_Host; + } + + + /** + * Returns the port of the proxy this connection is using. + * + * @return the port number + */ + public int getProxyPort() + { + return Proxy_Port; + } + + + /** + * See if the given uri is compatible with this connection. Compatible means + * that the given uri can be retrieved using this connection object. + * + * @param uri the URI to check + * @return true if they're compatible, false otherwise + * @since V0.3-2 + */ + public boolean isCompatibleWith(URI uri) + { + if (!uri.getScheme().equals(getProtocol()) || + !uri.getHost().equalsIgnoreCase(Host)) + { + return false; + } + + int port = uri.getPort(); + if (port == -1) + { + port = URI.defaultPort(uri.getScheme()); + } + return port == Port; + } + + + /** + * Sets/Resets raw mode. In raw mode all modules are bypassed, meaning the + * automatic handling of authorization requests, redirections, cookies, etc. + * is turned off.
+ *
+ * The default is false.
+ *
+ * @param raw if true removes all modules (except for the retry module)
+ * @deprecated This is not really needed anymore; in V0.2 requests were
+ * synchronous and therefore to do pipelining you needed to disable the
+ * processing of responses.
+ * @see #removeModule(java.lang.Class)
+ */
+ public void setRawMode(boolean raw)
+ {
+     // Don't remove the retry module
+     String[] modules = {"HTTPClient.CookieModule",
+         "HTTPClient.RedirectionModule",
+         "HTTPClient.AuthorizationModule",
+         "HTTPClient.DefaultModule",
+         "HTTPClient.TransferEncodingModule",
+         "HTTPClient.ContentMD5Module",
+         "HTTPClient.ContentEncodingModule"};
+
+     for (int idx = 0; idx < modules.length; idx++)
+     {
+         try
+         {
+             Class module = Class.forName(modules[idx]);
+             if (raw)
+             {
+                 removeModule(module);
+             }
+             else
+             {
+                 // -1 asks addModule for the last position in the list
+                 addModule(module, -1);
+             }
+         }
+         catch (ClassNotFoundException cnfe)
+         {
+             // Still best effort (a module class that cannot be loaded is
+             // skipped), but the skip is now reported instead of swallowed.
+             Log.write(Log.CONN, "Conn: Module " + modules[idx] +
+                 " not found, skipping: " + cnfe);
+         }
+     }
+ }
+
+
+ /**
+ * Sets the default timeout value to be used for each new HTTPConnection.
+ * The default is 0, which setTimeout(int) documents as "wait indefinitely".
+ * The value is stored as given; this method performs no range check.
+ *
+ * @param time the timeout in milliseconds.
+ * @see #setTimeout(int)
+ */
+ public static void setDefaultTimeout(int time)
+ {
+ DefaultTimeout = time;
+ }
+
+
+ /**
+ * Gets the default timeout value to be used for each new HTTPConnection.
+ * This is the value held in DefaultTimeout, the field that
+ * setDefaultTimeout(int) assigns.
+ *
+ * @return the timeout in milliseconds.
+ * @see #setTimeout(int)
+ */
+ public static int getDefaultTimeout()
+ {
+ return DefaultTimeout;
+ }
+
+
+ /**
+ * Sets the timeout to be used for creating connections and reading
+ * responses. When a timeout expires the operation will throw an
+ * InterruptedIOException. The operation may be restarted again afterwards.
+ * If the operation is not restarted and it is a read operation (i.e
+ * HTTPResponse.xxxx()) then resp.getInputStream().close()
+ * should be invoked.
+ * + * When creating new sockets the timeout will limit the time spent doing the + * host name translation and establishing the connection with the server. + *
+ * + * The timeout also influences the reading of the response headers. However, + * it does not specify a how long, for example, getStatusCode() may take, as + * might be assumed. Instead it specifies how long a read on the socket may + * take. If the response dribbles in slowly with packets arriving quicker + * than the timeout then the method will complete normally. I.e. the + * exception is only thrown if nothing arrives on the socket for the + * specified time. Furthermore, the timeout only influences the reading of + * the headers, not the reading of the body.
+ * + * Read Timeouts are associated with responses, so that you may change this + * value before each request and it won't affect the reading of responses to + * previous requests. + * + * @param time the time in milliseconds. A time of 0 means wait + * indefinitely. + * @see #stop() + */ + public void setTimeout(int time) + { + Timeout = time; + } + + + /** + * Gets the timeout used for reading response data. + * + * @return the current timeout value + * @see #setTimeout(int) + */ + public int getTimeout() + { + return Timeout; + } + + + /** + * Controls whether modules are allowed to prompt the user or pop up dialogs + * if neccessary. + * + * @param allow if true allows modules to interact with user. + */ + public void setAllowUserInteraction(boolean allow) + { + allowUI = allow; + } + + + /** + * returns whether modules are allowed to prompt or popup dialogs if + * neccessary. + * + * @return true if modules are allowed to interact with user. + */ + public boolean getAllowUserInteraction() + { + return allowUI; + } + + + /** + * Sets the default allow-user-action. + * + * @param allow if true allows modules to interact with user. + */ + public static void setDefaultAllowUserInteraction(boolean allow) + { + defaultAllowUI = allow; + } + + + /** + * Gets the default allow-user-action. + * + * @return true if modules are allowed to interact with user. + */ + public static boolean getDefaultAllowUserInteraction() + { + return defaultAllowUI; + } + + + /** + * Returns the default list of modules. + * + * @return an array of classes + */ + public static Class[] getDefaultModules() + { + return getModules(DefaultModuleList); + } + + + /** + * Adds a module to the default list. It must implement the + * HTTPClientModule interface. If the module is already in the list + * then this method does nothing. This method only affects instances of + * HTTPConnection created after this method has been invoked; it does not + * affect existing instances.
+ * + * Example:
+ * HTTPConnection.addDefaultModule(Class.forName("HTTPClient.CookieModule"), 1); + *adds the cookie module as the second module in the list.
+ * + * The default list is created at class initialization time from the + * property HTTPClient.Modules . This must contain a "|" + * separated list of classes in the order they're to be invoked. If this + * property is not set it defaults to: "HTTPClient.RetryModule | + * HTTPClient.CookieModule | HTTPClient.RedirectionModule | + * HTTPClient.AuthorizationModule | HTTPClient.DefaultModule | + * HTTPClient.TransferEncodingModule | HTTPClient.ContentMD5Module | + * HTTPClient.ContentEncodingModule" + * + * @param module the module's Class object + * @param pos the position of this module in the list; if pos + * >= 0 then this is the absolute position in the list (0 is the first + * position); if pos < 0 then this is the position relative + * to the end of the list (-1 means the last element, -2 the second to + * last element, etc). + * @return true if module was successfully added; false if the module + * is already in the list. + * @see HTTPClientModule + */ + public static boolean addDefaultModule(Class module, int pos) + { + return addModule(DefaultModuleList, module, pos); + } + + + /** + * Removes a module from the default list. If the module is not in the list + * it does nothing. This method only affects instances of HTTPConnection + * created after this method has been invoked; it does not affect existing + * instances. + * + * @param module the module's Class object + * @return true if module was successfully removed; false otherwise + */ + public static boolean removeDefaultModule(Class module) + { + return removeModule(DefaultModuleList, module); + } + + + /** + * Returns the list of modules used currently. + * + * @return an array of classes + */ + public Class[] getModules() + { + return getModules(ModuleList); + } + + + /** + * Adds a module to the current list. It must implement the + * HTTPClientModule interface. If the module is already in the list + * then this method does nothing. 
+ *
+ * @param module the module's Class object
+ * @param pos the position of this module in the list; if pos
+ * >= 0 then this is the absolute position in the list (0 is the first
+ * position); if pos < 0 then this is the position relative
+ * to the end of the list (-1 means the last element, -2 the second to
+ * last element, etc).
+ * @return true if module was successfully added; false if the module
+ * is already in the list.
+ * @see HTTPClientModule
+ */
+ public boolean addModule(Class module, int pos)
+ {
+     return addModule(ModuleList, module, pos);
+ }
+
+
+ /**
+ * Removes a module from the current list. If the module is not in the list
+ * it does nothing.
+ *
+ * @param module the module's Class object
+ * @return true if module was successfully removed; false otherwise
+ */
+ public boolean removeModule(Class module)
+ {
+     return removeModule(ModuleList, module);
+ }
+
+
+ /**
+ * Copies the contents of a module list into a new array.
+ *
+ * @param list the module list to copy; it is locked while being copied
+ * @return a new array holding the list's elements in list order
+ */
+ private final static Class[] getModules(Vector list)
+ {
+     synchronized (list)
+     {
+         Class[] modules = new Class[list.size()];
+         list.copyInto(modules);
+         return modules;
+     }
+ }
+
+
+ /**
+ * Inserts a module class into the given module list. The class is
+ * instantiated once beforehand to check that it implements
+ * HTTPClientModule; a failure of that check is rethrown as a
+ * RuntimeException.
+ *
+ * @param list the module list to modify
+ * @param module the module's Class object; null is rejected
+ * @param pos the insert position; a negative value counts back from the
+ * end of list (-1 means the last position)
+ * @return true if the module was added; false if module is null or is
+ * already in list
+ */
+ private final static boolean addModule(Vector list, Class module, int pos)
+ {
+     if (module == null)
+     {
+         return false;
+     }
+
+     // check if module implements HTTPClientModule
+     try
+     {
+         HTTPClientModule tmp = (HTTPClientModule) module.newInstance();
+     }
+     catch (RuntimeException re)
+     {
+         throw re;
+     }
+     catch (Exception e)
+     {
+         throw new RuntimeException(e.toString());
+     }
+
+     synchronized (list)
+     {
+         // check if module already in list
+         if (list.contains(module))
+         {
+             return false;
+         }
+
+         // add module to list; a negative pos is relative to the end of
+         // the list being modified, not of the default list
+         if (pos < 0)
+         {
+             list.insertElementAt(module, list.size() + pos + 1);
+         }
+         else
+         {
+             list.insertElementAt(module, pos);
+         }
+     }
+
+     Log.write(Log.CONN, "Conn: Added module " + module.getName() +
+         " to " +
+         ((list == DefaultModuleList) ? "default " : "") +
+         "list");
+
+     return true;
+ }
+
+
+ /**
+ * Removes a module class from the given module list and logs the removal.
+ *
+ * @param list the module list to modify
+ * @param module the module's Class object; null is rejected
+ * @return true if the module was in the list and has been removed; false
+ * otherwise
+ */
+ private final static boolean removeModule(Vector list, Class module)
+ {
+     if (module == null)
+     {
+         return false;
+     }
+
+     boolean removed = list.removeElement(module);
+     if (removed)
+     {
+         Log.write(Log.CONN, "Conn: Removed module " + module.getName() +
+             " from " +
+             ((list == DefaultModuleList) ? "default " : "") +
+             "list");
+     }
+
+     return removed;
+ }
+
+
+ /**
+ * Sets the current context. The context is used by modules such as the
+ * AuthorizationModule and the CookieModule which keep lists of info that is
+ * normally shared between all instances of HTTPConnection. This is usually
+ * the desired behaviour. However, in some cases one would like to simulate
+ * multiple independent clients within the same application and hence the
+ * sharing of such info should be restricted. This is where the context
+ * comes in. Modules will only share their info between requests using the
+ * same context (i.e. they keep multiple lists, one for each context).
+ *
+ * The context may be any object. Contexts are considered equal if equals()
+ * returns true. Examples of useful context objects are threads (e.g. if you
+ * are running multiple clients, one per thread) and sockets (e.g. if you
+ * are implementing a gateway).
+ *
+ * When a new HTTPConnection is created it is initialized with a default
+ * context which is the same for all instances. This method must be invoked
+ * immediately after a new HTTPConnection is created and before any request
+ * method is invoked. Furthermore, this method may only be called once (i.e.
+ * the context is "sticky").
+ *
+ * @param context the new context; must be non-null
+ */
+ public void setContext(Object context)
+ {
+     // A missing argument is rejected first, so it takes precedence over
+     // the "already set" failure below.
+     if (null == context)
+     {
+         throw new IllegalArgumentException("Context must be non-null");
+     }
+
+     // The context is sticky: once assigned it cannot be replaced.
+     final boolean alreadyAssigned = (Context != null);
+     if (alreadyAssigned)
+     {
+         throw new IllegalStateException("Context already set");
+     }
+
+     Context = context;
+ }
+
+
+ /**
+ * Reports the context this connection operates in.
+ *
+ * @return the context assigned through setContext(), or the default
+ * context when none has been assigned
+ * @see #setContext(java.lang.Object)
+ */
+ public Object getContext()
+ {
+     return (Context != null) ? Context : dflt_context;
+ }
+
+
+ /**
+ * Returns the default context. This is the same object getContext()
+ * falls back to for a connection on which setContext() has not been
+ * invoked.
+ *
+ * @return the default context
+ * @see #setContext(java.lang.Object)
+ */
+ public static Object getDefaultContext()
+ {
+ return dflt_context;
+ }
+
+
+ /**
+ * Adds an authorization entry for the "digest" authorization scheme to the
+ * list. If an entry already exists for the "digest" scheme and the
+ * specified realm then it is overwritten.
+  *
+  * This is a convenience method and just invokes the corresponding method in
+  * AuthorizationInfo.
+  *
+  * @param realm the realm
+  * @param user the username
+  * @param passwd the password
+  * @see AuthorizationInfo#addDigestAuthorization(java.lang.String,
+  * int, java.lang.String, java.lang.String, java.lang.String)
+  */
+ public void addDigestAuthorization(String realm, String user, String passwd)
+ {
+     // forwards this connection's Host, Port and current context
+     AuthorizationInfo.addDigestAuthorization(Host, Port, realm, user,
+                                              passwd, getContext());
+ }
+
+
+ /**
+  * Adds an authorization entry for the "basic" authorization scheme to the
+  * list. If an entry already exists for the "basic" scheme and the specified
+  * realm then it is overwritten.
+  *
+  * This is a convenience method and just invokes the corresponding method in
+  * AuthorizationInfo.
+  *
+  * @param realm the realm
+  * @param user the username
+  * @param passwd the password
+  * @see AuthorizationInfo#addBasicAuthorization(java.lang.String,
+  * int, java.lang.String, java.lang.String, java.lang.String)
+  */
+ public void addBasicAuthorization(String realm, String user, String passwd)
+ {
+     // forwards this connection's Host, Port and current context
+     AuthorizationInfo.addBasicAuthorization(Host, Port, realm, user,
+                                             passwd, getContext());
+ }
+
+
+ /**
+  * Sets the default proxy server to use. The proxy will only be used for new
+  * HTTPConnections created after this call and will not affect
+  * current instances of HTTPConnection. A null or empty string
+  * host parameter disables the proxy.
+  *
+  * In an application or using the Appletviewer an alternative to this method
+  * is to set the following properties (either in the properties file or on
+  * the command line): http.proxyHost and http.proxyPort.
+  * Whether http.proxyHost is set or not determines
+  * whether a proxy server is used.
+  *
+  * If the proxy server requires authorization and you wish to set this
+  * authorization information in the code, then you may use any of the
+  * AuthorizationInfo.addXXXAuthorization() methods to do so. Specify
+  * the same host and port as in this method. If you
+  * have not given any authorization info and the proxy server requires
+  * authorization then you will be prompted for the necessary info via a
+  * popup the first time you do a request.
+  *
+  * @param host the host on which the proxy server resides.
+  * @param port the port the proxy server is listening on.
+  * @see #setCurrentProxy(java.lang.String, int)
+  */
+ public static void setProxyServer(String host, int port)
+ {
+     // a missing or blank host name switches the default proxy off; the
+     // default port is left as it was in that case
+     final String proxy = (host == null) ? "" : host.trim();
+     if (proxy.length() > 0)
+     {
+         Default_Proxy_Host = proxy.toLowerCase();
+         Default_Proxy_Port = port;
+     }
+     else
+     {
+         Default_Proxy_Host = null;
+     }
+ }
+
+
+ /**
+  * Sets the proxy used by this instance. This can be used to override the
+  * proxy setting inherited from the default proxy setting. A null or empty
+  * string host parameter disables the proxy.
+ *
+ * Note that if you set a proxy for the connection using this method, and a
+ * request made over this connection is redirected to a different server,
+ * then the connection used for the new server will not pick up this proxy
+ * setting, but instead will use the default proxy settings.
+ *
+ * @param host the host the proxy runs on
+ * @param port the port the proxy is listening on
+ * @see #setProxyServer(java.lang.String, int)
+ */
+ public synchronized void setCurrentProxy(String host, int port)
+ {
+     // a missing or blank host name switches the proxy off for this instance;
+     // the stored port is left untouched in that case
+     final String proxy = (host == null) ? "" : host.trim();
+     if (proxy.length() > 0)
+     {
+         Proxy_Host = proxy.toLowerCase();
+         // non-positive port numbers fall back to port 80
+         Proxy_Port = (port <= 0) ? 80 : port;
+     }
+     else
+     {
+         Proxy_Host = null;
+     }
+
+     // the proxy might be talking a different version, so renegotiate
+     if (Protocol == HTTP || Protocol == HTTPS)
+     {
+         // a forced HTTP/1.0 counts as a known version; otherwise start out
+         // with HTTP/1.1 and mark the version as not yet known
+         final boolean http10 = force_1_0;
+         ServerProtocolVersion = http10 ? HTTP_1_0 : HTTP_1_1;
+         ServProtVersKnown = http10;
+         RequestProtocolVersion = http10 ? "HTTP/1.0" : "HTTP/1.1";
+     }
+     else if (Protocol == HTTP_NG)
+     {
+         ServerProtocolVersion = -1;      // unknown
+         ServProtVersKnown = false;
+         RequestProtocolVersion = "";
+     }
+     else if (Protocol == SHTTP)
+     {
+         ServerProtocolVersion = -1;      // unknown
+         ServProtVersKnown = false;
+         RequestProtocolVersion = "Secure-HTTP/1.3";
+     }
+     else
+     {
+         throw new Error("HTTPClient Internal Error: invalid protocol " +
+                         Protocol);
+     }
+
+     // keep-alive behaviour has to be determined again as well
+     keepAliveUnknown = true;
+     doesKeepAlive = false;
+
+     // drop the references to the current demultiplexor and to the
+     // responses used for stalling
+     input_demux = null;
+     early_stall = null;
+     late_stall = null;
+     prev_resp = null;
+ }
+
+
+ /**
+  * Add host to the list of hosts which should be accessed
+  * directly, not via any proxy set by setProxyServer().
+  *
+  * The host may be any of:
+  *   - a complete host name
+  *   - a domain name; domain names must begin with a dot (e.g. ".disney.com")
+  *   - an IP-address (e.g. "11.22.33.44")
+  *   - an IP-subnet, written as an IP-address and a netmask of the same
+  *     length separated by a "/"
+  *
+  * The two properties HTTPClient.nonProxyHosts and
+  * http.nonProxyHosts are used when this class is loaded to initialize
+  * the list of non-proxy hosts. The second property is only read if the
+  * first one is not set; the second property is also used by the JDK's
+  * URLConnection. These properties must contain a "|" separated list of
+  * entries which conform to the above rules for the host
+  * parameter (e.g. "11.22.33.44|.disney.com").
+  *
+  * @param host a host name, domain name, IP-address or
+  * IP-subnet; must not be null.
+  * @exception ParseException if host is empty after trimming, if an
+  * IP-address or netmask contains a part that is not a number, or if the
+  * length of the netmask does not match the length of the IP-address
+  */
+ public static void dontProxyFor(String host)
+         throws ParseException
+ {
+     host = host.trim().toLowerCase();
+
+     // An empty entry cannot be classified. Report it through the declared
+     // exception instead of letting charAt(0) below fail with a
+     // StringIndexOutOfBoundsException.
+     if (host.length() == 0)
+     {
+         throw new ParseException("host is empty");
+     }
+
+     // check for domain name
+
+     if (host.charAt(0) == '.')
+     {
+         if (!non_proxy_dom_list.contains(host))
+         {
+             non_proxy_dom_list.addElement(host);
+         }
+         return;
+     }
+
+     // check for host name: anything containing a character other than a
+     // digit, '.' or '/' is taken to be a name
+
+     for (int idx = 0; idx < host.length(); idx++)
+     {
+         if (!Character.isDigit(host.charAt(idx)) &&
+             host.charAt(idx) != '.' && host.charAt(idx) != '/')
+         {
+             non_proxy_host_list.put(host, "");
+             return;
+         }
+     }
+
+     // must be an IP-address
+
+     byte[] ip_addr;
+     byte[] ip_mask;
+     try
+     {
+         int slash = host.indexOf('/');
+         if (slash != -1)
+         {
+             // IP subnet
+
+             ip_addr = string2arr(host.substring(0, slash));
+             ip_mask = string2arr(host.substring(slash + 1));
+             if (ip_addr.length != ip_mask.length)
+             {
+                 throw new ParseException("length of IP-address (" +
+                         ip_addr.length + ") != length of netmask (" +
+                         ip_mask.length + ")");
+             }
+         }
+         else
+         {
+             // plain address: every bit takes part in the comparison
+             ip_addr = string2arr(host);
+             ip_mask = new byte[ip_addr.length];
+             for (int idx = 0; idx < ip_mask.length; idx++)
+             {
+                 ip_mask[idx] = (byte) 255;
+             }
+         }
+     }
+     catch (NumberFormatException nfe)
+     {
+         // string2arr() hit an empty or non-numeric part, e.g. "1..2" or "/"
+         throw new ParseException("invalid IP-address or netmask: " + host);
+     }
+
+     // check if addr or subnet already exists
+
+     ip_loop :
+     for (int idx = 0; idx < non_proxy_addr_list.size(); idx++)
+     {
+         byte[] addr = (byte[]) non_proxy_addr_list.elementAt(idx);
+         byte[] mask = (byte[]) non_proxy_mask_list.elementAt(idx);
+         if (addr.length != ip_addr.length)
+         {
+             continue;
+         }
+
+         for (int idx2 = 0; idx2 < addr.length; idx2++)
+         {
+             if ((ip_addr[idx2] & mask[idx2]) != (addr[idx2] & mask[idx2]) ||
+                 (mask[idx2] != ip_mask[idx2]))
+             {
+                 continue ip_loop;
+             }
+         }
+
+         // already exists
+         return;
+     }
+     non_proxy_addr_list.addElement(ip_addr);
+     non_proxy_mask_list.addElement(ip_mask);
+ }
+
+
+ /**
+  * Convenience method to add a number of hosts at once. If any one host is
+  * null, blank, or cannot be parsed it is ignored.
+  *
+  * @param hosts The list of hosts to set; may be null or empty, in which
+  * case nothing happens
+  * @see #dontProxyFor(java.lang.String)
+  * @since V0.3-2
+  */
+ public static void dontProxyFor(String[] hosts)
+ {
+     if (hosts == null || hosts.length == 0)
+     {
+         return;
+     }
+
+     for (int idx = 0; idx < hosts.length; idx++)
+     {
+         // null and blank entries name no host; skipping them here keeps
+         // an empty list element from aborting the whole call
+         if (hosts[idx] == null || hosts[idx].trim().length() == 0)
+         {
+             continue;
+         }
+
+         try
+         {
+             dontProxyFor(hosts[idx]);
+         }
+         catch (ParseException pe)
+         {
+             // ignore it
+         }
+         catch (NumberFormatException nfe)
+         {
+             // an IP-address part was not a number: also "cannot be
+             // parsed", so ignore it as documented
+         }
+     }
+ }
+
+
+ /**
+  * Remove host from the list of hosts for which the proxy should
+  * not be used. This modifies the same list that dontProxyFor()
+  * uses, i.e. this is used to undo a dontProxyFor() setting.
+  * The syntax for host is specified in dontProxyFor().
+  *
+  * @param host a host name, domain name, IP-address or
+  * IP-subnet; must not be null.
+  * @return true if the remove was successful, false
+  * otherwise
+  * @exception ParseException if host is empty after trimming, if an
+  * IP-address or netmask contains a part that is not a number, or if the
+  * length of the netmask does not match the length of the IP-address
+  * @see #dontProxyFor(java.lang.String)
+  */
+ public static boolean doProxyFor(String host)
+         throws ParseException
+ {
+     host = host.trim().toLowerCase();
+
+     // An empty entry cannot be classified. Report it through the declared
+     // exception instead of letting charAt(0) below fail with a
+     // StringIndexOutOfBoundsException.
+     if (host.length() == 0)
+     {
+         throw new ParseException("host is empty");
+     }
+
+     // check for domain name
+
+     if (host.charAt(0) == '.')
+     {
+         return non_proxy_dom_list.removeElement(host);
+     }
+
+     // check for host name: anything containing a character other than a
+     // digit, '.' or '/' is taken to be a name
+
+     for (int idx = 0; idx < host.length(); idx++)
+     {
+         if (!Character.isDigit(host.charAt(idx)) &&
+             host.charAt(idx) != '.' && host.charAt(idx) != '/')
+         {
+             return (non_proxy_host_list.remove(host) != null);
+         }
+     }
+
+     // must be an IP-address
+
+     byte[] ip_addr;
+     byte[] ip_mask;
+     try
+     {
+         int slash = host.indexOf('/');
+         if (slash != -1)
+         {
+             // IP subnet
+
+             ip_addr = string2arr(host.substring(0, slash));
+             ip_mask = string2arr(host.substring(slash + 1));
+             if (ip_addr.length != ip_mask.length)
+             {
+                 throw new ParseException("length of IP-address (" +
+                         ip_addr.length + ") != length of netmask (" +
+                         ip_mask.length + ")");
+             }
+         }
+         else
+         {
+             // plain address: every bit takes part in the comparison
+             ip_addr = string2arr(host);
+             ip_mask = new byte[ip_addr.length];
+             for (int idx = 0; idx < ip_mask.length; idx++)
+             {
+                 ip_mask[idx] = (byte) 255;
+             }
+         }
+     }
+     catch (NumberFormatException nfe)
+     {
+         // string2arr() hit an empty or non-numeric part, e.g. "1..2" or "/"
+         throw new ParseException("invalid IP-address or netmask: " + host);
+     }
+
+     // look for an entry with the same masked address and the same mask
+
+     ip_loop :
+     for (int idx = 0; idx < non_proxy_addr_list.size(); idx++)
+     {
+         byte[] addr = (byte[]) non_proxy_addr_list.elementAt(idx);
+         byte[] mask = (byte[]) non_proxy_mask_list.elementAt(idx);
+         if (addr.length != ip_addr.length)
+         {
+             continue;
+         }
+
+         for (int idx2 = 0; idx2 < addr.length; idx2++)
+         {
+             if ((ip_addr[idx2] & mask[idx2]) != (addr[idx2] & mask[idx2]) ||
+                 (mask[idx2] != ip_mask[idx2]))
+             {
+                 continue ip_loop;
+             }
+         }
+
+         // address and mask are stored at the same index in the two lists
+         non_proxy_addr_list.removeElementAt(idx);
+         non_proxy_mask_list.removeElementAt(idx);
+         return true;
+     }
+     return false;
+ }
+
+
+ /**
+  * Converts a dotted-decimal string into its numeric parts (e.g.
+  * "12.34.56.78" becomes { 12, 34, 56, 78 }). The result has one element
+  * more than the string has dots. Each part is parsed with
+  * Integer.parseInt() and narrowed to a byte, so values above 127 come out
+  * negative.
+  *
+  * @param ip IP-address
+  * @return IP-address in network byte order
+  */
+ private static byte[] string2arr(String ip)
+ {
+     // first pass: count the separators to size the result
+     int dots = 0;
+     for (int at = ip.indexOf('.'); at != -1; at = ip.indexOf('.', at + 1))
+     {
+         dots++;
+     }
+     byte[] octets = new byte[dots + 1];
+
+     // second pass: parse everything between two separators
+     int slot = 0;
+     int start = 0;
+     int dot;
+     while ((dot = ip.indexOf('.', start)) != -1)
+     {
+         octets[slot++] = (byte) Integer.parseInt(ip.substring(start, dot));
+         start = dot + 1;
+     }
+
+     // whatever follows the last separator is the final part
+     octets[slot] = (byte) Integer.parseInt(ip.substring(start));
+
+     return octets;
+ }
+
+
+ /**
+ * Sets the SOCKS server to use. The server will only be used for new
+ * HTTPConnections created after this call and will not affect current
+ * instances of HTTPConnection. A null or empty string host parameter
+ * disables SOCKS.
+  *
+  * The code will try to determine the SOCKS version to use at connection
+  * time. This might fail for a number of reasons, however, in which case you
+  * must specify the version explicitly.
+  *
+  * @param host the host on which the proxy server resides. The port used is
+  * the default port 1080.
+  * @see #setSocksServer(java.lang.String, int, int)
+  */
+ public static void setSocksServer(String host)
+ {
+     // delegate to the two-argument variant with the default SOCKS port
+     setSocksServer(host, 1080);
+ }
+
+
+ /**
+  * Sets the SOCKS server to use. The server will only be used for new
+  * HTTPConnections created after this call and will not affect current
+  * instances of HTTPConnection. A null or empty string host parameter
+  * disables SOCKS.
+ * + * The code will try to determine the SOCKS version to use at connection + * time. This might fail for a number of reasons, however, in which case you + * must specify the version explicitly. + * + * @param host the host on which the proxy server resides. + * @param port the port the proxy server is listening on. + * @see #setSocksServer(java.lang.String, int, int) + */ + public static void setSocksServer(String host, int port) + { + if (port <= 0) + { + port = 1080; + } + + if (host == null || host.length() == 0) + { + Default_Socks_client = null; + } + else + { + Default_Socks_client = new SocksClient(host, port); + } + } + + + /** + * Sets the SOCKS server to use. The server will only be used for new + * HTTPConnections created after this call and will not affect currrent + * instances of HTTPConnection. A null or empty string host parameter + * disables SOCKS.
+ *
+ * In an application or using the Appletviewer an alternative to this method
+ * is to set the following properties (either in the properties file or on
+ * the command line): HTTPClient.socksHost, HTTPClient.socksPort and
+ * HTTPClient.socksVersion. Whether HTTPClient.socksHost is set or not
+ * determines whether a SOCKS server is used; if HTTPClient.socksPort is not
+ * set it defaults to 1080; if HTTPClient.socksVersion is not set an
+ * attempt will be made to automatically determine the version used by the
+ * server.
+ *
+ * Note: If you have also set a proxy server then a connection will be made
+ * to the SOCKS server, which in turn then makes a connection to the proxy
+ * server (possibly via other SOCKS servers), which in turn makes the final
+ * connection.
+ *
+ * If the proxy server is running SOCKS version 5 and requires
+ * username/password authorization, and you wish to set this authorization
+ * information in the code, then you may use the
+ * AuthorizationInfo.addAuthorization() method to do so. Specify the
+ * same host and port as in this method, give the scheme "SOCKS5" and the
+ * realm "USER/PASS", set the cookie to null and the params to an array
+ * containing a single NVPair in turn containing the username and
+ * password. Example:
+ * NVPair[] up = { new NVPair(username, password) }; + * AuthorizationInfo.addAuthorization(host, port, "SOCKS5", "USER/PASS", + * null, up); + *If you have not given any authorization info and the proxy server + * requires authorization then you will be prompted for the necessary info + * via a popup the first time you do a request. + * + * @param host the host on which the proxy server resides. + * @param port the port the proxy server is listening on. + * @param version the SOCKS version the server is running. + * Currently this must be '4' or '5'. + * @exception SocksException If version is not '4' or '5'. + */ + public static void setSocksServer(String host, int port, int version) + throws SocksException + { + if (port <= 0) + { + port = 1080; + } + + if (host == null || host.length() == 0) + { + Default_Socks_client = null; + } + else + { + Default_Socks_client = new SocksClient(host, port, version); + } + } + + + /** + * Removes the #... part. Returns the stripped name, or "" if either the + * file is null or is the empty string (after stripping). + * + * @param file the name to strip + * @return the stripped name + */ + private final String stripRef(String file) + { + if (file == null) + { + return ""; + } + + int hash = file.indexOf('#'); + if (hash != -1) + { + file = file.substring(0, hash); + } + + return file.trim(); + } + + + // private helper methods + + /** + * Sets up the request, creating the list of headers to send and creating + * instances of the modules. This may be invoked by subclasses which add + * further methods (such as those from DAV and IPP). + * + * @param method GET, POST, etc. + * @param resource the resource + * @param headers an array of headers to be used + * @param entity the entity (or null) + * @param stream the output stream (or null) - only one of + * stream and entity may be non-null + * @return the response. + * @exception ModuleException if an exception is encountered in any module. 
+  * @exception IOException if handleRequest() throws it
+  */
+ protected final HTTPResponse setupRequest(String method, String resource,
+                                           NVPair[] headers, byte[] entity,
+                                           HttpOutputStream stream)
+         throws IOException, ModuleException
+ {
+     Request req = new Request(this, method, resource,
+                               mergedHeaders(headers), entity, stream,
+                               allowUI);
+     // the request stays on RequestList only while it is being handled
+     RequestList.addToEnd(req);
+
+     try
+     {
+         HTTPResponse resp = new HTTPResponse(gen_mod_insts(), Timeout, req, defaultIncrement);
+         handleRequest(req, resp, null, true);
+         return resp;
+     }
+     finally
+     {
+         RequestList.remove(req);
+     }
+ }
+
+
+ /**
+  * This merges built-in default headers, user-specified default headers, and
+  * method-specified headers. Method-specified take precedence over user
+  * defaults, which take precedence over built-in defaults. The following
+  * headers are removed if found: "Content-length".
+  *
+  * @param spec the headers specified in the call to the method
+  * @return an array consisting of merged headers.
+  */
+ private NVPair[] mergedHeaders(NVPair[] spec)
+ {
+     int spec_len = (spec != null ? spec.length : 0);
+     int
+         defs_len;
+     NVPair[] merged;
+
+     // NOTE(review): DefaultHeaders is used as the lock here, so the null
+     // test inside the block can only matter if it is never null - confirm
+     synchronized (DefaultHeaders)
+     {
+         defs_len = (DefaultHeaders != null ? DefaultHeaders.length : 0);
+         merged = new NVPair[spec_len + defs_len];
+
+         // copy default headers
+         System.arraycopy(DefaultHeaders, 0, merged, 0, defs_len);
+     }
+
+     // merge in selected headers
+     int sidx;
+
+     // didx is the number of slots of merged[] that are in use
+     int didx = defs_len;
+     for (sidx = 0; sidx < spec_len; sidx++)
+     {
+         if (spec[sidx] == null)
+         {
+             continue;
+         }
+
+         String s_name = spec[sidx].getName().trim();
+         if (s_name.equalsIgnoreCase("Content-length"))
+         {
+             continue;
+         }
+
+         // look for a header of the same name (ignoring case) to overwrite
+         int search;
+         for (search = 0; search < didx; search++)
+         {
+             if (merged[search].getName().trim().equalsIgnoreCase(s_name))
+             {
+                 break;
+             }
+         }
+
+         // search == didx means "not found": the header is appended
+         merged[search] = spec[sidx];
+         if (search == didx)
+         {
+             didx++;
+         }
+     }
+
+     // trim the slots left unused by skipped or overwriting headers
+     if (didx < merged.length)
+     {
+         merged = Util.resizeArray(merged, didx);
+     }
+
+     return merged;
+ }
+
+
+ /**
+  * Generate an array of instances of the current modules.
+  *
+  * @return one new instance per class in ModuleList, in list order
+  */
+ private HTTPClientModule[] gen_mod_insts()
+ {
+     synchronized (ModuleList)
+     {
+         HTTPClientModule[] mod_insts =
+             new HTTPClientModule[ModuleList.size()];
+
+         for (int idx = 0; idx < ModuleList.size(); idx++)
+         {
+             Class mod = (Class) ModuleList.elementAt(idx);
+             try
+             {
+                 mod_insts[idx] = (HTTPClientModule) mod.newInstance();
+             }
+             catch (Exception e)
+             {
+                 throw new Error("HTTPClient Internal Error: could not " +
+                                 "create instance of " + mod.getName() +
+                                 " -\n" + e);
+             }
+         }
+
+         return mod_insts;
+     }
+ }
+
+
+ /**
+  * handles the Request. First the request handler for each module is
+  * invoked, and then if no response was generated the request is sent.
+  *
+  * @param req the Request
+  * @param http_resp the HTTPResponse
+  * @param resp the Response
+  * @param usemodules if false then skip module loop
+  * @exception IOException if any module or sendRequest throws it
+  * @exception ModuleException if any module throws it
+  */
+ void handleRequest(Request req, HTTPResponse http_resp, Response resp,
+                    boolean usemodules)
+         throws IOException, ModuleException
+ {
+     // one-element array so that a module can hand back a response
+     Response[] rsp_arr = {resp};
+     HTTPClientModule[] modules = http_resp.getModules();
+
+     // invoke requestHandler for each module
+
+     if (usemodules)
+     {
+         doModules :
+         for (int idx = 0; idx < modules.length; idx++)
+         {
+             int sts = modules[idx].requestHandler(req, rsp_arr);
+             switch (sts)
+             {
+                 case REQ_CONTINUE:
+                     // continue processing
+                     break;
+                 case REQ_RESTART:
+                     // restart processing with first module; the loop
+                     // increment turns the -1 into 0
+                     idx = -1;
+                     continue doModules;
+                 case REQ_SHORTCIRC:
+                     // stop processing and send
+                     break doModules;
+                 case REQ_RESPONSE:
+                     // go to phase 2 (falls through; told apart below)
+                 case REQ_RETURN:
+                     // return response immediately
+                     if (rsp_arr[0] == null)
+                     {
+                         throw new Error("HTTPClient Internal Error: no " +
+                                         "response returned by module " +
+                                         modules[idx].getClass().getName());
+                     }
+                     http_resp.set(req, rsp_arr[0]);
+                     if (req.getStream() != null)
+                     {
+                         req.getStream().ignoreData(req);
+                     }
+                     if (req.internal_subrequest)
+                     {
+                         return;
+                     }
+                     if (sts == REQ_RESPONSE)
+                     {
+                         http_resp.handleResponse();
+                     }
+                     else
+                     {
+                         http_resp.init(rsp_arr[0]);
+                     }
+                     return;
+                 case REQ_NEWCON_RST:
+                     // new connection
+                     if (req.internal_subrequest)
+                     {
+                         return;
+                     }
+                     req.getConnection().
+                         handleRequest(req, http_resp, rsp_arr[0], true);
+                     return;
+                 case REQ_NEWCON_SND:
+                     // new connection, send immediately
+                     if (req.internal_subrequest)
+                     {
+                         return;
+                     }
+                     req.getConnection().
+                         handleRequest(req, http_resp, rsp_arr[0], false);
+                     return;
+                 default:
+                     // not valid
+                     throw new Error("HTTPClient Internal Error: invalid status" +
+                                     " " + sts + " returned by module " +
+                                     modules[idx].getClass().getName());
+             }
+         }
+     }
+
+     // internal subrequests are not sent from here
+     if (req.internal_subrequest)
+     {
+         return;
+     }
+
+     // Send the request across the wire
+
+     if (req.getStream() != null && req.getStream().getLength() == -1)
+     {
+         // output stream whose getLength() is -1
+         if (!ServProtVersKnown || ServerProtocolVersion < HTTP_1_1 ||
+             no_chunked)
+         {
+             req.getStream().goAhead(req, null, http_resp.getTimeout());
+             http_resp.set(req, req.getStream());
+         }
+         else
+         {
+             // add Transfer-Encoding header if necessary
+             int idx;
+             NVPair[] hdrs = req.getHeaders();
+             for (idx = 0; idx < hdrs.length; idx++)
+             {
+                 if (hdrs[idx].getName().equalsIgnoreCase("Transfer-Encoding"))
+                 {
+                     break;
+                 }
+             }
+
+             if (idx == hdrs.length)
+             {
+                 // no such header yet: append one
+                 hdrs = Util.resizeArray(hdrs, idx + 1);
+                 hdrs[idx] = new NVPair("Transfer-Encoding", "chunked");
+                 req.setHeaders(hdrs);
+             }
+             else
+             {
+                 // header present: add the "chunked" token if it is missing
+                 String v = hdrs[idx].getValue();
+                 try
+                 {
+                     if (!Util.hasToken(v, "chunked"))
+                     {
+                         hdrs[idx] = new NVPair("Transfer-Encoding",
+                                                v + ", chunked");
+                     }
+                 }
+                 catch (ParseException pe)
+                 {
+                     throw new IOException(pe.toString());
+                 }
+             }
+
+             http_resp.set(req, sendRequest(req, http_resp.getTimeout()));
+         }
+     }
+     else
+     {
+         http_resp.set(req, sendRequest(req, http_resp.getTimeout()));
+     }
+
+     if (req.aborted)
+     {
+         throw new IOException("Request aborted by user");
+     }
+ }
+
+
+ /**
+  * These mark the response to stall the next request on, if any
+  */
+ private volatile Response early_stall = null;
+ private volatile Response late_stall = null;
+ private volatile Response prev_resp = null;
+ /**
+  * This marks the socket output stream as still being used
+  */
+ private boolean output_finished = true;
+
+
+ /**
+  * sends the request over the line.
+ * + * @param req the request + * @param con_timeout the timeout to use when establishing a socket + * connection; an InterruptedIOException is thrown if the procedure + * times out. + * @return Description of the Return Value + * @exception IOException if thrown by the socket + * @exception ModuleException if any module throws it during the SSL- + * tunneling handshake + */ + Response sendRequest(Request req, int con_timeout) + throws IOException, ModuleException + { + ByteArrayOutputStream hdr_buf = new ByteArrayOutputStream(600); + Response resp = null; + boolean keep_alive; + + // The very first request is special in that we need its response + // before any further requests may be made. This is to set things + // like the server version. + + if (early_stall != null) + { + try + { + Log.write(Log.CONN, "Conn: Early-stalling Request: " + + req.getMethod() + " " + + req.getRequestURI()); + + synchronized (early_stall) + { + // wait till the response is received + try + { + early_stall.getVersion(); + } + catch (IOException ioe) + { + } + early_stall = null; + } + } + catch (NullPointerException npe) + { + } + } + + String[] con_hdrs = assembleHeaders(req, hdr_buf); + + // determine if the connection should be kept alive after this + // request + + try + { + if (ServerProtocolVersion >= HTTP_1_1 && + !Util.hasToken(con_hdrs[0], "close") + || + ServerProtocolVersion == HTTP_1_0 && + Util.hasToken(con_hdrs[0], "keep-alive") + ) + { + keep_alive = true; + } + else + { + keep_alive = false; + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + + synchronized (this) + { + // Sometimes we must stall the pipeline until the previous request + // has been answered. However, if we are going to open up a new + // connection anyway we don't really need to stall. 
+ + if (late_stall != null) + { + if (input_demux != null || keepAliveUnknown) + { + Log.write(Log.CONN, "Conn: Stalling Request: " + + req.getMethod() + " " + req.getRequestURI()); + + try + { + // wait till the response is received + + late_stall.getVersion(); + if (keepAliveUnknown) + { + determineKeepAlive(late_stall); + } + } + catch (IOException ioe) + { + } + } + + late_stall = null; + } + + /* + * POSTs must not be pipelined because of problems if the connection + * is aborted. Since it is generally impossible to know what urls + * POST will influence it is impossible to determine if a sequence + * of requests containing a POST is idempotent. + * Also, for retried requests we don't want to pipeline either. + */ + if ((req.getMethod().equals("POST") || req.dont_pipeline) && + prev_resp != null && input_demux != null) + { + Log.write(Log.CONN, "Conn: Stalling Request: " + + req.getMethod() + " " + req.getRequestURI()); + + try + { + // wait till the response is received + prev_resp.getVersion(); + } + catch (IOException ioe) + { + } + } + + // If the previous request used an output stream, then wait till + // all the data has been written + + if (!output_finished) + { + try + { + wait(); + } + catch (InterruptedException ie) + { + throw new IOException(ie.toString()); + } + } + + if (req.aborted) + { + throw new IOException("Request aborted by user"); + } + + int try_count = 3; + /* + * what a hack! This is to handle the case where the server closes + * the connection but we don't realize it until we try to send + * something. The problem is that we only get IOException, but + * we need a finer specification (i.e. whether it's an EPIPE or + * something else); I don't trust relying on the message part + * of IOException (which on SunOS/Solaris gives 'Broken pipe', + * but what on Windoze/Mac?). 
+ */ + while (try_count-- > 0) + { + try + { + // get a client socket + + Socket sock; + if (input_demux == null || + (sock = input_demux.getSocket()) == null) + { + sock = getSocket(con_timeout); + + if (Protocol == HTTPS) + { + if (Proxy_Host != null) + { + Socket[] sarr = {sock}; + resp = enableSSLTunneling(sarr, req, con_timeout); + if (resp != null) + { + resp.final_resp = true; + return resp; + } + sock = sarr[0]; + } + + sock.setSoTimeout(con_timeout); + //sock = new SSLSocket(sock); + } + + input_demux = new StreamDemultiplexor(Protocol, sock, this); + DemuxList.addToEnd(input_demux); + keepAliveReqLeft = keepAliveReqMax; + } + + if (req.aborted) + { + throw new IOException("Request aborted by user"); + } + + Log.write(Log.CONN, "Conn: Sending Request: ", hdr_buf); + + // Send headers + + OutputStream sock_out = sock.getOutputStream(); + if (haveMSLargeWritesBug) + { + sock_out = new MSLargeWritesBugStream(sock_out); + } + + hdr_buf.writeTo(sock_out); + + // Wait for "100 Continue" status if necessary + + try + { + if (ServProtVersKnown && + ServerProtocolVersion >= HTTP_1_1 && + Util.hasToken(con_hdrs[1], "100-continue")) + { + resp = new Response(req, (Proxy_Host != null && Protocol != HTTPS), input_demux); + resp.timeout = 60; + if (resp.getContinue() != 100) + { + break; + } + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + catch (InterruptedIOException iioe) + { + } + finally + { + if (resp != null) + { + resp.timeout = 0; + } + } + + // POST/PUT data + + if (req.getData() != null && req.getData().length > 0) + { + if (req.delay_entity > 0) + { + // wait for something on the network; check available() + // roughly every 100 ms + + long num_units = req.delay_entity / 100; + long one_unit = req.delay_entity / num_units; + + for (int idx = 0; idx < num_units; idx++) + { + if (input_demux.available(null) != 0) + { + break; + } + try + { + Thread.sleep(one_unit); + } + catch (InterruptedException ie) + { + } + } + + if 
(input_demux.available(null) == 0) + { + sock_out.write(req.getData()); + } + // he's still waiting + else + { + keep_alive = false; + } + // Uh oh! + } + else + { + sock_out.write(req.getData()); + } + } + + if (req.getStream() != null) + { + req.getStream().goAhead(req, sock_out, 0); + } + else + { + sock_out.flush(); + } + + // get a new response. + // Note: this does not do a read on the socket. + + if (resp == null) + { + resp = new Response(req, (Proxy_Host != null && + Protocol != HTTPS), + input_demux); + } + } + catch (IOException ioe) + { + Log.write(Log.CONN, "Conn: ", ioe); + + closeDemux(ioe, true); + + if (try_count == 0 || ioe instanceof UnknownHostException || + ioe instanceof ConnectException || + ioe instanceof NoRouteToHostException || + ioe instanceof InterruptedIOException || req.aborted) + { + throw ioe; + } + + Log.write(Log.CONN, "Conn: Retrying request"); + continue; + } + + break; + } + + prev_resp = resp; + + // close the stream after this response if necessary + + if ((!keepAliveUnknown && !doesKeepAlive) || !keep_alive || + (keepAliveReqMax != -1 && keepAliveReqLeft-- == 0)) + { + input_demux.markForClose(resp); + input_demux = null; + } + else + { + input_demux.restartTimer(); + } + + if (keepAliveReqMax != -1) + { + Log.write(Log.CONN, "Conn: Number of requests left: " + + keepAliveReqLeft); + } + + /* + * We don't pipeline the first request, as we need some info + * about the server (such as which http version it complies with) + */ + if (!ServProtVersKnown) + { + early_stall = resp; + resp.markAsFirstResponse(req); + } + + /* + * Also don't pipeline until we know if the server supports + * keep-alive's or not. + * Note: strictly speaking, HTTP/1.0 keep-alives don't mean we can + * pipeline requests. I seem to remember some (beta?) version + * of Netscape's Enterprise server which barfed if you tried + * push requests down it's throat w/o waiting for the previous + * response first. 
However, I've not been able to find such a + * server lately, and so I'm taking the risk and assuming we + * can in fact pipeline requests to HTTP/1.0 servers. + */ + if (keepAliveUnknown || + // We don't pipeline POST's ... + !IdempotentSequence.methodIsIdempotent(req.getMethod()) || + req.dont_pipeline || + // Retries disable pipelining too + neverPipeline) + { + // Emergency measure: prevent all pipelining + late_stall = resp; + } + + /* + * If there is an output stream then just tell the other threads to + * wait; the stream will notify() when it's done. If there isn't any + * stream then wake up a waiting thread (if any). + */ + if (req.getStream() != null) + { + output_finished = false; + } + else + { + output_finished = true; + notify(); + } + + // Looks like were finally done + + Log.write(Log.CONN, "Conn: Request sent"); + } + + return resp; + } + + + /** + * Gets a socket. Creates a socket to the proxy if set, or else to the + * actual destination. + * + * @param con_timeout if not 0 then start a new thread to establish the + * the connection and join(con_timeout) it. If the join() times out an + * InteruptedIOException is thrown. 
+  * @return a connected socket
+  * @exception IOException if the connection cannot be established, or an
+  * InterruptedIOException if establishing it timed out
+  */
+ private Socket getSocket(int con_timeout)
+         throws IOException
+ {
+     Socket sock = null;
+
+     String actual_host;
+     int actual_port;
+
+     // connect to the proxy if one is set, else to the destination itself
+     if (Proxy_Host != null)
+     {
+         actual_host = Proxy_Host;
+         actual_port = Proxy_Port;
+     }
+     else
+     {
+         actual_host = Host;
+         actual_port = Port;
+     }
+
+     Log.write(Log.CONN, "Conn: Creating Socket: " + actual_host + ":" +
+               actual_port);
+
+     if (con_timeout == 0)
+     {
+         // normal connection establishment
+
+         if (Socks_client != null)
+         {
+             sock = Socks_client.getSocket(actual_host, actual_port);
+         }
+         else
+         {
+             // try all A records
+             InetAddress[] addr_list = InetAddress.getAllByName(actual_host);
+             for (int idx = 0; idx < addr_list.length; idx++)
+             {
+                 try
+                 {
+                     if (LocalAddr == null)
+                     {
+                         sock = new Socket(addr_list[idx], actual_port);
+                     }
+                     else
+                     {
+                         sock = new Socket(addr_list[idx], actual_port,
+                                           LocalAddr, LocalPort);
+                     }
+                     // success
+                     break;
+                 }
+                 catch (SocketException se)
+                 {
+                     // we tried them all
+                     if (idx == addr_list.length - 1)
+                     {
+                         throw se;
+                     }
+                 }
+             }
+         }
+     }
+     else
+     {
+         // connect in a separate thread and wait at most con_timeout for it
+         EstablishConnection con =
+             new EstablishConnection(actual_host, actual_port, Socks_client);
+         con.start();
+         try
+         {
+             con.join((long) con_timeout);
+         }
+         catch (InterruptedException ie)
+         {
+         }
+
+         if (con.getException() != null)
+         {
+             throw con.getException();
+         }
+         if ((sock = con.getSocket()) == null)
+         {
+             con.forget();
+             // the socket is read once more after forget(); only if it
+             // is still missing is the attempt reported as timed out
+             if ((sock = con.getSocket()) == null)
+             {
+                 throw new InterruptedIOException("Connection establishment timed out");
+             }
+         }
+     }
+
+     return sock;
+ }
+
+
+ /**
+  * Enable SSL Tunneling if we're talking to a proxy. See ietf draft
+  * draft-luotonen-ssl-tunneling-03 for more info.
+ * + * @param sock the socket + * @param req the request initiating this connection + * @param timeout the timeout + * @return the proxy's last response if unsuccessful, or + * null if tunnel successfuly established + * @exception IOException + * @exception ModuleException + */ + private Response enableSSLTunneling(Socket[] sock, Request req, int timeout) + throws IOException, ModuleException + { + // copy User-Agent and Proxy-Auth headers from request + + Vector hdrs = new Vector(); + for (int idx = 0; idx < req.getHeaders().length; idx++) + { + String name = req.getHeaders()[idx].getName(); + if (name.equalsIgnoreCase("User-Agent") || + name.equalsIgnoreCase("Proxy-Authorization")) + { + hdrs.addElement(req.getHeaders()[idx]); + } + } + + // create initial CONNECT subrequest + + NVPair[] h = new NVPair[hdrs.size()]; + hdrs.copyInto(h); + Request connect = new Request(this, "CONNECT", Host + ":" + Port, h, + null, null, req.allowUI()); + connect.internal_subrequest = true; + + ByteArrayOutputStream hdr_buf = new ByteArrayOutputStream(600); + HTTPResponse r = new HTTPResponse(gen_mod_insts(), timeout, connect, defaultIncrement); + + // send and handle CONNECT request until successful or tired + + Response resp = null; + + while (true) + { + handleRequest(connect, r, resp, true); + + hdr_buf.reset(); + assembleHeaders(connect, hdr_buf); + + Log.write(Log.CONN, "Conn: Sending SSL-Tunneling Subrequest: ", + hdr_buf); + + // send CONNECT + + hdr_buf.writeTo(sock[0].getOutputStream()); + + // return if successful + + resp = new Response(connect, sock[0].getInputStream()); + if (resp.getStatusCode() == 200) + { + return null; + } + + // failed! 
+ + // make life easy: read data and close socket + + try + { + resp.getData(); + } + catch (IOException ioe) + { + } + try + { + sock[0].close(); + } + catch (IOException ioe) + { + } + + // handle response + + r.set(connect, resp); + if (!r.handleResponse()) + { + return resp; + } + + sock[0] = getSocket(timeout); + } + } + + + /** + * This writes out the headers on the hdr_buf . It takes special + * precautions for the following headers:
+ * Use <code>getHeader("Server")</code>
+ * instead.
+ * @see #getHeader(java.lang.String)
+ */
+    public final String getServer()
+        throws IOException, ModuleException
+    {
+        // the header table is only filled in once the response has been handled
+        if (!initialized)
+        {
+            handleResponse();
+        }
+
+        final String server = getHeader("Server");
+        return server;
+    }
+
+
+    /**
+     * Returns the URI that was used in the primary request.
+     *
+     * @return    the original request URI
+     */
+    public final URI getOriginalURI()
+    {
+        return this.OriginalURI;
+    }
+
+
+    /**
+     * Returns the final URL of the document. It is set if the original
+     * request was deferred via the "moved" (301, 302, or 303) return status.
+     * The response is processed first if that has not happened yet.
+     *
+     * @return                     the effective URL, or null if no
+     *      redirection occurred
+     * @exception IOException      If any exception occurs on the socket.
+     * @exception ModuleException  if any module encounters an exception.
+     * @deprecated                 use getEffectiveURI() instead
+     * @see                        #getEffectiveURI
+     */
+    public final URL getEffectiveURL()
+        throws IOException, ModuleException
+    {
+        if (!initialized)
+        {
+            handleResponse();
+        }
+
+        // no effective URI recorded means there is no URL to report
+        return (EffectiveURI == null) ? null : EffectiveURI.toURL();
+    }
+
+
+    /**
+     * Returns the final URI of the document. If the request was redirected
+     * via the "moved" (301, 302, 303, or 307) return status this is the URI
+     * used in the last redirection; otherwise it is the original URI.
+     * The response is processed first if that has not happened yet.
+     *
+     * @return                     the effective URI
+     * @exception IOException      If any exception occurs on the socket.
+     * @exception ModuleException  if any module encounters an exception.
+     */
+    public final URI getEffectiveURI()
+        throws IOException, ModuleException
+    {
+        if (!initialized)
+        {
+            handleResponse();
+        }
+
+        // fall back to the original URI when no effective URI was recorded
+        return (EffectiveURI != null) ? EffectiveURI : OriginalURI;
+    }
+
+
+    /**
+     * Looks up the value of a response header. The response is processed
+     * first if that has not happened yet; the given name is trimmed before
+     * the lookup.
+     *
+     * @param hdr                  the header name.
+     * @return                     the value for the header, or null if
+     *      non-existent.
+     * @exception IOException      If any exception occurs on the socket.
+     * @exception ModuleException  if any module encounters an exception.
+     */
+    public String getHeader(String hdr)
+        throws IOException, ModuleException
+    {
+        if (!initialized)
+        {
+            handleResponse();
+        }
+
+        final String key = hdr.trim();
+        return (String) Headers.get(key);
+    }
+
+
+    /**
+     * Looks up a response header via getHeader() and parses its value as an
+     * int.
+     *
+     * @param hdr                        the header name.
+     * @return                           the header's value as an int
+     * @exception NumberFormatException  if the header's value is not a number
+     *      or if the header does not exist.
+     * @exception IOException            if any exception occurs on the socket.
+     * @exception ModuleException        if any module encounters an exception.
+     */
+    public int getHeaderAsInt(String hdr)
+        throws IOException, ModuleException, NumberFormatException
+    {
+        final String raw = getHeader(hdr);
+        if (raw != null)
+        {
+            return Integer.parseInt(raw);
+        }
+
+        // a missing header is reported the same way as an unparsable one
+        throw new NumberFormatException("null");
+    }
+
+
+ /**
+ * Retrieves the value for a given header. The value is parsed as a date; if
+ * this fails it is parsed as a long representing the number of seconds
+ * since 12:00 AM, Jan 1st, 1970. If this also fails an exception is thrown.
+ * getData()
+ * to force the data to be read.
+ *
+ * @param trailer the trailer name.
+ * @return the value for the trailer, or null if
+ * non-existent.
+ * @exception IOException If any exception occurs on the socket.
+ * @exception ModuleException if any module encounters an exception.
+ * @see #getData()
+ */
+    public String getTrailer(String trailer)
+        throws IOException, ModuleException
+    {
+        // fetch the trailers the first time one of them is asked for
+        if (!got_trailers)
+        {
+            getTrailers();
+        }
+
+        final String key = trailer.trim();
+        return (String) Trailers.get(key);
+    }
+
+
+    /**
+     * Looks up a trailer via getTrailer() and parses its value as an int.
+     *
+     * @param trailer                    the trailer name.
+     * @return                           the trailer's value as an int
+     * @exception NumberFormatException  if the trailer's value is not a number
+     *      or if the trailer does not exist.
+     * @exception IOException            if any exception occurs on the socket.
+     * @exception ModuleException        if any module encounters an exception.
+     */
+    public int getTrailerAsInt(String trailer)
+        throws IOException, ModuleException, NumberFormatException
+    {
+        final String raw = getTrailer(trailer);
+        if (raw != null)
+        {
+            return Integer.parseInt(raw);
+        }
+
+        // a missing trailer is reported the same way as an unparsable one
+        throw new NumberFormatException("null");
+    }
+
+
+ /**
+ * Retrieves the value for a given trailer. The value is parsed as a date;
+ * if this fails it is parsed as a long representing the number of seconds
+ * since 12:00 AM, Jan 1st, 1970. If this also fails an
+ * IllegalArgumentException is thrown. getInputStream()
had been previously invoked then this
+ * method only returns any unread data remaining on the stream and then
+ * closes it. + * + * Note to the unwary: code like
+ * System.out.println("The data: " + resp.getData()) + *will probably not do what you want - use
+ * System.out.println("The data: " + resp.getText()) + *instead. + * + * @return an array containing the data (body) returned. + * If no data was returned then it's set to a zero-length array. + * @exception IOException If any io exception occured while reading the + * data + * @exception ModuleException if any module encounters an exception. + * @see #getInputStream() + */ + + public byte[] getData() throws IOException, ModuleException + { + return getData(-1); + } + + public byte[] getData(int max) + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + + if (Data == null) + { + try + { + readResponseData(inp_stream, max); + } + catch (InterruptedIOException ie) + { + // don't intercept + throw ie; + } + catch (IOException ioe) + { + Log.write(Log.RESP, "HResp: (\"" + method + " " + + OriginalURI.getPathAndQuery() + "\")"); + Log.write(Log.RESP, " ", ioe); + + try + { + inp_stream.close(); + } + catch (Exception e) + { + } + throw ioe; + } + + inp_stream.close(); + } + + return Data; + } + + + /** + * Reads all the response data into a buffer and turns it into a string + * using the appropriate character converter. Since this uses {@link + * #getData() getData()}, the caveats of that method apply here as well. + * + * @return the body as a String. If no data was returned + * then an empty string is returned. + * @exception IOException If any io exception occured while reading the + * data, or if the content is not text + * @exception ModuleException if any module encounters an exception. 
+ * @exception ParseException if an error occured trying to parse the + * content-type header field + * @see #getData() + */ + public synchronized String getText() + throws IOException, ModuleException, ParseException + { + String ct = getHeader("Content-Type"); + if (ct == null || !ct.toLowerCase().startsWith("text/")) + { + throw new IOException("Content-Type `" + ct + "' is not a text type"); + } + + String charset = Util.getParameter("charset", ct); + if (charset == null) + { + charset = "ISO-8859-1"; + } + + return new String(getData(), charset); + } + + + /** + * Gets an input stream from which the returned data can be read. Note that + * if
+ * <code>getData()</code> had been previously invoked it will actually
+ * return a ByteArrayInputStream created from that data.
+ *
+ * @return the InputStream.
+ * @exception IOException If any exception occurs on the socket.
+ * @exception ModuleException if any module encounters an exception.
+ * @see #getData()
+ */
+ public synchronized InputStream getInputStream()
+ throws IOException, ModuleException
+ {
+ if (!initialized)
+ {
+ handleResponse();
+ }
+
+ if (Data == null)
+ {
+ return inp_stream;
+ }
+ else
+ {
+ getData();
+ // ensure complete data is read
+ return new ByteArrayInputStream(Data);
+ }
+ }
+
+
+ /**
+ * Should the request be retried by the application? If the application used
+ * an HttpOutputStream in the request then various modules (such
+ * as the redirection and authorization modules) are not able to resend the
+ * request themselves. Instead, it becomes the application's responsibility.
+ * The application can check this flag, and if it's set, resend the exact
+ * same request. The modules such as the RedirectionModule or
+ * AuthorizationModule will then recognize the resend and fix up or redirect
+ * the request as required (i.e. they defer their normal action until the
+ * resend). + * + * If the application resends the request then it must use + * the same HttpOutputStream instance. This is because the + * modules use this to recognize the retried request and to perform the + * necessary work on the request before it's sent.
+ * + * Here is a skeleton example of usage:
+ * OutputStream out = new HttpOutputStream(1234); + * do + * { + * rsp = con.Post("/cgi-bin/my_cgi", out); + * out.write(...); + * out.close(); + * } while (rsp.retryRequest()); + * + * if (rsp.getStatusCode() >= 300) + * ... + *
+ * + * Note that for this to ever return true, the java system property + * HTTPClient.deferStreamed must be set to true at the beginning of + * the application (before the HTTPConnection class is loaded). This + * prevents unwary applications from causing inadvertent memory leaks. If an + * application does set this, then it must resend any request whose + * response returns true here in order to prevent memory leaks (a switch to + * JDK 1.2 will allow us to use weak references and eliminate this problem). + * + * @return true if the request should be retried. + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public boolean retryRequest() + throws IOException, ModuleException + { + if (!initialized) + { /* response not processed yet: run the module handlers now */ + try + { + handleResponse(); + } + catch (RetryException re) + { /* handleResponse() ended with a RetryException: adopt the retry flag stored on the underlying Response */ + this.retry = response.retry; + } + } + return retry; + } + + + /** + * produces a full list of headers and their values, one per line. 
+ * + * @return a string containing the headers + */ + public String toString() + { + if (!initialized) + { + try + { + handleResponse(); + } + catch (Exception e) + { + if (!(e instanceof InterruptedIOException)) + { + Log.write(Log.RESP, "HResp: (\"" + method + " " + + OriginalURI.getPathAndQuery() + "\")"); + Log.write(Log.RESP, " ", e); + } + return "Failed to read headers: " + e; + } + } + + String nl = System.getProperty("line.separator", "\n"); + + StringBuffer str = new StringBuffer(Version); + str.append(' '); + str.append(StatusCode); + str.append(' '); + str.append(ReasonLine); + str.append(nl); + + if (EffectiveURI != null) + { + str.append("Effective-URI: "); + str.append(EffectiveURI); + str.append(nl); + } + + Enumeration hdr_list = Headers.keys(); + while (hdr_list.hasMoreElements()) + { + String hdr = (String) hdr_list.nextElement(); + str.append(hdr); + str.append(": "); + str.append(Headers.get(hdr)); + str.append(nl); + } + + return str.toString(); + } + + + // Helper Methods + + + /** + * Gets the modules attribute of the HTTPResponse object + * + * @return The modules value + */ + HTTPClientModule[] getModules() + { + return modules; + } + + + /** + * Processes a Response. This is done by calling the response handler in + * each module. When all is done, the various fields of this instance are + * intialized from the last Response. + * + * @return true if a new request was generated. This is + * used for internal subrequests only + * @exception IOException if any handler throws an IOException. + * @exception ModuleException if any module encounters an exception. 
+ */ + synchronized boolean handleResponse() + throws IOException, ModuleException + { + if (initialized) + { + return false; + } + + /* + * first get the response if necessary + */ + if (out_stream != null) + { + response = out_stream.getResponse(); + response.http_resp = this; + out_stream = null; + } + + /* + * go through modules and handle them + */ + doModules : + while (true) + { + + Phase1 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + try + { + modules[idx].responsePhase1Handler(response, request); + } + catch (RetryException re) + { + if (re.restart) + { + continue doModules; + } + else + { + throw re; + } + } + } + + Phase2 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + int sts = modules[idx].responsePhase2Handler(response, request); + switch (sts) + { + case RSP_CONTINUE: + // continue processing + break; + case RSP_RESTART: + // restart response processing + idx = -1; + continue doModules; + case RSP_SHORTCIRC: + // stop processing and return + break doModules; + case RSP_REQUEST: + // go to phase 1 + case RSP_NEWCON_REQ: + // process the request using a new con + response.getInputStream().close(); + if (handle_trailers) + { + invokeTrailerHandlers(true); + } + if (request.internal_subrequest) + { + return true; + } + request.getConnection(). + handleRequest(request, this, response, true); + if (initialized) + { + break doModules; + } + + idx = -1; + continue doModules; + case RSP_SEND: + // send the request immediately + case RSP_NEWCON_SND: + // send the request using a new con + response.getInputStream().close(); + if (handle_trailers) + { + invokeTrailerHandlers(true); + } + if (request.internal_subrequest) + { + return true; + } + request.getConnection(). 
+ handleRequest(request, this, response, false); + idx = -1; + continue doModules; + default: + // not valid + throw new Error("HTTPClient Internal Error: invalid status" + + " " + sts + " returned by module " + + modules[idx].getClass().getName()); + } + } + + Phase3 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + modules[idx].responsePhase3Handler(response, request); + } + + break doModules; + } + + /* + * force a read on the response in case none of the modules did + */ + response.getStatusCode(); + + /* + * all done, so copy data + */ + if (!request.internal_subrequest) + { + init(response); + } + + if (handle_trailers) + { + invokeTrailerHandlers(false); + } + + return false; + } + + + /** + * Copies the relevant fields from Response and marks this as initialized. + * + * @param resp the Response class to copy from + */ + void init(Response resp) + { + if (initialized) + { + return; + } + + this.StatusCode = resp.StatusCode; + this.ReasonLine = resp.ReasonLine; + this.Version = resp.Version; + this.EffectiveURI = resp.EffectiveURI; + this.ContentLength = resp.ContentLength; + this.Headers = resp.Headers; + this.inp_stream = resp.inp_stream; + this.Data = resp.Data; + this.retry = resp.retry; + initialized = true; + } + + + private boolean handle_trailers = false; + private boolean trailers_handled = false; + + + /** + * This is invoked by the RespInputStream when it is close()'d. It just + * invokes the trailer handler in each module. + * + * @param force invoke the handlers even if not initialized + * yet? 
+ * @exception IOException if thrown by any module + * @exception ModuleException if thrown by any module + */ + void invokeTrailerHandlers(boolean force) + throws IOException, ModuleException + { + if (trailers_handled) + { + return; + } + + if (!force && !initialized) + { + handle_trailers = true; + return; + } + + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + modules[idx].trailerHandler(response, request); + } + + trailers_handled = true; + } + + + /** + * Mark this request as having been aborted. It's invoked by + * HTTPConnection.stop(). + */ + void markAborted() + { + aborted = true; + } + + + /** + * Gets any trailers from the response if we haven't already done so. + * + * @exception IOException Description of the Exception + * @exception ModuleException Description of the Exception + */ + private synchronized void getTrailers() + throws IOException, ModuleException + { + if (got_trailers) + { + return; + } + if (!initialized) + { + handleResponse(); + } + + response.getTrailer("Any"); + Trailers = response.Trailers; + got_trailers = true; + + invokeTrailerHandlers(false); + } + + + /** + * Reads the response data received. Does not return until either + * Content-Length bytes have been read or EOF is reached. + * + * @param inp Description of the Parameter + * @exception IOException if any read on the input stream fails + * @inp the input stream from which to read the data + */ + private void readResponseData(InputStream inp, int max) + throws IOException, ModuleException + { + boolean readUnlimited = (max == -1); + + if (ContentLength == 0) + { + return; + } + + if (Data == null) + { + Data = new byte[0]; + } + + // read response data + + int off = Data.length; + + try + { + // check Content-length header in case CE-Module removed it + if (getHeader("Content-Length") != null) + { + int rcvd = 0; + int total = max > 1 ? 
Math.min(ContentLength, max) : ContentLength; + //System.out.println("Reading with max file size: " + total); + Data = new byte[total]; + do + { + off += rcvd; + rcvd = inp.read(Data, off, total - off); + } while (rcvd != -1 && off + rcvd < total); + // if max < ContentLength (&& max > -1): lose the rest + /*if(total < ContentLength) + { + inp.skip(ContentLength - total); + }*/ + /* + * Don't do this! + * If we do, then getData() won't work after a getInputStream() + * because we'll never get all the expected data. Instead, let + * the underlying RespInputStream throw the EOF. + * if (rcvd == -1) // premature EOF + * { + * throw new EOFException("Encountered premature EOF while " + + * "reading headers: received " + off + + * " bytes instead of the expected " + + * ContentLength + " bytes"); + * } + */ + } + else + { + //System.out.println("Reading with unknown file size"); + java.util.LinkedList blocks = new java.util.LinkedList(); + //System.out.println("new LinkedList()"); + int total = 0; + int secondBlockSize = 10*2000; + byte[] secondBlock = new byte[secondBlockSize]; + //System.out.println("new byte[" + secondBlockSize + "]"); + int offInSecondBlock = 0; + int rcvd = 0; + do + { + int bytesToRead = secondBlockSize - offInSecondBlock; + if(bytesToRead < 1) + { + // System.out.println("adding block to list..."); + blocks.addLast(secondBlock); + secondBlock = new byte[secondBlockSize]; + //System.out.println("new byte[" + secondBlockSize + "]"); + offInSecondBlock = 0; + bytesToRead = secondBlockSize; + } + rcvd = inp.read(secondBlock, offInSecondBlock, bytesToRead); + //System.out.println("read " + rcvd); + // rcvd is usually << secondBlockSize + if(rcvd != -1) + { + offInSecondBlock += rcvd; + total += rcvd; + max -= rcvd; + } + } while(rcvd != -1 && (readUnlimited || max > 0)); + + // now we have: 1 x the last block as "secondBlock" + 0...n x blocks in the list + Data = new byte[total]; // I can't see how to do it without this second buffer + 
//System.out.println("new byte[" + total + "]"); + + int offset = 0; + while(blocks.size() > 0) + { + byte[] block = (byte[]) blocks.removeFirst(); + System.arraycopy(block, 0, Data, offset, block.length); + //System.out.println("System.arraycopy(" + block.length + ")"); + offset += block.length; + } + if(offInSecondBlock > 0) + { + //System.out.println("System.arraycopy(" + offInSecondBlock + ")"); + System.arraycopy(secondBlock, 0, Data, offset, offInSecondBlock); + } + + + } + } + catch (IOException ioe) + { + Data = Util.resizeArray(Data, off); + throw ioe; + } + finally + { + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + + + + /* + * Reads the response data received. Does not return until either + * Content-Length bytes have been read or EOF is reached. + * + * @param inp Description of the Parameter + * @exception IOException if any read on the input stream fails + * @exception ModuleException Description of the Exception + * @inp the input stream from which to read the data + * + private void readResponseData(InputStream inp) + throws IOException, ModuleException + { + if (ContentLength == 0) + { + return; + } + + if (Data == null) + { + Data = new byte[0]; + } + + // read response data + + int off = Data.length; + + LinkedList blocks = new java.util.LinkedList(); + + // check Content-length header in case CE-Module removed it + if (getHeader("Content-Length") != null) + { + try + { + int rcvd = 0; + Data = new byte[ContentLength]; + + do + { + off += rcvd; + rcvd = inp.read(Data, off, ContentLength - off); + } while (rcvd != -1 && off + rcvd < ContentLength); + /* + * Don't do this! + * If we do, then getData() won't work after a getInputStream() + * because we'll never get all the expected data. Instead, let + * the underlying RespInputStream throw the EOF. 
+ * if (rcvd == -1) // premature EOF + * { + * throw new EOFException("Encountered premature EOF while " + + * "reading headers: received " + off + + * " bytes instead of the expected " + + * ContentLength + " bytes"); + * } + * + } + catch (IOException ioe) + { + Data = Util.resizeArray(Data, off); + throw ioe; + } + finally + { + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + else + { + int total = 0; + int rcvd = 0; + try + { + ByteBlock actBlock = new ByteBlock(this.readIncrement); + // TODO: Blocks are very small (500-2000 Bytes) -> combine them + while ((actBlock.length = inp.read(actBlock.block, 0, this.readIncrement)) != -1) + { + total += actBlock.length; + // System.out.println(this.getOriginalURI().toExternalForm() + ": adding block with length " + actBlock.length + " complete: " + total); + blocks.add(actBlock); + actBlock = new ByteBlock(this.readIncrement); + //off += rcvd; + // Data = Util.resizeArray(Data, off + this.readIncrement); + } + } + catch (IOException ioe) + { + throw ioe; + } + finally + { + Iterator it = blocks.iterator(); + Data = Util.resizeArray(Data, total); + off = 0; + while (it.hasNext()) + { + ByteBlock act = (ByteBlock) it.next(); + //System.out.println(this.getOriginalURI().toExternalForm() + ": copied " + act.length + " -> off: " + off + ", left: " + total); + System.arraycopy(act.block, 0, Data, off, act.length); + off += act.length; + total -= act.length; + } + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + } +*/ + + /** + * Gets the timeout attribute of the HTTPResponse object + * + * @return The timeout value + */ + int getTimeout() + { + return timeout; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java new file mode 100644 index 00000000000..b8b4d7e3a36 --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java @@ -0,0 +1,38 @@ + +/** + * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author + * @version 1.0 + */ +package de.lanlab.larm.fetcher; + +/** + * contains all global constants used in this package + */ +public class Constants +{ + + /** + * user agent string a fetcher task gives to the corresponding server + */ + public static final String USER_AGENT = "Mozilla/4.06 [en] (WinNT; I)"; + + /** + * Crawler Identification + */ + public static final String CRAWLER_AGENT = "Fetcher/0.95"; + + /** + * size of the temporary buffer to read web documents in + */ + public final static int FETCHERTASK_READSIZE = 4096; + + /** + * don't read more than... bytes + */ + public final static int FETCHERTASK_MAXFILESIZE = 2000000; + +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/DNSResolver.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/DNSResolver.java new file mode 100644 index 00000000000..a724066daff --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/DNSResolver.java @@ -0,0 +1,73 @@ + +/** + * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+
+import java.util.*;
+import java.net.*;
+
+/**
+ * Filter class: looks up the host name of every URL message via DNS and lets
+ * only those messages pass whose host can be resolved. The resolved address
+ * itself is neither cached nor attached to the message.
+ * <p>
+ * Since URLs cache their IP addresses themselves, and HTTP 1.1 needs the
+ * host names to be sent to the server, this class is not used anymore.
+ */
+public class DNSResolver implements MessageListener
+{
+
+    /**
+     * host name -> address cache. The code that filled and consulted it has
+     * been disabled, so this class never puts anything into the map; the field
+     * is kept because it is visible to the rest of the package.
+     */
+    HashMap ipCache = new HashMap();
+
+    /**
+     * the message handler this listener has been added to
+     */
+    MessageHandler messageHandler;
+
+
+    public DNSResolver()
+    {
+    }
+
+
+    /**
+     * Remembers the message handler this listener is registered with.
+     *
+     * @param m  the message handler
+     */
+    public void notifyAddedToMessageHandler(MessageHandler m)
+    {
+        this.messageHandler = m;
+    }
+
+
+    /**
+     * Checks whether the host of a URL message can be resolved. Messages that
+     * are not URLMessages are passed through without any lookup.
+     *
+     * @param message  the message to check
+     * @return         the unchanged message, or null if it is a URLMessage
+     *      whose host name is unknown
+     */
+    public Message handleRequest(Message message)
+    {
+        if (message instanceof URLMessage)
+        {
+            String host = ((URLMessage) message).getUrl().getHost();
+            try
+            {
+                // only the outcome of the lookup matters; the address is not kept
+                InetAddress.getByName(host);
+            }
+            catch (UnknownHostException e)
+            {
+                // unresolvable host: do not hand the message on
+                return null;
+            }
+        }
+        return message;
+    }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java
new file mode 100644
index 00000000000..e1ca56c2355
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java
@@ -0,0 +1,224 @@
+/*
+ * LARM - LANLab Retrieval Machine
+ *
+ * $history: $
+ *
+ */
+
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.threads.ThreadPool;
+import de.lanlab.larm.threads.ThreadPoolObserver;
+import de.lanlab.larm.threads.InterruptableTask;
+import de.lanlab.larm.storage.*;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.LinkedList;
+
+import de.lanlab.larm.fetcher.FetcherTask;
+
+/**
+ * Filter class; the Fetcher is the main class which keeps the ThreadPool that
+ * gets the documents. It should be placed at the very end of the MessageQueue,
+ * so that all filtering can be made beforehand.
+ *
+ * @author    Clemens Marschner
+ */
+public class Fetcher implements MessageListener
+{
+    /**
+     * holds the threads
+     */
+    ThreadPool fetcherPool;
+
+    /**
+     * number of URL messages handed over to the thread pool so far
+     */
+    int docsRead = 0;
+
+    /**
+     * the storage where the docs are saved to
+     */
+    DocumentStorage storage;
+
+    /**
+     * the host manager keeps track of host information
+     */
+    HostManager hostManager;
+
+    /**
+     * the message handler this fetcher has been added to; set by
+     * notifyAddedToMessageHandler()
+     */
+    MessageHandler messageHandler;
+
+
+    /**
+     * creates the fetcher with the given number of threads in the thread
+     * pool and a document storage. The storage is also registered with
+     * FetcherTask.
+     *
+     * @param maxThreads   the number of threads in the ThreadPool
+     * @param storage      the storage where all documents are stored
+     * @param hostManager  the host manager
+     */
+    public Fetcher(int maxThreads, DocumentStorage storage, HostManager hostManager)
+    {
+        this.storage = storage;
+        this.hostManager = hostManager;
+        FetcherTask.setStorage(storage);
+        fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager));
+        fetcherPool.setQueue(new FetcherTaskQueue());
+        docsRead = 0;
+    }
+
+
+    /**
+     * initializes the thread pool that was set up in the constructor
+     */
+    public void init()
+    {
+        fetcherPool.init();
+    }
+
+
+    /**
+     * initializes the thread pool and resets the document counter.
+     *
+     * @param maxThreads  currently ignored; the pool is created in the
+     *      constructor with the thread count given there
+     */
+    public void init(int maxThreads)
+    {
+        fetcherPool.init();
+        docsRead = 0;
+    }
+
+
+    /**
+     * this function will be called by the message handler each time a URL
+     * passes all filters and gets to the fetcher. From here, it will be
+     * distributed to the FetcherPool, a thread pool which carries out the task,
+     * that is to fetch the document from the web.
+     *
+     * @param message  the message, which should actually be a URLMessage
+     * @return         null if the message was a URLMessage and has been handed
+     *      to the thread pool; otherwise the unchanged message
+     */
+    public Message handleRequest(Message message)
+    {
+        if (!(message instanceof URLMessage))
+        {
+            // nothing to fetch here: hand the message back untouched instead
+            // of failing with a ClassCastException
+            return message;
+        }
+
+        fetcherPool.doTask(new FetcherTask((URLMessage) message), "");
+        docsRead++;
+
+        // eat the message
+        return null;
+    }
+
+
+    /**
+     * called by the message handler when this object is added to it. The
+     * handler is also registered with FetcherTask.
+     *
+     * @param handler  the message handler
+     */
+    public void notifyAddedToMessageHandler(MessageHandler handler)
+    {
+        this.messageHandler = handler;
+        FetcherTask.setMessageHandler(handler);
+    }
+
+
+    /**
+     * the thread pool observer will be called each time a thread changes its
+     * state, i.e. from IDLE to RUNNING, and each time the number of thread
+     * queue entries change.
+     * this just wraps the thread pool method
+     *
+     * @param t  the class that implements the ThreadPoolObserver interface
+     */
+    public void addThreadPoolObserver(ThreadPoolObserver t)
+    {
+        fetcherPool.addThreadPoolObserver(t);
+    }
+
+
+    /**
+     * returns the number of tasks queued. Should return 0 if there are any idle
+     * threads. this method just wraps the ThreadPool method
+     *
+     * @return    The queueSize value
+     */
+    public int getQueueSize()
+    {
+        return fetcherPool.getQueueSize();
+    }
+
+
+    /**
+     * get the total number of threads, computed as the sum of the idle and
+     * the busy threads reported by the ThreadPool
+     *
+     * @return    The workingThreadsCount value
+     */
+    public int getWorkingThreadsCount()
+    {
+        return fetcherPool.getIdleThreadsCount() + fetcherPool.getBusyThreadsCount();
+    }
+
+
+    /**
+     * get the number of threads that are currently idle.
+     * this method just wraps the ThreadPool method
+     *
+     * @return    The idleThreadsCount value
+     */
+    public int getIdleThreadsCount()
+    {
+        return fetcherPool.getIdleThreadsCount();
+    }
+
+
+    /**
+     * get the number of threads that are currently busy.
+     * this method just wraps the ThreadPool method
+     *
+     * @return    The busyThreadsCount value
+     */
+    public int getBusyThreadsCount()
+    {
+        return fetcherPool.getBusyThreadsCount();
+    }
+
+
+    /**
+     * Gets the threadPool attribute of the Fetcher object
+     * beware: the original object is returned
+     *
+     * @TODO      remove this / make it private if possible
+     * @return    The threadPool value
+     */
+    public ThreadPool getThreadPool()
+    {
+        return fetcherPool;
+    }
+
+
+    /**
+     * Gets the number of URL messages that have been handed to the thread
+     * pool since construction or the last call of init(int)
+     *
+     * @return    number of docs read
+     */
+    public int getDocsRead()
+    {
+        return docsRead;
+    }
+
+
+    /**
+     * returns the (original) task queue
+     *
+     * @TODO      remove this if possible
+     * @return    The taskQueue value
+     */
+    public FetcherTaskQueue getTaskQueue()
+    {
+        return (FetcherTaskQueue) this.fetcherPool.getTaskQueue();
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java
new file mode 100644
index 00000000000..43b19768245
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java
@@ -0,0 +1,150 @@
+package de.lanlab.larm.fetcher;
+
+import java.awt.event.ActionListener;
+import java.awt.event.ActionEvent;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+import java.awt.event.*;
+import de.lanlab.larm.gui.*;
+import de.lanlab.larm.threads.*;
+
+/**
+ * this was used to connect the GUI to the fetcher
+ * @TODO put this into the GUI package, probably?
+ */
+public class FetcherGUIController implements ActionListener
+{
+ FetcherMain fetcherMain;
+ FetcherSummaryFrame fetcherFrame;
+
+
+ public FetcherGUIController(FetcherMain fetcherMainPrg, FetcherSummaryFrame fetcherFrameWin, String defaultStartURL)
+ {
+ this.fetcherMain = fetcherMainPrg;
+ this.fetcherFrame = fetcherFrameWin;
+
+ fetcherFrame.setRestrictTo(fetcherMain.urlScopeFilter.getRexString());
+ fetcherFrame.setStartURL(defaultStartURL);
+
+ fetcherMain.fetcher.addThreadPoolObserver(
+ new ThreadPoolObserver()
+ {
+ public void threadUpdate(int threadNr, String action, String info)
+ {
+ String status = threadNr + ": " + action + ": " + info;
+ fetcherFrame.setIdleThreadsCount(fetcherMain.fetcher.getIdleThreadsCount());
+ fetcherFrame.setBusyThreadsCount(fetcherMain.fetcher.getBusyThreadsCount());
+ fetcherFrame.setWorkingThreadsCount(fetcherMain.fetcher.getWorkingThreadsCount());
+ }
+
+ public void queueUpdate(String info, String action)
+ {
+ fetcherFrame.setRequestQueueCount(fetcherMain.fetcher.getQueueSize());
+ }
+ }
+ );
+
+ fetcherMain.monitor.addObserver(new Observer()
+ {
+ public void update(Observable o, Object arg)
+ {
+ // der ThreadMonitor wurde geupdated
+ //fetcherFrame.setStalledThreads(fetcherMain.monitor.getStalledThreadCount(10, 500.0));
+ //fetcherFrame.setBytesPerSecond(fetcherMain.monitor.getAverageReadCount(5));
+ // fetcherFrame.setDocsPerSecond(fetcherMain.monitor.getDocsPerSecond(5));
+ // wir nutzen die Gelegenheit, den aktuellen Speicherbestand auszugeben
+ fetcherFrame.setFreeMem(Runtime.getRuntime().freeMemory());
+ fetcherFrame.setTotalMem(Runtime.getRuntime().totalMemory());
+
+ }
+
+ });
+
+ /* fetcherMain.reFilter.addObserver(
+ new Observer()
+ {
+ public void update(Observable o, Object arg)
+ {
+ fetcherFrame.setRobotsTxtCount(fetcherMain.reFilter.getExcludingHostsCount());
+ }
+ }
+ );*/
+
+ fetcherMain.messageHandler.addMessageQueueObserver(new Observer()
+ {
+ public void update(Observable o, Object arg)
+ {
+ // a message has been added or deleted
+
+ fetcherFrame.setURLsQueued(fetcherMain.messageHandler.getQueued());
+ }
+
+ }
+ );
+
+ // this observer will be called if a filter has decided to throw a
+ // message away.
+ fetcherMain.messageHandler.addMessageProcessorObserver(new Observer()
+ {
+ public void update(Observable o, Object arg)
+ {
+ if(arg == fetcherMain.urlScopeFilter)
+ {
+ fetcherFrame.setScopeFiltered(fetcherMain.urlScopeFilter.getFiltered());
+ }
+ else if(arg == fetcherMain.urlVisitedFilter)
+ {
+ fetcherFrame.setVisitedFiltered(fetcherMain.urlVisitedFilter.getFiltered());
+ }
+ else if(arg == fetcherMain.reFilter)
+ {
+ fetcherFrame.setURLsCaughtCount(fetcherMain.reFilter.getFiltered());
+ }
+ else // it's the fetcher
+ {
+ fetcherFrame.setDocsRead(fetcherMain.fetcher.getDocsRead());
+ }
+ }
+ }
+ );
+
+ fetcherFrame.addWindowListener(
+ new WindowAdapter()
+ {
+ public void windowClosed(WindowEvent e)
+ {
+ System.out.println("window Closed");
+ System.exit(0);
+ }
+
+
+ }
+ );
+
+ fetcherFrame.addStartButtonListener((ActionListener)this);
+ }
+
+ /**
+ * will be called when the start button is pressed
+ */
+ public void actionPerformed(ActionEvent e)
+ {
+ System.out.println("Füge Start-URL ein");
+ try
+ {
+ // urlVisitedFilter.printAllURLs();
+ // urlVisitedFilter.clearHashtable();
+ fetcherMain.setRexString(fetcherFrame.getRestrictTo());
+ fetcherMain.startMonitor();
+ fetcherMain.putURL(new URL(fetcherFrame.getStartURL()), false);
+ }
+ catch(Exception ex)
+ {
+ System.out.println("actionPerformed: Exception: " + ex.getMessage());
+ }
+ }
+
+}
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
new file mode 100644
index 00000000000..2da43b08f68
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
@@ -0,0 +1,362 @@
+/*
+ * LARM - LANLab Retrieval Machine
+ *
+ * $history: $
+ *
+ */
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.threads.ThreadPoolObserver;
+import de.lanlab.larm.threads.ThreadPool;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+import de.lanlab.larm.gui.*;
+import de.lanlab.larm.util.*;
+import de.lanlab.larm.storage.*;
+import javax.swing.UIManager;
+import HTTPClient.*;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+
+/**
+ * ENTRY POINT: this class contains the main()-method of the application, does
+ * all the initializing and optionally connects the fetcher with the GUI.
+ *
+ * @author Clemens Marschner
+ * @created December 16, 2000
+ */
+public class FetcherMain
+{
+
+ /**
+ * the main message pipeline
+ */
+ protected MessageHandler messageHandler;
+
+ /**
+ * this filter records all incoming URLs and filters out everything it
+ * already knows
+ */
+ protected URLVisitedFilter urlVisitedFilter;
+
+ /**
+ * the scope filter filters out URLs that fall outside the scope given by the
+ * regular expression
+ */
+ protected URLScopeFilter urlScopeFilter;
+
+ /*
+ * The DNS resolver was supposed to hold the host addresses for all hosts;
+ * this is done by URL itself today
+ *
+ * protected DNSResolver dnsResolver;
+ */
+
+ /**
+ * the robot exclusion filter checks whether a robots.txt is present on a host
+ * before the host is first accessed
+ */
+ protected RobotExclusionFilter reFilter;
+
+ /**
+ * the host manager keeps track of all hosts and is used by the filters.
+ */
+ protected HostManager hostManager;
+
+ /**
+ * this rather flaky filter just filters out some URLs, e.g. the different
+ * views produced by the Apache DirIndex module. Has to be made
+ * configurable in the near future
+ */
+ protected KnownPathsFilter knownPathsFilter;
+
+ /**
+ * this is the main document fetcher. It contains a thread pool that fetches the
+ * documents and stores them
+ */
+ protected Fetcher fetcher;
+
+
+ /**
+ * the thread monitor once was only a monitoring tool, but has now become a
+ * vital part of the system that computes statistics and
+ * flushes the log file buffers
+ */
+
+ protected ThreadMonitor monitor;
+
+ /**
+ * the storage is a central class that puts all fetched documents somewhere.
+ * Several different implementations exist.
+ */
+ protected DocumentStorage storage;
+
+ /**
+ * the URL length filter filters out URLs that are too long, e.g. because of
+ * errors in the implementation of dynamic web sites
+ */
+ protected URLLengthFilter urlLengthFilter;
+
+ /**
+ * creates the storage, the filters, the fetcher and the thread monitor, and
+ * registers the filters and the fetcher as listeners of the message handler.
+ * The order of the addListener calls is the order in which they are registered.
+ *
+ * @param nrThreads number of fetcher threads to be created
+ */
+ public FetcherMain(int nrThreads)
+ {
+ // to make things clear, this method is commented a bit better than
+ // the rest of the program...
+
+ // this is the main message queue. handlers are registered with
+ // the queue, and whenever a message is put in it, it is passed to the
+ // filters in a "chain of responsibility" manner. Every listener can decide
+ // to throw the message away
+ messageHandler = new MessageHandler();
+
+ // the storage is the class which saves a WebDocument somewhere, no
+ // matter how it does it, whether it's in a file, in a database or
+ // whatever
+
+
+ // example for the (very slow) SQL Server storage:
+ // this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
+
+ // the LogStorage used here does extensive logging. It logs all links and
+ // document information.
+ // it also saves all documents to page files. Probably this single storage
+ // could also be replaced by a pipeline; or even incorporated into the
+ // existing message pipeline
+ SimpleLogger log = new SimpleLogger("store", false);
+ this.storage = new LogStorage(log, true, "logs/pagefile");
+
+ // a third example would be the NullStorage, which converts the documents into
+ // heat, which evaporates above the processor
+ // NullStorage();
+
+ // create the filters (they are added to the message queue further below)
+ urlScopeFilter = new URLScopeFilter();
+
+ urlVisitedFilter = new URLVisitedFilter(100000, log);
+
+ // dnsResolver = new DNSResolver();
+ hostManager = new HostManager(1000);
+
+ reFilter = new RobotExclusionFilter(hostManager);
+
+ fetcher = new Fetcher(nrThreads, storage, hostManager);
+
+ knownPathsFilter = new KnownPathsFilter();
+
+ urlLengthFilter = new URLLengthFilter(255);
+
+ // prevent message box popups
+ HTTPConnection.setDefaultAllowUserInteraction(false);
+
+ // prevent GZipped files from being decoded
+ HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
+
+ // initialize the threads
+ fetcher.init();
+
+ // the thread monitor watches the thread pool.
+
+ monitor = new ThreadMonitor(urlLengthFilter,
+ urlVisitedFilter,
+ urlScopeFilter,
+ /*dnsResolver,*/
+ reFilter,
+ messageHandler,
+ fetcher.getThreadPool(),
+ hostManager,
+ 5000 // wake up every 5 seconds
+ );
+
+
+ // add all filters to the handler; the fetcher is registered last.
+ messageHandler.addListener(urlLengthFilter);
+ messageHandler.addListener(urlScopeFilter);
+ messageHandler.addListener(reFilter);
+ messageHandler.addListener(urlVisitedFilter);
+ messageHandler.addListener(knownPathsFilter);
+ messageHandler.addListener(fetcher);
+
+ /* uncomment this to enable HTTPClient logging
+ try
+ {
+ HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
+ HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
+ }
+ catch (Exception e)
+ {
+ e.printStackTrace();
+ }
+ */
+ }
+
+
+ /**
+  * Hands a new scope expression to the URL scope filter.
+  *
+  * @param restrictTo the new regular expression for the crawl scope
+  * @throws MalformedPatternException passed on from the scope filter
+  */
+ public void setRexString(String restrictTo) throws MalformedPatternException
+ {
+     this.urlScopeFilter.setRexString(restrictTo);
+ }
+
+
+ /**
+  * Wraps the URL into a URLMessage (with a null second constructor argument)
+  * and puts it into the message handler. Any exception raised on the way is
+  * printed to stdout together with its stack trace and is not rethrown.
+  *
+  * @param url     the URL to enqueue
+  * @param isFrame passed through to the URLMessage constructor
+  * @exception java.net.MalformedURLException declared for callers; the body
+  *            itself catches every Exception
+  */
+ public void putURL(URL url, boolean isFrame)
+     throws java.net.MalformedURLException
+ {
+     try
+     {
+         URLMessage startMessage = new URLMessage(url, null, isFrame);
+         messageHandler.putMessage(startMessage);
+     }
+     catch (Exception failure)
+     {
+         System.out.println("Exception: " + failure.getMessage());
+         failure.printStackTrace();
+     }
+ }
+
+
+ /**
+  * Starts the thread monitor by calling its start() method.
+  */
+ public void startMonitor()
+ {
+     this.monitor.start();
+ }
+
+
+
+ /*
+ * the GUI is not working at this time. It was used in the very beginning, but
+ * synchronous updates turned out to slow down the program a lot, even when the
+ * GUI was turned off. Thus, a lot
+ * of Observer messages were removed later. Nonetheless, it's quite cool to see
+ * it working...
+ *
+ * @param f the FetcherMain instance the GUI controller is attached to
+ * @param startURL the start URL handed to the GUI controller
+ */
+
+ /*
+ public void initGui(FetcherMain f, String startURL)
+ {
+ // if we're on a windows platform, make it look a bit more convenient
+ try
+ {
+ UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
+ }
+ catch (Exception e)
+ {
+ // dann halt nicht...
+ }
+ System.out.println("Init FetcherFrame");
+
+ FetcherSummaryFrame fetcherFrame;
+ fetcherFrame = new FetcherSummaryFrame();
+ fetcherFrame.setSize(640, 450);
+ fetcherFrame.setVisible(true);
+ FetcherGUIController guiController = new FetcherGUIController(f, fetcherFrame, startURL);
+ }
+ */
+
+
+ /**
+ * The main program. parsed
+ *
+ * @param args The command line arguments
+ */
+ public static void main(String[] args)
+ {
+ int nrThreads = 10;
+
+ String startURL = "";
+ String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
+ boolean gui = false;
+ boolean showInfo = false;
+ System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
+ for (int i = 0; i < args.length; i++)
+ {
+ if (args[i].equals("-start"))
+ {
+ i++;
+ startURL = args[i];
+ System.out.println("Start-URL set to: " + startURL);
+ }
+ else if (args[i].equals("-restrictto"))
+ {
+ i++;
+ restrictTo = args[i];
+ System.out.println("Restricting URLs to " + restrictTo);
+ }
+ else if (args[i].equals("-threads"))
+ {
+ i++;
+ nrThreads = Integer.parseInt(args[i]);
+ System.out.println("Threads set to " + nrThreads);
+ }
+ else if (args[i].equals("-gui"))
+ {
+ gui = true;
+ }
+ else if (args[i].equals("-?"))
+ {
+ showInfo = true;
+ }
+ else
+ {
+ System.out.println("Unknown option: " + args[i] + "; use -? to get syntax");
+ System.exit(0);
+ }
+ }
+
+ //URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
+ // replaced by HTTPClient
+
+ FetcherMain f = new FetcherMain(nrThreads);
+ if (showInfo || (startURL.equals("") && gui == false))
+ {
+ System.out.println("Usage: FetcherMain -start
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.threads.ServerThread;
+import de.lanlab.larm.util.State;
+
+/**
+ * a server thread for the thread pool that records the number
+ * of bytes read and the number of tasks run
+ * mainly for statistical purposes and to keep most of the information a task needs
+ * static
+ */
+public class FetcherThread extends ServerThread
+{
+
+ long totalBytesRead = 0;
+ long totalTasksRun = 0;
+
+ HostManager hostManager;
+
+ byte[] documentBuffer = new byte[Constants.FETCHERTASK_READSIZE];
+
+ public HostManager getHostManager()
+ {
+ return hostManager;
+ }
+
+ public FetcherThread(int threadNumber, ThreadGroup threadGroup, HostManager hostManager)
+ {
+ super(threadNumber,"FetcherThread " + threadNumber, threadGroup);
+ this.hostManager = hostManager;
+ }
+
+ public static String STATE_IDLE = "Idle";
+
+ State idleState = new State(STATE_IDLE); // only set if task is finished
+
+ protected void taskReady()
+ {
+ totalBytesRead += ((FetcherTask)task).getBytesRead();
+ totalTasksRun++;
+ super.taskReady();
+ idleState.setState(STATE_IDLE);
+
+ }
+
+
+ public long getTotalBytesRead()
+ {
+ if(task != null)
+ {
+ return totalBytesRead + ((FetcherTask)task).getBytesRead();
+ }
+ else
+ {
+ return totalBytesRead;
+ }
+ }
+
+ public long getTotalTasksRun()
+ {
+ return totalTasksRun;
+ }
+
+ public byte[] getDocumentBuffer()
+ {
+ return documentBuffer;
+ }
+
+ public State getTaskState()
+ {
+ if(task != null)
+ {
+ // task could be null here
+ return ((FetcherTask)task).getTaskState();
+ }
+ else
+ {
+ return idleState.cloneState();
+ }
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java
new file mode 100644
index 00000000000..99035c24ee0
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java
@@ -0,0 +1,38 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+import de.lanlab.larm.threads.*;
+
+/**
+ * this factory simply creates fetcher threads. It's passed
+ * to the ThreadPool because the pool is creating the threads on its own
+ */
+public class FetcherThreadFactory extends ThreadFactory
+{
+
+ //static int count = 0;
+
+ ThreadGroup threadGroup = new ThreadGroup("FetcherThreads");
+
+ HostManager hostManager;
+
+ public FetcherThreadFactory(HostManager hostManager)
+ {
+ this.hostManager = hostManager;
+ }
+
+
+ public ServerThread createServerThread(int count)
+ {
+ ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
+ newThread.setPriority(4);
+ return newThread;
+ }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java
new file mode 100644
index 00000000000..0a3be1c0e7e
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java
@@ -0,0 +1,29 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+
+
+/**
+ * base class of all filter classes
+ */
+public abstract class Filter
+{
+ /**
+ * number of items filtered. augmented directly by
+ * the inheriting classes
+ */
+ protected int filtered = 0;
+
+
+ public int getFiltered()
+ {
+ return filtered;
+ }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/GZipTest.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/GZipTest.java
new file mode 100644
index 00000000000..ad6d5b3ed32
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/GZipTest.java
@@ -0,0 +1,56 @@
+package de.lanlab.larm.fetcher;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
+ * Company:
+ *
+ * @author
+ * @version 1.0
+ */
+
+import java.io.*;
+import java.util.zip.*;
+import java.net.*;
+
+/**
+ * Description of the Class
+ *
+ * @author Administrator
+ * @created 28. Januar 2002
+ */
+public class GZipTest
+{
+
+ /**
+ * Constructor for the GZipTest object
+ */
+ public GZipTest() { }
+
+
+ /**
+ * The main program for the GZipTest class
+ *
+ * @param args The command line arguments
+ */
+ public static void main(String[] args)
+ {
+ try
+ {
+ String url = "http://speechdat.phonetik.uni-muenchen.de/speechdt//speechDB/FIXED1SL/BLOCK00/SES0006/A10006O5.aif";
+
+ ByteArrayOutputStream a = new ByteArrayOutputStream(url.length());
+ GZIPOutputStream g = new GZIPOutputStream(a);
+ OutputStreamWriter o = new OutputStreamWriter(g,"ISO-8859-1");
+
+ o.write(url);
+ o.close();
+ g.finish();
+ byte[] array = a.toByteArray();
+ System.out.println("URL: " + url + " \n Length: " + url.length() + "\n zipped: " + array.length
+ );
+ }
+ catch (Exception e)
+ { e.printStackTrace();
+ }
+ }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java
new file mode 100644
index 00000000000..ff48f26f31f
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/HostInfo.java
@@ -0,0 +1,121 @@
+package de.lanlab.larm.fetcher;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
+ * Company:
+ *
+ * @author Clemens Marschner
+ * @version 1.0
+ */
+
+import java.util.HashMap;
+import java.net.*;
+import de.lanlab.larm.util.CachingQueue;
+import de.lanlab.larm.util.Queue;
+
+/**
+ * contains information about a host. If a host doesn't respond too often, it's
+ * excluded from the crawl.
+ * This class is used by the HostManager
+ *
+ * @author Clemens Marschner
+ * @created 16. Februar 2002
+ */
+public class HostInfo
+{
+ static final String[] emptyKeepOutDirectories = new String[0]; // shared "no disallows" value
+
+ int id; // set once in the constructor; also used in the name of the request queue
+ int healthyCount = 5; // five strikes, and you're out
+ boolean isReachable = true; // cleared via setReachable(false)
+ boolean robotTxtChecked = false; // set via setRobotsChecked()
+ String[] disallows; // robot exclusion; never null after construction
+ boolean isLoadingRobotsTxt = false; // true while a robots.txt load is flagged as running
+ Queue queuedRequests = null; // robot exclusion; created by setLoadingRobotsTxt(true)
+ String hostName;
+
+ /**
+  * Creates the record for one host; the disallow list starts out empty.
+  *
+  * @param hostName stored as the host name of this record
+  * @param id       stored as the id of this record
+  */
+ public HostInfo(String hostName, int id)
+ {
+     this.hostName = hostName;
+     this.id = id;
+     this.disallows = emptyKeepOutDirectories;
+ }
+
+ /**
+  * Tells whether this host is reachable and still responding.
+  *
+  * @return true while the strike counter is above zero and the host is
+  *         flagged reachable
+  */
+ public boolean isHealthy()
+ {
+     if (healthyCount <= 0)
+     {
+         return false;
+     }
+     return isReachable;
+ }
+
+ /**
+  * Records that a request to this host went wrong, whatever the reason:
+  * the strike counter is lowered by one.
+  */
+ public void badRequest()
+ {
+     healthyCount -= 1;
+ }
+
+ /**
+  * Sets the reachable flag of this host.
+  *
+  * @param reachable the new flag value
+  */
+ public void setReachable(boolean reachable)
+ {
+     this.isReachable = reachable;
+ }
+
+ /**
+  * @return the reachable flag of this host
+  */
+ public boolean isReachable()
+ {
+     return this.isReachable;
+ }
+
+ /**
+  * @return the flag set by setRobotsChecked()
+  */
+ public boolean isRobotTxtChecked()
+ {
+     return this.robotTxtChecked;
+ }
+
+ /**
+  * Returns the "robots.txt is being loaded" flag.
+  * This method does no locking itself; callers must synchronize externally.
+  *
+  * @return the loading flag
+  */
+ public boolean isLoadingRobotsTxt()
+ {
+     return isLoadingRobotsTxt;
+ }
+
+ /**
+  * Sets the "robots.txt is being loaded" flag. Switching it on also creates
+  * a fresh CachingQueue for the requests queued in the meantime; switching it
+  * off leaves the queue field untouched.
+  *
+  * @param isLoading the new flag value
+  */
+ public void setLoadingRobotsTxt(boolean isLoading)
+ {
+     this.isLoadingRobotsTxt = isLoading;
+     if (!isLoading)
+     {
+         return;
+     }
+     this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
+ }
+
+ /**
+  * Stores the outcome of the robots.txt check.
+  *
+  * @param isChecked new value of the "robots.txt checked" flag
+  * @param disallows the disallowed paths; null is replaced by the shared
+  *                  empty array
+  */
+ public void setRobotsChecked(boolean isChecked, String[] disallows)
+ {
+     this.robotTxtChecked = isChecked;
+     this.disallows = (disallows == null) ? emptyKeepOutDirectories : disallows;
+ }
+
+ public synchronized boolean isAllowed(String path)
+ {
+ // assume keepOutDirectories is pretty short
+ // assert disallows != null
+ int length = disallows.length;
+ for(int i=0; i
+ *
+ * Description:
+ *
+ * Copyright: Copyright (c)
+ *
+ * Company:
+ *
+ *
+ *
+ * @author Clemens Marschner
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.util.SimpleObservable;
+import de.lanlab.larm.util.State;
+import java.util.*;
+import java.net.*;
+import java.io.*;
+import org.apache.oro.text.perl.Perl5Util;
+import de.lanlab.larm.util.*;
+import de.lanlab.larm.threads.*;
+import HTTPClient.*;
+
+/**
+ * this factory simply creates fetcher threads. It's gonna be passed to the
+ * ThreadPool because the pool is creating the threads on its own
+ *
+ * @author Administrator
+ * @created 17. Februar 2002
+ */
+class REFThreadFactory extends ThreadFactory
+{
+
+ ThreadGroup threadGroup = new ThreadGroup("RobotExclusionFilter");
+
+
+ /**
+ * Description of the Method
+ *
+ * @param count Description of the Parameter
+ * @return Description of the Return Value
+ */
+ public ServerThread createServerThread(int count)
+ {
+ ServerThread newThread = new ServerThread(count, "REF-" + count, threadGroup);
+ newThread.setPriority(4);
+ return newThread;
+ }
+}
+
+/**
+ * the RE filter obeys the robot exclusion standard. If a new host name is supposed
+ * to be accessed, it first loads a "/robots.txt" on the given server and records the
+ * disallows stated in that file.
+ * The REFilter has a thread pool on its own to prevent the message handler from being
+ * clogged up if the server doesn't respond. Incoming messages are queued while the
+ * robots.txt is loaded.
+ * The information is stored in HostInfo records of the host manager class
+ *
+ * @author Clemens Marschner
+ * @created 17. Februar 2002
+ */
+public class RobotExclusionFilter extends Filter implements MessageListener
+{
+
+
+ protected HostManager hostManager;
+
+ protected SimpleLogger log;
+
+
+ /**
+ * Constructor for the RobotExclusionFilter object
+ *
+ * @param hm Description of the Parameter
+ */
+ public RobotExclusionFilter(HostManager hm)
+ {
+ log = new SimpleLogger("RobotExclusionFilter");
+ hostManager = hm;
+ rePool = new ThreadPool(2, new REFThreadFactory());
+ rePool.init();
+ log.setFlushAtOnce(true);
+ log.log("refilter: initialized");
+ }
+
+
+ /**
+ * called by the message handler
+ */
+ public void notifyAddedToMessageHandler(MessageHandler handler)
+ {
+ this.messageHandler = handler;
+ }
+
+
+ MessageHandler messageHandler = null;
+ ThreadPool rePool;
+
+
+ /**
+ * method that handles each URL request
+ *
+ * This method will get the robots.txt file the first time a server is
+ * requested. See the description above.
+ *
+ * @param message
+ * the (URL)Message
+ * @return
+ * the original message or NULL if this host had a disallow on that URL
+ * @link{http://info.webcrawler.com/mak/projects/robots/norobots.html})
+ */
+
+ public Message handleRequest(Message message)
+ {
+ //log.logThreadSafe("handleRequest: got message: " + message);
+ try
+ {
+ // assert message instanceof URLMessage;
+ URLMessage urlMsg = ((URLMessage) message);
+ URL url = urlMsg.getUrl();
+ //assert url != null;
+ HostInfo h = hostManager.getHostInfo(url.getHost());
+ if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
+ {
+ log.logThreadSafe("handleRequest: starting to get robots.txt");
+ // probably this results in Race Conditions here
+
+ rePool.doTask(new RobotExclusionTask(h), new Integer(h.id));
+ h.setLoadingRobotsTxt(true);
+ }
+
+ synchronized (h)
+ {
+ // isLoading...() and queuedRequest.insert() must be atomic
+ if (h.isLoadingRobotsTxt())
+ {
+
+ //log.logThreadSafe("handleRequest: other thread is loading");
+ // assert h.queuedRequests != null
+ h.queuedRequests.insert(message);
+ // not thread safe
+ log.logThreadSafe("handleRequest: queued file " + url);
+ return null;
+ }
+ }
+
+ //log.logThreadSafe("handleRequest: no thread is loading; robots.txt loaded");
+ //log.logThreadSafe("handleRequest: checking if allowed");
+ String path = url.getPath();
+ if (path == null || path.equals(""))
+ {
+ path = "/";
+ }
+
+ if (h.isAllowed(path))
+ {
+ // log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " ok");
+ return message;
+ }
+ log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " filtered");
+ this.filtered++;
+ }
+ catch (Exception e)
+ {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+
+ private static volatile NVPair headers[] = new NVPair[1];
+
+ static
+ {
+ headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
+
+ }
+
+
+ /**
+ * the task that actually loads and parses the robots.txt files
+ *
+ * @author Clemens Marschner
+ * @created 17. Februar 2002
+ */
+ class RobotExclusionTask implements InterruptableTask
+ {
+ HostInfo hostInfo;
+
+
+
+ /**
+ * Constructor for the RobotExclusionTask object
+ *
+ * @param hostInfo Description of the Parameter
+ */
+ public RobotExclusionTask(HostInfo hostInfo)
+ {
+ this.hostInfo = hostInfo;
+ }
+
+
+ /**
+ * dummy
+ *
+ * @return The info value
+ */
+ public String getInfo()
+ {
+ return "";
+ }
+
+
+ /**
+ * not used
+ */
+ public void interrupt() { }
+
+
+ /**
+ * gets a robots.txt file and adds the information to the hostInfo
+ * structure
+ *
+ * @param thread the server thread (passed by the thread pool)
+ */
+ public void run(ServerThread thread)
+ {
+ // assert hostInfo != null;
+ String threadName = Thread.currentThread().getName();
+
+ log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName);
+ //hostInfo.setLoadingRobotsTxt(true);
+ String[] disallows = null;
+ boolean errorOccured = false;
+ try
+ {
+ log.logThreadSafe("task " + threadName + ": getting connection");
+ HTTPConnection conn = new HTTPConnection(hostInfo.hostName);
+ conn.setTimeout(30000);
+ // wait at most 20 secs
+
+ HTTPResponse res = conn.Get("/robots.txt", (String) null, headers);
+ log.logThreadSafe("task " + threadName + ": got connection.");
+ if (res.getStatusCode() != 200)
+ {
+ errorOccured = true;
+ }
+ else
+ {
+
+ log.logThreadSafe("task " + threadName + ": reading");
+ byte[] file = res.getData(40000);
+ // max. 40 kb
+ log.logThreadSafe("task " + threadName + ": reading done. parsing");
+ disallows = parse(new BufferedReader(new InputStreamReader(new ByteArrayInputStream(file))));
+ log.logThreadSafe("task " + threadName + ": parsing done. found " + disallows.length + " disallows");
+ // assert disallows != null
+ // HostInfo hostInfo = hostManager.getHostInfo(this.hostName);
+ // assert hostInfo != null
+ log.logThreadSafe("task " + threadName + ": setting disallows");
+ }
+ }
+ catch (java.net.UnknownHostException e)
+ {
+ hostInfo.setReachable(false);
+ log.logThreadSafe("task " + threadName + ": unknown host. setting to unreachable");
+ errorOccured = true;
+ }
+ catch (java.net.NoRouteToHostException e)
+ {
+ hostInfo.setReachable(false);
+ log.logThreadSafe("task " + threadName + ": no route to. setting to unreachable");
+ errorOccured = true;
+ }
+ catch (java.net.ConnectException e)
+ {
+ hostInfo.setReachable(false);
+ log.logThreadSafe("task " + threadName + ": connect exception. setting to unreachable");
+ errorOccured = true;
+ }
+ catch (java.io.InterruptedIOException e)
+ {
+ // time out. fatal in this case
+ hostInfo.setReachable(false);
+ log.logThreadSafe("task " + threadName + ": time out. setting to unreachable");
+ errorOccured = true;
+ }
+
+ catch (Throwable e)
+ {
+ errorOccured = true;
+ log.log("task " + threadName + ": unknown exception: " + e.getClass().getName() + ": " + e.getMessage() + ". continuing");
+ log.log(e);
+
+ }
+ finally
+ {
+ if (errorOccured)
+ {
+ synchronized (hostInfo)
+ {
+ hostInfo.setRobotsChecked(true, null);
+ // crawl everything
+ hostInfo.setLoadingRobotsTxt(false);
+ log.logThreadSafe("task " + threadName + ": error occured");
+ log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
+ hostInfo.isLoadingRobotsTxt = false;
+ putBackURLs();
+ }
+ }
+ else
+ {
+ synchronized (hostInfo)
+ {
+ hostInfo.setRobotsChecked(true, disallows);
+ log.logThreadSafe("task " + threadName + ": done");
+ log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
+ hostInfo.isLoadingRobotsTxt = false;
+ putBackURLs();
+ }
+ }
+ }
+ }
+
+
+ /**
+ * put back queued URLs
+ */
+ private void putBackURLs()
+ {
+ while (hostInfo.queuedRequests.size() > 0)
+ {
+ messageHandler.putMessage((Message) hostInfo.queuedRequests.remove());
+ }
+ log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
+ hostInfo.queuedRequests = null;
+ }
+
+
+ /**
+ * this parses the robots.txt file. It was taken from the PERL implementation
+ * Since this is only rarely called, it's not optimized for speed
+ *
+ * @param r the robots.txt file
+ * @return the disallows
+ * @exception IOException any IOException
+ */
+ public String[] parse(BufferedReader r)
+ throws IOException
+ {
+ // taken from Perl
+ Perl5Util p = new Perl5Util();
+ String line;
+ boolean isMe = false;
+ boolean isAnon = false;
+ ArrayList disallowed = new ArrayList();
+ String ua = null;
+
+ while ((line = r.readLine()) != null)
+ {
+ if (p.match("/^#.*/", line))
+ {
+ // a comment
+ continue;
+ }
+ line = p.substitute("s/\\s*\\#.* //", line);
+ if (p.match("/^\\s*$/", line))
+ {
+ if (isMe)
+ {
+ break;
+ }
+ }
+ else if (p.match("/^User-Agent:\\s*(.*)/i", line))
+ {
+ ua = p.group(1);
+ ua = p.substitute("s/\\s+$//", ua);
+ if (isMe)
+ {
+ break;
+ }
+ else if (ua.equals("*"))
+ {
+ isAnon = true;
+ }
+ else if (Constants.CRAWLER_AGENT.startsWith(ua))
+ {
+ isMe = true;
+ }
+ }
+ else if (p.match("/^Disallow:\\s*(.*)/i", line))
+ {
+ if (ua == null)
+ {
+ isAnon = true;
+ // warn...
+ }
+ String disallow = p.group(1);
+ if (disallow != null && disallow.length() > 0)
+ {
+ // assume we have a relative path
+ ;
+ }
+ else
+ {
+ disallow = "/";
+ }
+ if (isMe || isAnon)
+ {
+ disallowed.add(disallow);
+ }
+ }
+ else
+ {
+ // warn: unexpected line
+ }
+ }
+ String[] disalloweds = new String[disallowed.size()];
+ disallowed.toArray(disalloweds);
+ return disalloweds;
+ }
+
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java
new file mode 100644
index 00000000000..140924ab81a
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java
@@ -0,0 +1,545 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.fetcher;
+
+
+import de.lanlab.larm.threads.*;
+import java.util.*;
+import java.text.*;
+import java.io.*;
+import de.lanlab.larm.util.State;
+import de.lanlab.larm.util.SimpleLoggerManager;
+
+/**
+ * This monitor takes a sample of every thread every x milliseconds
+ * and logs a lot of information. Over time it has evolved into a
+ * multi-purpose monitoring and maintenance facility.
+ * At the moment it prints status information
+ * to log files and to the console.
+ * @TODO this can be done better, probably with an agent where different services
+ * can be registered to be called every X seconds
+ */
+public class ThreadMonitor extends Observable implements Runnable
+{
+ /**
+ * a reference to the thread pool that's gonna be observed
+ */
+ private ThreadPool threadPool;
+
+
+ /**
+  * Plain value holder for one measurement taken by the monitor:
+  * a byte count, a document count and the time the sample was taken.
+  */
+ class Sample
+ {
+     long bytesRead;
+     long docsRead;
+     long time;
+
+     /**
+      * @param bytes     value stored in bytesRead
+      * @param docs      value stored in docsRead
+      * @param timestamp value stored in time
+      */
+     public Sample(long bytes, long docs, long timestamp)
+     {
+         bytesRead = bytes;
+         docsRead = docs;
+         time = timestamp;
+     }
+ }
+
+ ArrayList bytesReadPerPeriod;
+
+ /**
+ * time between two samples, in milliseconds
+ */
+ int sampleDelta;
+
+ /**
+ * the thread where this monitor runs in. Will run with high priority
+ */
+ Thread thread;
+
+
+ URLVisitedFilter urlVisitedFilter;
+ URLScopeFilter urlScopeFilter;
+// DNSResolver dnsResolver;
+ RobotExclusionFilter reFilter;
+ MessageHandler messageHandler;
+ URLLengthFilter urlLengthFilter;
+ HostManager hostManager;
+
+ public final static double KBYTE = 1024;
+ public final static double MBYTE = 1024 * KBYTE;
+ public final static double ONEGBYTE = 1024 * MBYTE;
+
+
+ String formatBytes(long lbytes)
+ {
+ double bytes = (double)lbytes;
+ if(bytes >= ONEGBYTE)
+ {
+ return fractionFormat.format((bytes/ONEGBYTE)) + " GB";
+ }
+ else if(bytes >= MBYTE)
+ {
+ return fractionFormat.format(bytes/MBYTE) + " MB";
+ }
+ else if(bytes >= KBYTE)
+ {
+ return fractionFormat.format(bytes/KBYTE) + " KB";
+ }
+ else
+ {
+ return fractionFormat.format(bytes) + " Bytes";
+ }
+
+ }
+
+ /**
+ * a logfile where status information is posted
+ * FIXME: put that in a separate class (duplicate code in FetcherTask)
+ */
+ PrintWriter logWriter;
+ private SimpleDateFormat formatter
+ = new SimpleDateFormat ("hh:mm:ss:SSSS");
+ private DecimalFormat fractionFormat = new DecimalFormat("0.00");
+
+ long startTime = System.currentTimeMillis();
+
+ /**
+  * Appends one line to the monitor log file and flushes it. The line consists of
+  * the formatted wall-clock time, the milliseconds elapsed since startTime and the
+  * given text, separated by semicolons. Any exception is reported on the console
+  * and otherwise ignored.
+  *
+  * @param text the payload of the log line
+  */
+ private void log(String text)
+ {
+     try
+     {
+         String timestamp = formatter.format(new Date());
+         long elapsed = System.currentTimeMillis()-startTime;
+         logWriter.println(timestamp + ";" + elapsed + ";" + text);
+         logWriter.flush();
+     }
+     catch(Exception e)
+     {
+         // also reached when logWriter could not be created (it is null then)
+         System.out.println("Couldn't write to logfile");
+     }
+ }
+
+ /**
+  * Constructs the monitor. It receives references to all components it reports on,
+  * creates (but does not start) the monitor thread with priority 7 and opens the
+  * log file logs/ThreadMonitor.log, creating the logs directory if necessary.
+  * If the log file cannot be opened, a message is printed and logWriter stays unset.
+  *
+  * @param urlLengthFilter  filter whose getFiltered() count is reported
+  * @param urlVisitedFilter filter whose size and filtered count are reported
+  * @param urlScopeFilter   filter whose getFiltered() count is reported
+  * @param reFilter         robot exclusion filter whose filtered count is reported
+  * @param messageHandler   handler whose queue state is reported
+  * @param threadPool       the pool to be observed
+  * @param hostManager      host manager whose size is reported
+  * @param sampleDelta      time in ms between samples
+  */
+ public ThreadMonitor(URLLengthFilter urlLengthFilter,
+         URLVisitedFilter urlVisitedFilter,
+         URLScopeFilter urlScopeFilter,
+         /*DNSResolver dnsResolver,*/
+         RobotExclusionFilter reFilter,
+         MessageHandler messageHandler,
+         ThreadPool threadPool,
+         HostManager hostManager,
+         int sampleDelta)
+ {
+     // observed components
+     this.threadPool = threadPool;
+     this.messageHandler = messageHandler;
+     this.hostManager = hostManager;
+     /* this.dnsResolver = dnsResolver;*/
+
+     // observed filters
+     this.urlLengthFilter = urlLengthFilter;
+     this.urlVisitedFilter = urlVisitedFilter;
+     this.urlScopeFilter = urlScopeFilter;
+     this.reFilter = reFilter;
+
+     // sampling state
+     this.sampleDelta = sampleDelta;
+     this.bytesReadPerPeriod = new ArrayList();
+
+     Thread monitorThread = new Thread(this, "ThreadMonitor");
+     monitorThread.setPriority(7);
+     this.thread = monitorThread;
+
+     try
+     {
+         new File("logs").mkdir();
+         FileWriter fileWriter = new FileWriter("logs/ThreadMonitor.log");
+         logWriter = new PrintWriter(new BufferedWriter(fileWriter));
+     }
+     catch(IOException e)
+     {
+         System.out.println("Couldn't create logfile (ThreadMonitor)");
+     }
+ }
+
+ /**
+ * java.lang.Threads run method. To be invoked via start().
+ * The monitor's main loop takes one sample every sampleDelta ms and returns
+ * when its sleep is interrupted. Before the loop a header line is written
+ * to the log file. On every pass the method
+ * - walks over all threads of the pool, sums up their total bytes read and
+ * tasks run and prints each thread's state to the console;
+ * - asks the pool to restart a thread whose state matches one of the
+ * FetcherTask FT_CONNECTING / FT_GETTING / FT_READING / FT_CLOSING constants
+ * and has not changed for more than 160000 ms, then stops walking because
+ * the iterator is invalid after a restart;
+ * - records a Sample, prints throughput, memory and queue/filter statistics to
+ * the console and writes one semicolon separated line to the log file;
+ * - counts consecutive passes in which no message is being processed, both
+ * queues are empty and all threads are idle; once that count exceeds 3 the
+ * loggers are flushed and the JVM is terminated with System.exit(0);
+ * - notifies the registered observers;
+ * - calls System.gc() and flushes the loggers on every 2nd pass and calls
+ * System.runFinalization() on every 6th pass.
+ * Any other exception raised during a pass is printed and the loop continues.
+ *
+ * NOTE(review): the restart check compares the State object itself
+ * (state.equals(FetcherTask.FT_...)) while the idle check compares
+ * state.getState(); unless State.equals handles this, the restart branch
+ * may never fire - confirm against State and FetcherTask.
+ * NOTE(review): the log line contains the per-thread raw byte counts between
+ * visitedStringSize and urlLengthFiltered, for which the header line has no
+ * column name.
+ * NOTE(review): the local variable startTime shadows the field of the same
+ * name; the "since start" console figures use the local one, log() the field.
+ */
+ public void run()
+ {
+ int nothingReadCount = 0;
+ long lastPeriodBytesRead = -1;
+ long monitorRunCount = 0;
+ long startTime = System.currentTimeMillis();
+ log("time;overallBytesRead;overallTasksRun;urlsQueued;urlsWaiting;isWorkingOnMessage;urlsScopeFiltered;urlsVisitedFiltered;urlsREFiltered;memUsed;memFree;totalMem;nrHosts;visitedSize;visitedStringSize;urlLengthFiltered");
+ while(true)
+ {
+ try
+ {
+ try
+ {
+ thread.sleep(sampleDelta);
+ }
+ catch(InterruptedException e)
+ {
+ return;
+ }
+
+ Iterator threadIterator = threadPool.getThreadIterator();
+ int i=0;
+ StringBuffer bytesReadString = new StringBuffer(200);
+ StringBuffer rawBytesReadString = new StringBuffer(200);
+ StringBuffer tasksRunString = new StringBuffer(200);
+ long overallBytesRead = 0;
+ long overallTasksRun = 0;
+ long now = System.currentTimeMillis();
+ boolean finished = false;
+ //System.out.print("\f");
+ /*while(!finished)
+ {
+ boolean restart = false;*/
+ boolean allThreadsIdle = true;
+ StringBuffer sb = new StringBuffer(500);
+
+ while(threadIterator.hasNext())
+ {
+ FetcherThread thread = (FetcherThread)threadIterator.next();
+ long totalBytesRead = thread.getTotalBytesRead();
+ overallBytesRead += totalBytesRead;
+ bytesReadString.append(formatBytes(totalBytesRead)).append( "; ");
+ rawBytesReadString.append(totalBytesRead).append("; ");
+ long tasksRun = thread.getTotalTasksRun();
+ overallTasksRun += tasksRun;
+ tasksRunString.append(tasksRun).append("; ");
+
+ // check task status
+ State state = thread.getTaskState();
+ //StringBuffer sb = new StringBuffer(200);
+ sb.setLength(0);
+ System.out.println(sb + "[" + thread.getThreadNumber() + "] " + state.getState() + " for " +
+ (now - state.getStateSince() ) + " ms " +
+ (state.getInfo() != null ? "(" + state.getInfo() +")" : "")
+ );
+ if(!(state.getState().equals(FetcherThread.STATE_IDLE)))
+ {
+ //if(allThreadsIdle) System.out.println("(not all threads are idle, '"+state.getState()+"' != '"+FetcherThread.STATE_IDLE+"')");
+ allThreadsIdle = false;
+ }
+ if (((state.equals(FetcherTask.FT_CONNECTING)) || (state.equals(FetcherTask.FT_GETTING)) || (state.equals(FetcherTask.FT_READING)) || (state.equals(FetcherTask.FT_CLOSING)))
+ && ((now - state.getStateSince()) > 160000))
+ {
+ System.out.println("****Restarting Thread " + thread.getThreadNumber());
+ threadPool.restartThread(thread.getThreadNumber());
+ break; // Iterator is invalid
+ }
+
+ }
+ /*if(restart)
+ {
+ continue;
+ }
+ finished = true;
+ }*/
+ /*
+ if(overallBytesRead == lastPeriodBytesRead)
+ {
+ *
+ disabled kickout feature - cm
+
+ nothingReadCount ++;
+ System.out.println("Anomaly: nothing read during the last period(s). " + (20-nothingReadCount+1) + " periods to exit");
+ if(nothingReadCount > 20) // nothing happens anymore
+ {
+ log("Ending");
+ System.out.println("End at " + new Date().toString());
+ // print some information
+ System.exit(0);
+ }
+
+
+ }
+ else
+ {
+ nothingReadCount = 0;
+ }*/
+
+ lastPeriodBytesRead = overallBytesRead;
+
+ //State reState = new State("hhh"); //reFilter.getState();
+ sb.setLength(0);
+ //System.out.println(sb + "Robot-Excl.Filter State: " + reState.getState() + " since " + (now-reState.getStateSince()) + " ms " + (reState.getInfo() != null ? " at " + reState.getInfo() : ""));
+
+ addSample(new Sample(overallBytesRead, overallTasksRun, System.currentTimeMillis()));
+ int nrHosts = ((FetcherTaskQueue)threadPool.getTaskQueue()).getNumHosts();
+ int visitedSize = urlVisitedFilter.size();
+ int visitedStringSize = urlVisitedFilter.getStringSize();
+
+ double bytesPerSecond = getAverageBytesRead();
+ double docsPerSecond = getAverageDocsRead();
+ sb.setLength(0);
+ System.out.println(sb + "\nBytes total: " + formatBytes(overallBytesRead) + " (" + formatBytes((long)(((double)overallBytesRead)*1000/(System.currentTimeMillis()-startTime))) + " per second since start)" +
+ "\nBytes per Second: " + formatBytes((int)bytesPerSecond) + " (50 secs)" +
+ "\nDocs per Second: " + docsPerSecond +
+ "\nBytes per Thread: " + bytesReadString);
+ double docsPerSecondTotal = ((double)overallTasksRun)*1000/(System.currentTimeMillis()-startTime);
+ sb.setLength(0);
+ System.out.println(sb + "Docs read total: " + overallTasksRun + " Docs/s: " + fractionFormat.format(docsPerSecondTotal) +
+ "\nDocs p.thread: " + tasksRunString);
+
+ long memUsed = Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory();
+ long memFree = Runtime.getRuntime().freeMemory();
+ long totalMem = Runtime.getRuntime().totalMemory();
+ sb.setLength(0);
+ System.out.println(sb + "Mem used: " + formatBytes(memUsed) + ", free: " + formatBytes(memFree) + " total VM: " + totalMem);
+ int urlsQueued = messageHandler.getQueued();
+ int urlsWaiting = threadPool.getQueueSize();
+ boolean isWorkingOnMessage = messageHandler.isWorkingOnMessage();
+ int urlsScopeFiltered = urlScopeFilter.getFiltered();
+ int urlsVisitedFiltered = urlVisitedFilter.getFiltered();
+ int urlsREFiltered = reFilter.getFiltered();
+ int urlLengthFiltered = urlLengthFilter.getFiltered();
+ sb.setLength(0);
+ System.out.println(sb + "URLs queued: " + urlsQueued + " waiting: " + urlsWaiting);
+ sb.setLength(0);
+ System.out.println(sb + "Message is being processed: " + isWorkingOnMessage);
+ sb.setLength(0);
+ System.out.println(sb + "URLs Filtered: length: " + urlLengthFiltered + " scope: " + urlsScopeFiltered + " visited: " + urlsVisitedFiltered + " robot.txt: " + urlsREFiltered);
+ sb.setLength(0);
+ System.out.println(sb + "Visited size: " + visitedSize + "; String Size in VisitedFilter: " + visitedStringSize + "; Number of Hosts: " + nrHosts + "; hosts in Host Manager: " + hostManager.getSize() + "\n");
+ sb.setLength(0);
+ log(sb + "" + now + ";" + overallBytesRead + ";" + overallTasksRun + ";" + urlsQueued + ";" + urlsWaiting + ";" + isWorkingOnMessage + ";" + urlsScopeFiltered + ";" + urlsVisitedFiltered + ";" + urlsREFiltered + ";" + memUsed + ";" + memFree + ";" + totalMem + ";" + nrHosts + ";" + visitedSize + ";" + visitedStringSize + ";" + rawBytesReadString + ";" + urlLengthFiltered);
+
+
+ if(!isWorkingOnMessage && (urlsQueued == 0) && (urlsWaiting == 0) && allThreadsIdle)
+ {
+ nothingReadCount++;
+ if(nothingReadCount > 3)
+ {
+ SimpleLoggerManager.getInstance().flush();
+ System.exit(0);
+ }
+
+ }
+ else
+ {
+ nothingReadCount = 0;
+ }
+
+ this.setChanged();
+ this.notifyObservers();
+
+ // Request Garbage Collection
+ monitorRunCount++;
+
+ if(monitorRunCount % 6 == 0)
+ {
+ System.runFinalization();
+ }
+
+ if(monitorRunCount % 2 == 0)
+ {
+ System.gc();
+ SimpleLoggerManager.getInstance().flush();
+ }
+
+ }
+ catch(Exception e)
+ {
+ System.out.println("Monitor: Exception: " + e.getClass().getName());
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /**
+  * Resets the monitor via clear() and then starts the monitor thread.
+  */
+ public void start()
+ {
+     clear();
+     this.thread.start();
+ }
+
+ /**
+  * Interrupts the monitor thread; run() returns when its sleep is interrupted.
+  */
+ public void interrupt()
+ {
+     final Thread monitorThread = this.thread;
+     monitorThread.interrupt();
+ }
+
+
+ /**
+  * Currently a no-op: the code that used to reset the collected samples is disabled.
+  */
+ public synchronized void clear()
+ {
+     // disabled:
+     //   sampleTimeStamps.clear();
+     //   for(int i=0; i < timeSamples.length; i++)
+     //   {
+     //       timeSamples[i].clear();
+     //   }
+ }
+
+/* public synchronized double getAverageReadCount(int maxPeriods)
+ {
+ int lastPeriod = bytesReadPerPeriod.size()-1;
+ int periods = Math.min(lastPeriod, maxPeriods);
+ if(periods < 2)
+ {
+ return 0.0;
+ }
+
+
+ long bytesLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).bytesRead;
+ long bytesBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).bytesRead;
+ long bytesRead = bytesLastPeriod - bytesBeforePeriod;
+
+ long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue();
+ long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1 - periods)).longValue();
+ long duration = endTime - startTime;
+ System.out.println("bytes read: " + bytesRead + " duration in s: " + duration/1000.0 + " = " + ((double)bytesRead) / (duration/1000.0) + " per second");
+
+ return ((double)bytesRead) / (duration/1000.0);
+ }
+*/
+
+ /*public synchronized double getDocsPerSecond(int maxPeriods)
+ {
+ int lastPeriod = bytesReadPerPeriod.size()-1;
+ int periods = Math.min(lastPeriod, maxPeriods);
+ if(periods < 2)
+ {
+ return 0.0;
+ }
+
+
+ long docsLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).docsRead;
+ long docsBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).docsRead;
+ long docsRead = docsLastPeriod - docsBeforePeriod;
+
+ long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue();
+ long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size() - periods)).longValue();
+ long duration = endTime - startTime;
+ System.out.println("docs read: " + docsRead + " duration in s: " + duration/1000.0 + " = " + ((double)docsRead) / (duration/1000.0) + " per second");
+
+ return ((double)docsRead) / (duration/1000.0);
+ }*/
+
+ /**
+ * retrieves the number of threads whose byteCount is below the threshold
+ * @param maxPeriods the number of periods to look back
+ * @param threshold the number of bytes per second that acts as the threshold for a stalled thread
+ */
+ /*public synchronized int getStalledThreadCount(int maxPeriods, double threshold)
+ {
+ int periods = Math.min(sampleTimeStamps.size(), maxPeriods);
+ int stalledThreads = 0;
+ int j=0, i=0;
+ if(periods > 1)
+ {
+ for(j=0; j
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.gui;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+
+
+ /**
+  * Summary window of the crawler. The upper panel holds the start URL, the URL
+  * restriction and the start button; below it are three columns with counters
+  * rendered as progress bars (URLs/filters, threads/queues) and plain text
+  * statistics (documents, bytes per second, memory).
+  * All setters simply push the given value into the corresponding widget.
+  */
+ public class FetcherSummaryFrame extends JFrame
+ {
+     JPanel lowerPanel = new JPanel();
+     JPanel progressPanel = new JPanel();
+     JPanel middlePanel = new JPanel();
+     JPanel rightPanel = new JPanel();
+     BorderLayout borderLayout1 = new BorderLayout();
+     JPanel propertyPanel = new JPanel();
+     JLabel hostLabel = new JLabel();
+     JLabel urlRestrictionFrame = new JLabel();
+     JTextField startURL = new JTextField();
+     JTextField restrictTo = new JTextField();
+     JButton startButton = new JButton();
+     GridLayout gridLayout1 = new GridLayout();
+     JProgressBar urlQueuedProgress = new JProgressBar(0,100);
+     JLabel urlQueuedLabel = new JLabel();
+     JLabel scopeFilteredLabel = new JLabel();
+     JProgressBar scopeFilteredProgress = new JProgressBar(0,100);
+     JLabel visitedFilteredLabel = new JLabel();
+     JProgressBar visitedFilteredProgress = new JProgressBar(0,100);
+     JLabel workingThreadsLabel = new JLabel();
+     JProgressBar workingThreadsProgress = new JProgressBar(0,100);
+     JLabel idleThreadsLabel = new JLabel();
+     JProgressBar idleThreadsProgress = new JProgressBar(0,100);
+     JLabel busyThreadsLabel = new JLabel();
+     JProgressBar busyThreadsProgress = new JProgressBar(0,100);
+     JLabel requestQueueLabel = new JLabel();
+     JProgressBar requestQueueProgress = new JProgressBar();
+     JLabel stalledThreadsLabel = new JLabel();
+     JProgressBar stalledThreadsProgress = new JProgressBar();
+     JLabel dnsLabel = new JLabel();
+     JProgressBar dnsProgress = new JProgressBar(0,100);
+     JLabel freeMemLabel = new JLabel();
+     JLabel freeMemText = new JLabel();
+     JLabel totalMemLabel = new JLabel();
+     JLabel totalMemText = new JLabel();
+     JLabel bpsLabel = new JLabel();
+     JLabel bpsText = new JLabel();
+     JLabel docsLabel = new JLabel();
+     JLabel docsText = new JLabel();
+     JLabel docsReadLabel = new JLabel();
+     JLabel docsReadText = new JLabel();
+     JProgressBar urlsCaughtProgress = new JProgressBar(0,100);
+     JLabel urlsCaughtText = new JLabel();
+     JLabel robotsTxtsText = new JLabel();
+     JProgressBar robotsTxtsProgress = new JProgressBar(0,100);
+
+     /**
+      * Builds the frame and initializes every counter bar with the painted text "0".
+      * Exceptions raised while building the UI are printed and otherwise ignored.
+      */
+     public FetcherSummaryFrame()
+     {
+         try
+         {
+             jbInit();
+             this.setTitle("LARM - LANLab Retrieval Machine");
+             this.setSize(new Dimension(640,350));
+             this.urlQueuedProgress.setStringPainted(true);
+             this.urlQueuedProgress.setString("0");
+             this.scopeFilteredProgress.setStringPainted(true);
+             this.scopeFilteredProgress.setString("0");
+             this.visitedFilteredProgress.setStringPainted(true);
+             this.visitedFilteredProgress.setString("0");
+             workingThreadsProgress.setStringPainted(true);
+             workingThreadsProgress.setString("0");
+             idleThreadsProgress.setStringPainted(true);
+             idleThreadsProgress.setString("0");
+             busyThreadsProgress.setStringPainted(true);
+             busyThreadsProgress.setString("0");
+             stalledThreadsProgress.setStringPainted(true);
+             stalledThreadsProgress.setString("0");
+             requestQueueProgress.setStringPainted(true);
+             requestQueueProgress.setString("0");
+             dnsProgress.setStringPainted(true);
+             dnsProgress.setString("0");
+             urlsCaughtProgress.setStringPainted(true);
+             urlsCaughtProgress.setString("0");
+             robotsTxtsProgress.setStringPainted(true);
+             robotsTxtsProgress.setString("0");
+         }
+         catch(Exception e)
+         {
+             e.printStackTrace();
+         }
+     }
+
+     /**
+      * Lays out all widgets (generated by the GUI builder; the order of the
+      * add() calls determines the order of the widgets inside each panel).
+      */
+     private void jbInit() throws Exception
+     {
+         this.getContentPane().setLayout(borderLayout1);
+         propertyPanel.setMinimumSize(new Dimension(10, 70));
+         propertyPanel.setPreferredSize(new Dimension(10, 80));
+         propertyPanel.setLayout(null);
+         hostLabel.setText("Startseite");
+         hostLabel.setBounds(new Rectangle(18, 15, 76, 17));
+         urlRestrictionFrame.setText("URL-Restriction (regul. Ausdruck)");
+         urlRestrictionFrame.setBounds(new Rectangle(18, 37, 208, 17));
+         startURL.setBounds(new Rectangle(224, 14, 281, 21));
+         restrictTo.setBounds(new Rectangle(224, 38, 281, 21));
+         startButton.setActionCommand("start");
+         startButton.setText("Start");
+         startButton.setBounds(new Rectangle(528, 14, 79, 47));
+         lowerPanel.setLayout(gridLayout1);
+         urlQueuedLabel.setToolTipText("");
+         urlQueuedLabel.setText("URLs queued");
+         scopeFilteredLabel.setToolTipText("");
+         scopeFilteredLabel.setText("Scope-gefiltert");
+         visitedFilteredLabel.setText("Visited gefiltert");
+         workingThreadsLabel.setText("Number of Working Threads");
+         idleThreadsLabel.setText("Idle Threads");
+         busyThreadsLabel.setText("Busy Threads");
+         requestQueueLabel.setText("requests queued");
+         stalledThreadsLabel.setText("stalled Threads");
+         stalledThreadsProgress.setPreferredSize(new Dimension(190, 25));
+         requestQueueProgress.setPreferredSize(new Dimension(190, 25));
+         busyThreadsProgress.setPreferredSize(new Dimension(190, 25));
+         idleThreadsProgress.setPreferredSize(new Dimension(190, 25));
+         workingThreadsProgress.setPreferredSize(new Dimension(190, 25));
+         urlQueuedProgress.setPreferredSize(new Dimension(190, 25));
+         scopeFilteredProgress.setPreferredSize(new Dimension(190, 25));
+         visitedFilteredProgress.setPreferredSize(new Dimension(190, 25));
+         dnsLabel.setText("DNS Hosts cached");
+         dnsProgress.setPreferredSize(new Dimension(190, 25));
+         freeMemLabel.setText("Free Mem");
+         freeMemLabel.setPreferredSize(new Dimension(60, 17));
+         freeMemText.setText("0");
+         freeMemText.setPreferredSize(new Dimension(120, 17));
+         freeMemText.setMinimumSize(new Dimension(100, 17));
+         totalMemLabel.setText("total Mem");
+         totalMemLabel.setPreferredSize(new Dimension(60, 17));
+         totalMemText.setText("0");
+         totalMemText.setPreferredSize(new Dimension(120, 17));
+         totalMemText.setMinimumSize(new Dimension(100, 17));
+         bpsLabel.setPreferredSize(new Dimension(60, 17));
+         bpsLabel.setText("Bytes/s");
+         bpsText.setMinimumSize(new Dimension(100, 17));
+         bpsText.setPreferredSize(new Dimension(120, 17));
+         bpsText.setText("0");
+         docsLabel.setText("Docs/s");
+         docsLabel.setPreferredSize(new Dimension(60, 17));
+         docsText.setText("0");
+         docsText.setPreferredSize(new Dimension(120, 17));
+         docsText.setMinimumSize(new Dimension(100, 17));
+         docsReadLabel.setText("Docs read");
+         docsReadLabel.setPreferredSize(new Dimension(60, 17));
+         docsReadText.setText("0");
+         docsReadText.setPreferredSize(new Dimension(120, 17));
+         docsReadText.setMinimumSize(new Dimension(100, 17));
+         urlsCaughtProgress.setPreferredSize(new Dimension(190, 25));
+         urlsCaughtText.setText("URLs caught by Robots.txt");
+         robotsTxtsText.setText("Robots.txts found");
+         robotsTxtsProgress.setPreferredSize(new Dimension(190, 25));
+         this.getContentPane().add(lowerPanel, BorderLayout.CENTER);
+         lowerPanel.add(progressPanel, null);
+         progressPanel.add(urlQueuedLabel, null);
+         progressPanel.add(urlQueuedProgress, null);
+         progressPanel.add(scopeFilteredLabel, null);
+         progressPanel.add(scopeFilteredProgress, null);
+         progressPanel.add(visitedFilteredLabel, null);
+         progressPanel.add(visitedFilteredProgress, null);
+         progressPanel.add(dnsLabel, null);
+         progressPanel.add(dnsProgress, null);
+         progressPanel.add(robotsTxtsText, null);
+         progressPanel.add(robotsTxtsProgress, null);
+         progressPanel.add(urlsCaughtText, null);
+         progressPanel.add(urlsCaughtProgress, null);
+         lowerPanel.add(middlePanel, null);
+         middlePanel.add(workingThreadsLabel, null);
+         middlePanel.add(workingThreadsProgress, null);
+         middlePanel.add(idleThreadsLabel, null);
+         middlePanel.add(idleThreadsProgress, null);
+         middlePanel.add(busyThreadsLabel, null);
+         middlePanel.add(busyThreadsProgress, null);
+         middlePanel.add(requestQueueLabel, null);
+         middlePanel.add(requestQueueProgress, null);
+         middlePanel.add(stalledThreadsLabel, null);
+         middlePanel.add(stalledThreadsProgress, null);
+         lowerPanel.add(rightPanel, null);
+         rightPanel.add(docsLabel, null);
+         rightPanel.add(docsText, null);
+         rightPanel.add(docsReadLabel, null);
+         rightPanel.add(docsReadText, null);
+         rightPanel.add(bpsLabel, null);
+         rightPanel.add(bpsText, null);
+         rightPanel.add(totalMemLabel, null);
+         rightPanel.add(totalMemText, null);
+         rightPanel.add(freeMemLabel, null);
+         rightPanel.add(freeMemText, null);
+         this.getContentPane().add(propertyPanel, BorderLayout.NORTH);
+         propertyPanel.add(urlRestrictionFrame, null);
+         propertyPanel.add(restrictTo, null);
+         propertyPanel.add(hostLabel, null);
+         propertyPanel.add(startButton, null);
+         propertyPanel.add(startURL, null);
+     }
+
+     /**
+      * Shows a counter in a progress bar whose scale adapts to the value:
+      * the maximum is doubled until the value fits, and halved once the value
+      * drops below half of the maximum after having been at or above it.
+      * The exact value is always painted as the bar's text.
+      *
+      * @param p     the progress bar to update
+      * @param value the counter value to display
+      */
+     public void setCounterProgressBar(JProgressBar p, int value)
+     {
+         int oldMax = p.getMaximum();
+         int oldValue = p.getValue();
+         int newMax = oldMax;
+
+         if(value > oldMax)
+         {
+             // a single doubling is not enough for large jumps: the bar would
+             // silently clamp the value to its maximum
+             newMax = Math.max(oldMax, 1);
+             while(newMax < value && newMax <= Integer.MAX_VALUE / 2)
+             {
+                 newMax *= 2;
+             }
+             if(newMax < value)
+             {
+                 // doubling would overflow; use the value itself as maximum
+                 newMax = value;
+             }
+         }
+         else if (value < oldMax / 2 && oldValue >= oldMax / 2)
+         {
+             newMax = oldMax / 2;
+         }
+         if(newMax != oldMax)
+         {
+             p.setMaximum(newMax);
+         }
+         p.setValue(value);
+         p.setString("" + value);
+     }
+
+     public void setURLsQueued(int queued)
+     {
+         setCounterProgressBar(this.urlQueuedProgress, queued);
+     }
+
+     public void setScopeFiltered(int filtered)
+     {
+         setCounterProgressBar(this.scopeFilteredProgress, filtered);
+     }
+
+     public void setVisitedFiltered(int filtered)
+     {
+         setCounterProgressBar(this.visitedFilteredProgress, filtered);
+     }
+
+     public void setWorkingThreadsCount(int threads)
+     {
+         setCounterProgressBar(this.workingThreadsProgress, threads);
+     }
+
+     public void setIdleThreadsCount(int threads)
+     {
+         setCounterProgressBar(this.idleThreadsProgress, threads);
+     }
+
+     public void setBusyThreadsCount(int threads)
+     {
+         setCounterProgressBar(this.busyThreadsProgress, threads);
+     }
+
+     public void setRequestQueueCount(int requests)
+     {
+         setCounterProgressBar(this.requestQueueProgress, requests);
+     }
+
+     public void setDNSCount(int count)
+     {
+         setCounterProgressBar(this.dnsProgress, count);
+     }
+
+     /**
+      * Shows the number of URLs caught by robots.txt rules in its own bar
+      * (it used to overwrite the "URLs queued" bar).
+      */
+     public void setURLsCaughtCount(int count)
+     {
+         setCounterProgressBar(this.urlsCaughtProgress, count);
+     }
+
+     public void addStartButtonListener(ActionListener a)
+     {
+         startButton.addActionListener(a);
+     }
+
+     public String getRestrictTo()
+     {
+         return restrictTo.getText();
+     }
+
+     public void setRestrictTo(String restrictTo)
+     {
+         this.restrictTo.setText(restrictTo);
+     }
+
+     public String getStartURL()
+     {
+         return startURL.getText();
+     }
+
+     public void setStartURL(String startURL)
+     {
+         this.startURL.setText(startURL);
+     }
+
+     /**
+      * Shows the number of stalled threads. Goes through setCounterProgressBar
+      * so that the painted text follows the value instead of staying at "0".
+      */
+     public void setStalledThreads(int stalled)
+     {
+         setCounterProgressBar(stalledThreadsProgress, stalled);
+     }
+
+     public void setBytesPerSecond(double bps)
+     {
+         bpsText.setText("" + bps);
+     }
+
+     /**
+      * Shows the documents-per-second rate in the "Docs/s" field
+      * (it used to overwrite the "Bytes/s" field).
+      */
+     public void setDocsPerSecond(double docs)
+     {
+         docsText.setText("" + docs);
+     }
+
+     public void setFreeMem(long freeMem)
+     {
+         freeMemText.setText("" + freeMem);
+     }
+
+     public void setTotalMem(long totalMem)
+     {
+         totalMemText.setText("" + totalMem);
+     }
+
+     public void setRobotsTxtCount(int robotsTxtCount)
+     {
+         setCounterProgressBar(robotsTxtsProgress, robotsTxtCount);
+     }
+
+     /**
+      * Shows the number of documents read in the "Docs read" field
+      * (it used to overwrite the "Bytes/s" field).
+      */
+     public void setDocsRead(int docs)
+     {
+         docsReadText.setText("" + docs);
+     }
+
+ }
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java
new file mode 100644
index 00000000000..d06b91642f9
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java
@@ -0,0 +1,184 @@
+package de.lanlab.larm.gui;
+/*
+ A basic extension of the java.awt.Dialog class
+ */
+
+import java.awt.*;
+import java.awt.event.*;
+
+public class QuitDialog extends Dialog
+{
+ public QuitDialog(Frame parent, boolean modal)
+ {
+ super(parent, modal);
+
+ //Keep a local reference to the invoking frame
+ frame = parent;
+
+ // This code is automatically generated by Visual Cafe when you add
+ // components to the visual environment. It instantiates and initializes
+ // the components. To modify the code, only use code syntax that matches
+ // what Visual Cafe can generate, or Visual Cafe may be unable to back
+ // parse your Java file into its visual environment.
+ //{{INIT_CONTROLS
+ setLayout(null);
+ setSize(337,135);
+ setVisible(false);
+ yesButton.setLabel(" Ja ");
+ add(yesButton);
+ yesButton.setFont(new Font("Dialog", Font.BOLD, 12));
+ yesButton.setBounds(72,80,79,22);
+ noButton.setLabel(" Nein ");
+ add(noButton);
+ noButton.setFont(new Font("Dialog", Font.BOLD, 12));
+ noButton.setBounds(185,80,79,22);
+ label1.setText("Möchten Sie LARM beenden?");
+ label1.setAlignment(java.awt.Label.CENTER);
+ add(label1);
+ label1.setBounds(68,33,220,23);
+ setTitle("LARM - Beenden");
+ //}}
+
+ //{{REGISTER_LISTENERS
+ SymWindow aSymWindow = new SymWindow();
+ this.addWindowListener(aSymWindow);
+ SymAction lSymAction = new SymAction();
+ noButton.addActionListener(lSymAction);
+ yesButton.addActionListener(lSymAction);
+ //}}
+ }
+
+ public void addNotify()
+ {
+ // Record the size of the window prior to calling parents addNotify.
+ Dimension d = getSize();
+
+ super.addNotify();
+
+ if (fComponentsAdjusted)
+ return;
+
+ // Adjust components according to the insets
+ setSize(getInsets().left + getInsets().right + d.width, getInsets().top + getInsets().bottom + d.height);
+ Component components[] = getComponents();
+ for (int i = 0; i < components.length; i++)
+ {
+ Point p = components[i].getLocation();
+ p.translate(getInsets().left, getInsets().top);
+ components[i].setLocation(p);
+ }
+ fComponentsAdjusted = true;
+ }
+
+ public QuitDialog(Frame parent, String title, boolean modal)
+ {
+ this(parent, modal);
+ setTitle(title);
+ }
+
+ /**
+ * Shows or hides the component depending on the boolean flag b.
+ * @param b if true, show the component; otherwise, hide the component.
+ * @see java.awt.Component#isVisible
+ */
+ public void setVisible(boolean b)
+ {
+ if(b)
+ {
+ Rectangle bounds = getParent().getBounds();
+ Rectangle abounds = getBounds();
+
+ setLocation(bounds.x + (bounds.width - abounds.width)/ 2,
+ bounds.y + (bounds.height - abounds.height)/2);
+ Toolkit.getDefaultToolkit().beep();
+ }
+ super.setVisible(b);
+ }
+
+ // Used for addNotify check.
+ boolean fComponentsAdjusted = false;
+ // Invoking frame
+ Frame frame = null;
+
+ //{{DECLARE_CONTROLS
+ java.awt.Button yesButton = new java.awt.Button();
+ java.awt.Button noButton = new java.awt.Button();
+ java.awt.Label label1 = new java.awt.Label();
+ //}}
+
+ class SymAction implements java.awt.event.ActionListener
+ {
+ public void actionPerformed(java.awt.event.ActionEvent event)
+ {
+ Object object = event.getSource();
+ if (object == yesButton)
+ yesButton_ActionPerformed(event);
+ else if (object == noButton)
+ noButton_ActionPerformed(event);
+ }
+ }
+
+ void yesButton_ActionPerformed(java.awt.event.ActionEvent event)
+ {
+ // to do: code goes here.
+
+ yesButton_ActionPerformed_Interaction1(event);
+ }
+
+
+ void yesButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
+ {
+ try {
+ frame.setVisible(false); // Hide the invoking frame
+ frame.dispose(); // Free system resources
+ this.dispose(); // Free system resources
+ System.exit(0); // close the application
+ } catch (Exception e) {
+ }
+ }
+
+
+ void noButton_ActionPerformed(java.awt.event.ActionEvent event)
+ {
+ // to do: code goes here.
+
+ noButton_ActionPerformed_Interaction1(event);
+ }
+
+
+ void noButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
+ {
+ try {
+ this.dispose();
+ } catch (Exception e) {
+ }
+ }
+
+
+ class SymWindow extends java.awt.event.WindowAdapter
+ {
+ public void windowClosing(java.awt.event.WindowEvent event)
+ {
+ Object object = event.getSource();
+ if (object == QuitDialog.this)
+ QuitDialog_WindowClosing(event);
+ }
+ }
+
+ void QuitDialog_WindowClosing(java.awt.event.WindowEvent event)
+ {
+ // to do: code goes here.
+
+ QuitDialog_WindowClosing_Interaction1(event);
+ }
+
+
+ void QuitDialog_WindowClosing_Interaction1(java.awt.event.WindowEvent event)
+ {
+ try {
+ this.dispose();
+ } catch (Exception e) {
+ }
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java
new file mode 100644
index 00000000000..b2dd21fc353
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java
@@ -0,0 +1,136 @@
+package de.lanlab.larm.net;
+
+// whatever package you want
+import sun.net.www.http.HttpClient;
+import sun.net.www.MessageHeader;
+import sun.net.ProgressEntry;
+
+import java.net.*;
+import java.io.*;
+
+
+/**
+ * Description of the Class
+ *
+ *@author cmarschn
+ *@created 2. Mai 2001
+ */
+ /**
+  * An HttpClient whose server socket gets a read timeout (SO_TIMEOUT).
+  * The timeout is remembered and applied again right before the response
+  * headers are parsed.
+  *
+  *@author cmarschn
+  *@created 2. Mai 2001
+  */
+ public class HttpClientTimeout extends HttpClient {
+     /** the configured socket timeout; -1 means that none has been set */
+     private int timeout = -1;
+
+
+     /**
+      * Constructor for the HttpClientTimeout object
+      *
+      *@param url the URL to connect to
+      *@param proxy proxy host, passed on to the superclass
+      *@param proxyPort proxy port, passed on to the superclass
+      *@exception IOException raised by the superclass constructor
+      */
+     public HttpClientTimeout(URL url, String proxy, int proxyPort) throws IOException {
+         super(url, proxy, proxyPort);
+     }
+
+
+     /**
+      * Constructor for the HttpClientTimeout object; no proxy is used
+      *
+      *@param url the URL to connect to
+      *@exception IOException raised by the superclass constructor
+      */
+     public HttpClientTimeout(URL url) throws IOException {
+         super(url, null, -1);
+     }
+
+
+     /**
+      * Sets the Timeout attribute of the HttpClientTimeout object. The value is
+      * remembered (it used to be discarded) and, if a socket is already open,
+      * applied to it immediately.
+      *
+      *@param i The new Timeout value, passed to Socket.setSoTimeout
+      *@exception SocketException if the timeout cannot be set on the socket
+      */
+     public void setTimeout(int i) throws SocketException {
+         this.timeout = i;
+         if (serverSocket != null) {
+             serverSocket.setSoTimeout(i);
+         }
+     }
+
+
+     /**
+      * Gets the Socket attribute of the HttpClientTimeout object
+      *
+      *@return the server socket of this client
+      */
+     public Socket getSocket() {
+         return serverSocket;
+     }
+
+
+     /**
+      * Applies the configured timeout (if any) to the socket and then lets the
+      * superclass parse the response headers.
+      *
+      *@param header passed on to the superclass
+      *@param entry passed on to the superclass
+      *@return the result of the superclass implementation
+      *@exception java.io.IOException if the timeout cannot be set or parsing fails
+      */
+     public boolean parseHTTP(MessageHeader header, ProgressEntry entry) throws java.io.IOException {
+         if (this.timeout != -1) {
+             try {
+                 serverSocket.setSoTimeout(this.timeout);
+             }
+             catch (SocketException e) {
+                 throw new java.io.IOException("unable to set socket timeout!");
+             }
+         }
+         return super.parseHTTP(header, entry);
+     }
+
+
+     /**
+      * Closes the server socket, if there is one.
+      *
+      *@exception IOException if closing the socket fails
+      */
+     public void close() throws IOException {
+         if (serverSocket != null) {
+             serverSocket.close();
+         }
+     }
+
+
+     /*
+      * public void SetTimeout(int i) throws SocketException {
+      * serverSocket.setSoTimeout(i);
+      * }
+      */
+     /*
+      * This class has no public constructor for HTTP. This method is used to
+      * get an HttpClient to the specifed URL. If there's currently an
+      * active HttpClient to that server/port, you'll get that one.
+      *
+      * no longer syncrhonized -- it slows things down too much
+      * synchronize at a higher level
+      */
+     /**
+      * Returns a client for the given URL: a cached keep-alive client if one
+      * is available, otherwise a newly created one.
+      * NOTE(review): the cast assumes that the keep-alive cache only ever
+      * holds HttpClientTimeout instances - confirm.
+      *
+      *@param url the URL to connect to
+      *@return the cached or newly created client
+      *@exception IOException if a new client cannot be created
+      */
+     public static HttpClientTimeout getNew(URL url) throws IOException {
+         /*
+          * see if one's already around
+          */
+         HttpClientTimeout ret = (HttpClientTimeout) kac.get(url);
+         if (ret == null) {
+             ret = new HttpClientTimeout(url);
+             // CTOR called openServer()
+         }
+         else {
+             ret.url = url;
+         }
+         // don't know if we're keeping alive until we parse the headers
+         // for now, keepingAlive is false
+         return ret;
+     }
+ }
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java
new file mode 100644
index 00000000000..aff661cb6c1
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java
@@ -0,0 +1,50 @@
+package de.lanlab.larm.net;
+
+import java.net.*;
+
+/**
+ * Factory creating {@link HttpTimeoutHandler}s that all share one timeout value.
+ *
+ *@author cmarschn
+ *@created 2. Mai 2001
+ */
+public class HttpTimeoutFactory implements URLStreamHandlerFactory {
+    /** timeout value given to every handler this factory creates */
+    int fiTimeoutVal;
+
+    /** shared instance, created by the first getInstance() call */
+    static HttpTimeoutFactory instance = null;
+
+    /**
+     * Constructor for the HttpTimeoutFactory object
+     *
+     *@param iT timeout value given to the handlers created later
+     */
+    public HttpTimeoutFactory(int iT) {
+        fiTimeoutVal = iT;
+    }
+
+    /**
+     * Creates a new handler carrying this factory's timeout value. The protocol
+     * name is not looked at: the same kind of handler is returned for any protocol.
+     *
+     *@param str protocol name (ignored)
+     *@return a new HttpTimeoutHandler
+     */
+    public URLStreamHandler createURLStreamHandler(String str) {
+        return new HttpTimeoutHandler(fiTimeoutVal);
+    }
+
+    /**
+     * gets an instance. only the first call will create it. In subsequent calls the iT
+     * parameter doesn't have a meaning. Synchronized because the unsynchronized
+     * check-then-create could build several instances when threads race here.
+     */
+    public static synchronized HttpTimeoutFactory getInstance(int iT) {
+        if (instance == null) {
+            instance = new HttpTimeoutFactory(iT);
+        }
+        return instance;
+    }
+}
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java
new file mode 100644
index 00000000000..b551e4fa6c2
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java
@@ -0,0 +1,80 @@
+package de.lanlab.larm.net;
+
+import java.net.*;
+import java.io.IOException;
+
+/**
+ * HTTP protocol handler whose connections are HttpURLConnectionTimeout objects
+ * created with a fixed timeout value.
+ *
+ *@author cmarschn
+ *@created 2. Mai 2001
+ */
+public class HttpTimeoutHandler extends sun.net.www.protocol.http.Handler {
+    /** timeout value handed to every connection this handler opens */
+    int timeoutVal;
+    /** connection made by the latest openConnection() call; null before the first call */
+    HttpURLConnectionTimeout fHUCT;
+
+    /**
+     * Constructor for the HttpTimeoutHandler object
+     *
+     *@param iT timeout value for the connections this handler opens
+     */
+    public HttpTimeoutHandler(int iT) {
+        timeoutVal = iT;
+    }
+
+    /**
+     * Gets the socket of the most recently opened connection.
+     *
+     *@return that connection's socket, or null if no connection was opened yet
+     */
+    public Socket getSocket() {
+        // fHUCT is only assigned in openConnection(); guard against earlier calls
+        return (fHUCT == null) ? null : fHUCT.getSocket();
+    }
+
+    /**
+     * Closes the most recently opened connection; does nothing if there is none.
+     *
+     *@exception Exception whatever closing the connection throws
+     */
+    public void close() throws Exception {
+        if (fHUCT != null) {
+            fHUCT.close();
+        }
+    }
+
+    /**
+     * Opens a connection to the URL with this handler's timeout value and
+     * remembers it for getSocket() and close().
+     *
+     *@param u the URL to connect to
+     *@return the new connection
+     *@exception IOException if the connection object cannot be created
+     */
+    protected java.net.URLConnection openConnection(URL u) throws IOException {
+        fHUCT = new HttpURLConnectionTimeout(u, this, timeoutVal);
+        return fHUCT;
+    }
+
+    /**
+     * Gets the inherited proxy field (breaking encapsulation).
+     *
+     *@return The Proxy value
+     */
+    String getProxy() {
+        return proxy;
+    }
+
+    /**
+     * Gets the inherited proxyPort field (breaking encapsulation).
+     *
+     *@return The ProxyPort value
+     */
+    int getProxyPort() {
+        return proxyPort;
+    }
+}
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java
new file mode 100644
index 00000000000..16b07ace098
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java
@@ -0,0 +1,226 @@
+package de.lanlab.larm.net;
+
+import java.net.*;
+import java.io.*;
+import sun.net.www.http.HttpClient;
+
+/**
+ * HTTP connection that applies a timeout value to the HttpClientTimeout it
+ * uses, plus a helper that follows redirects only within the same host.
+ *
+ *@author cmarschn
+ *@created 2. Mai 2001
+ */
+public class HttpURLConnectionTimeout extends sun.net.www.protocol.http.HttpURLConnection {
+    /** timeout value passed to the client's setTimeout() */
+    int fiTimeoutVal;
+    /** handler that opened this connection; stays null with the host/port constructor */
+    HttpTimeoutHandler fHandler;
+    /** client set up by connect(); null until then */
+    HttpClientTimeout fClient;
+
+    /**
+     * Constructor for the HttpURLConnectionTimeout object
+     *
+     *@param u the URL to connect to
+     *@param handler the handler opening this connection; connect() reads its proxy settings
+     *@param iTimeout timeout value applied to the client in connect()
+     *@exception IOException if the superclass constructor fails
+     */
+    public HttpURLConnectionTimeout(URL u, HttpTimeoutHandler handler, int iTimeout) throws IOException {
+        super(u, handler);
+        fHandler = handler;
+        fiTimeoutVal = iTimeout;
+    }
+
+    /**
+     * Constructor for the HttpURLConnectionTimeout object; fHandler and
+     * fiTimeoutVal are left at their defaults.
+     *
+     *@param u the URL to connect to
+     *@param host passed on to the superclass constructor
+     *@param port passed on to the superclass constructor
+     *@exception IOException if the superclass constructor fails
+     */
+    public HttpURLConnectionTimeout(URL u, String host, int port) throws IOException {
+        super(u, host, port);
+    }
+
+    /**
+     * Connects unless already connected: obtains the client (via
+     * HttpClientTimeout.getNew() for "http" URLs, otherwise a new one built with
+     * the handler's proxy settings), applies the timeout value and fetches the
+     * client's output stream.
+     *
+     *@exception IOException if obtaining or configuring the client fails
+     */
+    public void connect() throws IOException {
+        if (connected) {
+            return;
+        }
+        if ("http".equals(url.getProtocol())
+        /*
+         * && !failedOnce <- PRIVATE
+         */
+                ) {
+            // for safety's sake, as reported by KLGroup
+            synchronized (url) {
+                http = HttpClientTimeout.getNew(url);
+            }
+        }
+        else {
+            // make sure to construct new connection if first
+            // attempt failed
+            http = new HttpClientTimeout(url, fHandler.getProxy(), fHandler.getProxyPort());
+        }
+        // Remember the client and apply the timeout on both paths: the else
+        // branch used to leave fClient null and the timeout unset.
+        fClient = (HttpClientTimeout) http;
+        fClient.setTimeout(fiTimeoutVal);
+        ps = (PrintStream) http.getOutputStream();
+        // this was missing from the original version
+        connected = true;
+    }
+
+    /**
+     * Create a new HttpClient object, bypassing the cache of HTTP client
+     * objects/connections.
+     *
+     *@param url the URL being accessed
+     *@return the new client; the timeout is applied to it when possible
+     *@exception IOException if the client cannot be constructed
+     */
+    protected HttpClient getNewClient(URL url)
+             throws IOException {
+        HttpClientTimeout client = new HttpClientTimeout(url, (String) null, -1);
+        try {
+            client.setTimeout(fiTimeoutVal);
+        }
+        catch (Exception e) {
+            // deliberately non-fatal: the client is returned without the timeout
+            System.out.println("Unable to set timeout value");
+        }
+        return (HttpClient) client;
+    }
+
+    /**
+     * Gets the socket of the client set up by connect().
+     *
+     *@return the client's socket, or null if there is no client yet
+     */
+    Socket getSocket() {
+        return (fClient == null) ? null : fClient.getSocket();
+    }
+
+    /**
+     * Closes the client set up by connect(); does nothing if there is none.
+     *
+     *@exception Exception whatever closing the client throws
+     */
+    void close() throws Exception {
+        if (fClient != null) {
+            fClient.close();
+        }
+    }
+
+    /**
+     * opens a stream allowing redirects only to the same host.
+     * A redirect is followed only if protocol, port and host stay the same, and
+     * at most five redirects are followed.
+     *
+     *@param c the connection to open
+     *@return the input stream of the final (non-redirect) response
+     *@exception IOException if opening a stream fails
+     *@exception SecurityException on a redirect that is not allowed or has no Location
+     */
+    public static InputStream openConnectionCheckRedirects(URLConnection c)
+             throws IOException {
+        boolean redir;
+        int redirects = 0;
+        InputStream in = null;
+
+        do {
+            if (c instanceof HttpURLConnectionTimeout) {
+                ((HttpURLConnectionTimeout) c).setInstanceFollowRedirects(false);
+            }
+
+            // We want to open the input stream before
+            // getting headers, because getHeaderField()
+            // et al swallow IOExceptions.
+            in = c.getInputStream();
+            redir = false;
+
+            if (c instanceof HttpURLConnectionTimeout) {
+                HttpURLConnectionTimeout http = (HttpURLConnectionTimeout) c;
+                int stat = http.getResponseCode();
+                if (stat >= 300 && stat <= 305 &&
+                        stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
+                    URL base = http.getURL();
+                    String loc = http.getHeaderField("Location");
+                    URL target = null;
+                    if (loc != null) {
+                        target = new URL(base, loc);
+                    }
+                    http.disconnect();
+                    if (target == null
+                             || !base.getProtocol().equals(target.getProtocol())
+                             || base.getPort() != target.getPort()
+                             || !HostsEquals(base, target)
+                             || redirects >= 5) {
+                        throw new SecurityException("illegal URL redirect");
+                    }
+                    redir = true;
+                    c = target.openConnection();
+                    redirects++;
+                }
+            }
+        } while (redir);
+        return in;
+    }
+
+    // Same as java.net.URL.hostsEqual
+    /**
+     * Tells whether two URLs name the same host: host names equal ignoring
+     * case, or both names resolving to equal addresses.
+     *
+     *@param u1 first URL
+     *@param u2 second URL
+     *@return true if the hosts are considered equal; false also when a lookup fails
+     */
+    static boolean HostsEquals(URL u1, URL u2) {
+        final String h1 = u1.getHost();
+        final String h2 = u2.getHost();
+
+        if (h1 == null) {
+            return h2 == null;
+        }
+        else if (h2 == null) {
+            return false;
+        }
+        else if (h1.equalsIgnoreCase(h2)) {
+            return true;
+        }
+        // Have to resolve addresses before comparing, otherwise
+        // names like tachyon and tachyon.eng would compare different
+        final boolean result[] = {false};
+
+        java.security.AccessController.doPrivileged(
+            new java.security.PrivilegedAction() {
+                public Object run() {
+                    try {
+                        InetAddress a1 = InetAddress.getByName(h1);
+                        InetAddress a2 = InetAddress.getByName(h2);
+                        result[0] = a1.equals(a2);
+                    }
+                    catch (UnknownHostException e) {
+                        // unresolvable host: result stays false
+                    }
+                    catch (SecurityException e) {
+                        // lookup not permitted: result stays false
+                    }
+                    return null;
+                }
+            });
+        return result[0];
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java
new file mode 100644
index 00000000000..5f96063da54
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java
@@ -0,0 +1,17 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.parser;
+
+public interface LinkHandler // callbacks the Tokenizer invokes while it scans a page
+{
+ public void handleLink(String value, boolean isFrame); // a link was found; isFrame is true for FRAME links
+ public void handleBase(String value); // link value found on a BASE tag
+ public void handleTitle(String value); // text collected inside a TITLE element
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java
new file mode 100644
index 00000000000..9ccda662ed6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java
@@ -0,0 +1,1340 @@
+/*
+ * $Id$
+ *
+ * Copyright 2000 LANLab
+ *
+ */
+package de.lanlab.larm.parser;
+
+import hplb.org.xml.sax.*;
+import hplb.xml.*;
+import hplb.xml.util.*;
+
+import java.util.Dictionary;
+import java.util.Hashtable;
+import java.io.*;
+import hplb.misc.ByteArray;
+import java.net.URL;
+
+/**
+ * This parser is based on HEX, the HTML enabled XML parser, written by
+ * Anders Kristensen, HP Labs Bristol.
+ * It was stripped down and specialized to handle links in HTML pages. I removed
+ * some bugs. And it's FAST, about 10 x faster than the original HEX parser.
+ * Being some sort of SAX parser it calls the callback functions of the LinkHandler
+ * when links are found.
+ * @todo add handling of anchor texts
+ *
+ * @author Clemens Marschner
+ */
+public class Tokenizer implements hplb.org.xml.sax.Parser
+{
+ /**
+ * Does nothing: the body is empty, so the given handler is never stored or called.
+ *
+ * @param e the entity handler (ignored)
+ */
+ public void setEntityHandler(hplb.org.xml.sax.EntityHandler e) { }
+
+
+ /**
+ * Does nothing: the body is empty, so the given handler is never stored or called.
+ *
+ * @param e the error handler (ignored)
+ */
+ public void setErrorHandler(hplb.org.xml.sax.ErrorHandler e) { }
+
+
+ /**
+ * Does nothing: the body is empty, so the given handler is never stored or called.
+ *
+ * @param e the document handler (ignored)
+ */
+ public void setDocumentHandler(hplb.org.xml.sax.DocumentHandler e) { }
+
+
+ /**
+ * The value of boolean attributes is this string.
+ */
+ public final static String BOOLATTR = Atom.getAtom("BOOLATTR");
+
+ // FSM states (values of the state field; tokenize() switches on them):
+ final static int ST_START = 1;
+ final static int ST_TAG_LT = 3;
+ final static int ST_TAG_NAME = 4;
+ final static int ST_TAG_WS = 5;
+ final static int ST_EMPTY_TAG_SLASH = 6;
+ final static int ST_NAME = 7;
+ final static int ST_NAME_WS = 8;
+ final static int ST_EQ = 9;
+ final static int ST_VALUE = 10;
+ final static int ST_VALUE_QUOTED = 11;
+ final static int ST_PCDATA = 21;
+ final static int ST_COMMENT = 22;
+
+ LinkHandler linkHandler; // receives the handleLink/handleBase/handleTitle callbacks
+
+ String sysID = "what's this?"; // overwritten by parse(String, String)
+
+ /**
+ * Element names registered through ignoreCase(), keyed by their lower-cased form; null until first use.
+ */
+ protected Hashtable noCaseElms;
+ /**
+ * If false, white space met in the start state is skipped instead of starting PCDATA.
+ */
+ public boolean rcgnzWS = true;
+ // is white space chars recognized as PCDATA
+ // even when preceding tags?
+ /**
+ * If true, gotAttr() entity-decodes the value of a link attribute before storing it.
+ */
+ public boolean rcgnzEntities = true;
+ /**
+ * If false, "<![" is not parsed as a CDATA section; tokenize() continues in PCDATA state instead.
+ */
+ public boolean rcgnzCDATA = true;
+ /**
+ * If false, "<!-" does not start a comment; tokenize() continues in PCDATA state instead.
+ */
+ public boolean rcgnzComments = true;
+ //
+ /**
+ * If true, keysToLowerCase() replaces attribute keys by their Atom.getAtom() value.
+ */
+ public boolean atomize = false;
+ // make element and attr names atoms
+
+ private final static int ATTR_HREF = 1;
+ private final static int ATTR_SRC = 2;
+
+ private final static int LINKTYPE_NONE = 0;
+ private final static int LINKTYPE_LINK = 1;
+ private final static int LINKTYPE_BASE = 2;
+ private final static int LINKTYPE_FRAME = 3;
+
+
+ private byte linkTagType; // one of the LINKTYPE_ constants, set by gotTagName()
+ private boolean linkAttrFound; // true once gotAttr() has stored a link value for the current tag
+ private int linkAttrType; // ATTR_HREF or ATTR_SRC for link tags, 0 otherwise
+ private String linkValue; // value of the link attribute of the current tag
+ private boolean keepPCData; // while true, tokenize() copies character data into pcData
+ private boolean isInTitleTag; // set on a TITLE start tag, cleared by gotPCDATA()
+ private boolean isInAnchorTag; // NOTE(review): no use visible in this part of the file
+
+ CharBuffer buf = new CharBuffer(); // every character returned by read() is appended here
+ boolean isStartTag = true;
+ /**
+ * Signals whether a non-empty element has any children. If not we must
+ * generate an artificial empty-string child [characters(buf, 0, 0)].
+ */
+ boolean noChildren;
+ CharBuffer tagname = new CharBuffer();
+ CharBuffer attrName = new CharBuffer();
+ CharBuffer attrValue = new CharBuffer(1000);
+ CharBuffer pcData = new CharBuffer(8000);
+
+ Reader in; // input being tokenized, set by parse(Reader)
+
+ /**
+ * Entity manager; gotAttr() uses its entityDecode() on attribute values.
+ */
+ public final EntityManager entMngr = new EntityManager(this);
+ /**
+ * Current state of the tokenizer state machine (one of the ST_ constants).
+ */
+ protected int state = ST_START;
+ /**
+ * Quote character that opened the attribute value currently being read.
+ */
+ protected int qchar;
+
+
+ // <'> or <"> when parsing quoted attr values
+
+
+ /**
+ * Creates a tokenizer; a LinkHandler must be set before parse(Reader) is called.
+ */
+ public Tokenizer() { }
+
+
+ /**
+ * Sets the handler that is called back for the links, base and title found while parsing.
+ *
+ * @param handler the handler to call; parse(Reader) refuses to run while it is null
+ */
+ public void setLinkHandler(LinkHandler handler)
+ {
+ linkHandler = handler;
+ }
+
+
+ /**
+ * Description of the Method
+ *
+ * @param publicID Description of the Parameter
+ * @param sysID Description of the Parameter
+ * @exception Exception Description of the Exception
+ */
+ public void parse(String publicID, String sysID)
+ throws Exception
+ {
+ this.sysID = sysID;
+ parse(new URL(sysID).openStream());
+ }
+
+
+ /**
+ * Parses a byte stream by wrapping it in a BufferedReader over an InputStreamReader (no charset is given).
+ *
+ * @param in the stream to read the document from; it is not closed here
+ * @exception Exception propagated from parse(Reader)
+ */
+ public void parse(InputStream in)
+ throws Exception
+ {
+ parse(new BufferedReader(new InputStreamReader(in)));
+ }
+
+
+ /**
+ * Parses the characters of the reader: stores it in this.in, resets the state with toStart(), runs tokenize().
+ *
+ * @param in the reader to tokenize
+ * @exception Exception IllegalStateException if no LinkHandler is set, otherwise whatever tokenize() throws
+ */
+ public void parse(Reader in)
+ throws Exception
+ {
+ if (linkHandler == null)
+ {
+ throw new IllegalStateException("parse called without LinkHandler being set");
+ }
+
+ this.in = in;
+ toStart();
+ tokenize();
+ }
+
+
+    /**
+     * Registers an element name in noCaseElms under its lower-cased form.
+     *
+     * @param elementName the element name to register (must not be null)
+     */
+    public void ignoreCase(String elementName) {
+        if (noCaseElms == null) {
+            noCaseElms = new Hashtable(); // created on first use
+        }
+        // Lower-case with a fixed locale: the default-locale toLowerCase() maps "I" to a
+        // dotless i under e.g. a Turkish locale, unlike the Character.toLowerCase() used for tags.
+        noCaseElms.put(elementName.toLowerCase(java.util.Locale.ENGLISH), elementName);
+    }
+
+
+ /**
+ * Sets the rcgnzWS flag.
+ *
+ * @param b the new value of rcgnzWS
+ */
+ public void rcgnzWS(boolean b)
+ {
+ rcgnzWS = b;
+ }
+
+
+    // invoked after doing any Handler callback - resets state
+    /**
+     * Puts the tokenizer back into its start state: empties all character buffers,
+     * assumes a start tag again and forgets the link data of the previous tag.
+     * keepPCData is deliberately left untouched.
+     */
+    protected void toStart()
+    {
+        pcData.reset();
+        attrValue.reset();
+        attrName.reset();
+        tagname.reset();
+        buf.reset();
+        // link bookkeeping of the tag just finished
+        linkValue = "";
+        linkAttrType = 0;
+        linkAttrFound = false;
+        linkTagType = LINKTYPE_NONE;
+        // state machine: back to the beginning, a start tag until proven wrong
+        isStartTag = true;
+        state = ST_START;
+    }
+
+
+ /**
+ * Reads characters until read() returns -1 and feeds each one to the state machine selected by the state field.
+ *
+ * @exception Exception propagated from reading the input or from the got*() callbacks
+ */
+ public void tokenize()
+ throws Exception
+ {
+ int c;
+
+
+ while ((c = read()) != -1)
+ {
+ switch (state)
+ {
+ case ST_START:
+ switch (c)
+ {
+ case '<':
+ state = ST_TAG_LT;
+ linkTagType = LINKTYPE_NONE;
+ linkAttrFound = false;
+ linkAttrType = 0;
+ linkValue = "";
+
+ isStartTag = true;
+ keepPCData= false;
+
+ // until proven wrong
+ tagname.reset();
+ break;
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ if (!rcgnzWS)
+ {
+ break;
+ }
+ // else fall through
+ default:
+ state = ST_PCDATA;
+ if(keepPCData)
+ {
+ pcData.write(c);
+ }
+
+ }
+ break;
+ case ST_PCDATA:
+ if (c == '<')
+ {
+ if(keepPCData)
+ {
+ gotPCDATA(true);
+ keepPCData = false;
+ }
+ linkTagType = LINKTYPE_NONE;
+ linkAttrFound = false;
+ linkAttrType = 0;
+ linkValue = "";
+ state = ST_TAG_LT;
+ }
+ else
+ {
+ if(keepPCData)
+ {
+ pcData.write(c);
+ }
+ }
+ break;
+ case ST_TAG_LT:
+ switch (c)
+ {
+ case '/':
+ isStartTag = false;
+ state = ST_TAG_NAME;
+ break;
+ case '!':
+ c = read();
+ if ((c == '-' && !rcgnzComments) || (c == '[' && !rcgnzCDATA))
+ {
+ state = ST_PCDATA;
+ pcData.reset();
+ pcData.write(c);
+ break;
+ }
+ if (c == '-')
+ {
+ state = ST_COMMENT;
+ }
+ else if (c == '[')
+ {
+ parseCDATA();
+ }
+ else
+ {
+ // FIXME: shouldn't be delivered as PCDATA
+ //warning("Bad markup " + buf);
+ state = ST_PCDATA;
+ pcData.reset();
+ pcData.write(c);
+ }
+ break;
+ case '?':
+ parsePI();
+ break;
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ state = ST_TAG_WS;
+ break;
+ default:
+ tagname.write(Character.toLowerCase((char) c));
+ // ## changed
+ state = ST_TAG_NAME;
+ }
+ break;
+ case ST_TAG_NAME:
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ state = ST_TAG_WS;
+ gotTagName();
+ // ## changed
+ break;
+ case '/':
+ state = ST_EMPTY_TAG_SLASH;
+ gotTagName();
+ // ## changed
+ break;
+ case '>':
+ gotTagName();
+ // ## changed
+ gotTag();
+ break;
+ default:
+ tagname.write(Character.toLowerCase((char) c));
+ // ## changed
+ }
+ break;
+ case ST_TAG_WS:
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ break;
+ case '/':
+ state = ST_EMPTY_TAG_SLASH;
+ break;
+ case '>':
+ gotTag();
+ break;
+ case '?':
+ // NOTE: if !inXMLDecl we fall through to default case
+ default:
+ if (!isStartTag)
+ {
+ // bit of a hack this...
+ //errHandler.warning("Malformed tag: "+buf, sysID, _line, _column);
+ //err_continue("Malformed tag: "+buf);
+ toStart();
+ // ## changed
+ if (c == '<')
+ {
+ gotPCDATA(true);
+ keepPCData = false;
+ state = ST_TAG_LT;
+ }
+ else
+ {
+ // we get here e.g. if there's an end tag with attributes
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ }
+ else
+ {
+ // FIXME: this accepts way too many first chars for attr name
+ attrName.write(Character.toLowerCase((char) c));
+ state = ST_NAME;
+ }
+ }
+ break;
+ case ST_EMPTY_TAG_SLASH:
+ if (c == '>')
+ {
+ //tagtype = TAG_EMPTY;
+ gotTag();
+ break;
+ }
+ else
+ {
+ // ERROR !? - can't throw Exception here - we go to next tag...
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ break;
+ case ST_NAME:
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ if (attrName.size() > 0)
+ {
+ state = ST_NAME_WS;
+ }
+ break;
+ case '>':
+ if (attrName.size() > 0)
+ {
+ gotAttr();
+ }
+ gotTag();
+ break;
+ case '=':
+ state = ST_EQ;
+ break;
+ default:
+ if (isCtlOrTspecial(c))
+ {
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ else
+ {
+ attrName.write(Character.toLowerCase((char) c));
+ }
+ }
+ break;
+ case ST_NAME_WS:
+ // white-space between name and '='
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ break;
+ case '=':
+ state = ST_EQ;
+ break;
+ case '>':
+ gotAttr();
+ gotTag();
+ break;
+ default:
+ if (isNameChar(c))
+ {
+ gotAttr();
+ attrName.write(Character.toLowerCase((char) c));
+ state = ST_TAG_WS;
+ }
+ else
+ {
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ }
+ break;
+ case ST_EQ:
+ // white-space between '=' and value
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ break;
+ case '"':
+ qchar = '"';
+ state = ST_VALUE_QUOTED;
+ break;
+ case '\'':
+ qchar = '\'';
+ state = ST_VALUE_QUOTED;
+ break;
+ default:
+ if (isCtlOrTspecial(c))
+ {
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ else
+ {
+ attrValue.write(c);
+ state = ST_VALUE;
+ }
+ }
+ break;
+ case ST_VALUE:
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ gotAttr();
+ state = ST_TAG_WS;
+ break;
+ case '>':
+ gotAttr();
+ gotTag();
+ break;
+ /*
+ * case '/': // FIXME: HTML knows things like !!
+ * gotAttr();
+ * state = ST_EMPTY_TAG_SLASH;
+ * break;
+ */
+ default:
+ if (isValueBreaker(c))
+ {
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ else
+ {
+ attrValue.write(c);
+ }
+ }
+ break;
+ case ST_VALUE_QUOTED:
+ if (c == qchar)
+ {
+ gotAttr();
+ state = ST_TAG_WS;
+ }
+ else
+ {
+ attrValue.write(c);
+ }
+ break;
+ case ST_COMMENT:
+ // NOTE(review): text is missing from the source here - the try that the catch below belongs to is gone; restore it from the original file
+ gotComment();
+ //while (read_ex() != '>') ;
+ //state = ST_PCDATA;
+ }
+ catch (EmptyInputStream ex)
+ {
+ gotPCDATA(false);
+ keepPCData = false;
+ break;
+ }
+ }
+ }
+
+ // input stream ended - return rest, if any, as PCDATA
+ if (buf.size() > 0)
+ {
+ gotPCDATA(false);
+ keepPCData = false;
+ buf.reset();
+ }
+ }
+
+
+ // NOTE(review): leftover from line/column counting - read() below counts nothing,
+ // and no code visible in this part of the file assigns cc; confirm before removing
+ int cc;
+
+ // last char read
+
+
+    /**
+     * Reads the next character from the input and appends it to buf unless the input has ended.
+     *
+     * @return the character read, or -1 at the end of the input
+     * @exception IOException if the underlying reader fails
+     */
+    public final int read()
+             throws IOException
+    {
+        final int next = in.read();
+        if (next == -1)
+        {
+            return -1;
+        }
+        buf.write(next);
+        return next;
+    }
+
+
+    /**
+     * Like read(), but signals the end of the input with an exception instead of -1.
+     *
+     * @return the character read (never -1)
+     * @exception IOException if the underlying reader fails
+     * @exception EmptyInputStream if the input has ended
+     */
+    public final int read_ex()
+             throws IOException, EmptyInputStream
+    {
+        final int next = read();
+        if (next != -1)
+        {
+            return next;
+        }
+        throw new EmptyInputStream();
+    }
+
+
+    // HTML allows boolean attributes - attributes without a
+    // value, or rather an implicit value which is the same as the name.
+    /**
+     * Called when an attribute (name in attrName, value in attrValue) is complete.
+     * Captures the value as linkValue if it is the link attribute the current tag
+     * carries its link in (linkAttrType, set by gotTagName()), then resets both buffers.
+     *
+     * @exception Exception propagated from the calls made here (e.g. entity decoding)
+     */
+    protected final void gotAttr()
+             throws Exception
+    {
+        if (!linkAttrFound)
+        {
+            char[] attName = attrName.getCharArray();
+            int attLength = attrName.getLength();
+            int attType = 0;   // 0 = neither href nor src
+
+            // attribute names were lower-cased while being read, so compare directly
+            if (attLength == 4
+                    && attName[0] == 'h' && attName[1] == 'r' && attName[2] == 'e' && attName[3] == 'f')
+            {
+                attType = ATTR_HREF;
+            }
+            else if (attLength == 3
+                    && attName[0] == 's' && attName[1] == 'r' && attName[2] == 'c')
+            {
+                attType = ATTR_SRC;
+            }
+
+            // Only accept the attribute that belongs to this tag type. Accepting any
+            // href/src made e.g. <a src="x" href="y"> report "x", because the first
+            // match sets linkAttrFound and later attributes are skipped.
+            if (attType != 0 && attType == linkAttrType)
+            {
+                linkValue = (rcgnzEntities ? entMngr.entityDecode(attrValue) :
+                        attrValue).toString();
+                linkAttrFound = true;
+            }
+            else
+            {
+                linkValue = "";
+            }
+        }
+        attrName.reset();
+        attrValue.reset();
+    }
+
+
+ /**
+ * Description of the Method
+ */
+ protected void gotTagName()
+ {
+ char[] tag = tagname.getCharArray();
+ int tagLength = tagname.getLength();
+ switch (tagLength)
+ {
+ case 1:
+ // A
+ if (tag[0] == 'a')
+ {
+ linkTagType = LINKTYPE_LINK;
+ linkAttrType = ATTR_HREF;
+
+ }
+ break;
+ // [case 3: // IMG]
+ case 4:
+ // BASE, AREA [, LINK]
+ if(isStartTag)
+ {
+ if (tag[0] == 'b' && tag[1] == 'a' && tag[2] == 's' && tag[3] == 'e')
+ {
+ linkTagType = LINKTYPE_BASE;
+ linkAttrType = ATTR_HREF;
+ }
+ else if (tag[0] == 'a' && tag[1] == 'r' && tag[2] == 'e' && tag[3] == 'a')
+ {
+ linkTagType = LINKTYPE_LINK;
+ linkAttrType = ATTR_HREF;
+ }
+ }
+ break;
+ case 5:
+ // FRAME
+ if(isStartTag)
+ {
+ if (tag[0] == 'f' && tag[1] == 'r' && tag[2] == 'a' && tag[3] == 'm' && tag[4] == 'e')
+ {
+ linkTagType = LINKTYPE_FRAME;
+ linkAttrType = ATTR_SRC;
+ }
+ else if (tag[0] == 't' && tag[1] == 'i' && tag[2] == 't' && tag[3] == 'l' && tag[4] == 'e')
+ {
+ isInTitleTag = true;
+ keepPCData = true;
+ }
+ }
+ default:
+ }
+ }
+
+
+ /**
+ * Description of the Method
+ *
+ * @exception Exception Description of the Exception
+ */
+ protected void gotTag()
+ throws Exception
+ {
+ if (linkAttrFound && isStartTag)
+ {
+ switch (linkTagType)
+ {
+ case LINKTYPE_LINK:
+ //System.out.println("got link " + linkValue);
+ linkHandler.handleLink(linkValue, false);
+ break;
+ case LINKTYPE_FRAME:
+ //System.out.println("got link " + linkValue);
+ linkHandler.handleLink(linkValue, true);
+ break;
+ case LINKTYPE_BASE:
+ linkHandler.handleBase(linkValue);
+ break;
+ }
+ }
+ toStart();
+ }
+
+
+    /**
+     * Lower-cases the first attrs.n keys of the map in place and, if atomize is set,
+     * replaces each by the value Atom.getAtom() returns for it.
+     *
+     * @param attrs the attribute map whose keys are rewritten
+     */
+    public final void keysToLowerCase(SAXAttributeMap attrs)
+    {
+        for (int i = 0; i < attrs.n; i++)
+        {
+            // fixed locale: default-locale lower-casing turns "I" into a dotless i
+            // under e.g. a Turkish locale, which would break ASCII attribute names
+            String key = attrs.keys[i].toLowerCase(java.util.Locale.ENGLISH);
+            attrs.keys[i] = atomize ? Atom.getAtom(key) : key;
+        }
+    }
+
+
+ // toomuch true iff we read a '<' of the next token
+ /**
+ * Called when a run of character data ends: inside TITLE the collected pcData is passed to handleTitle().
+ *
+ * @param toomuch not read by this method
+ * @exception Exception part of the declared signature (callers must handle it)
+ */
+ protected void gotPCDATA(boolean toomuch)
+ throws Exception
+ {
+ if(isInTitleTag)
+ {
+ linkHandler.handleTitle(pcData.toString());
+ isInTitleTag = false;
+ }
+
+ // any other character data is ignored; toStart() also empties pcData
+ toStart();
+ }
+
+
+ /*
+ * noChildren = false;
+ * if (toomuch) {
+ * buf.setLength(buf.size() - 1);
+ * }
+ * CharBuffer buf1 = rcgnzEntities ? entMngr.entityDecode(buf) : buf;
+ * docHandler.characters(buf1.getCharArray(), 0, buf1.size());
+ * /handler.gotText(getBuffer());
+ * toStart();
+ * if (toomuch) {
+ * buf.write('<');
+ * column--;
+ * }
+ * }
+ */
+    // XXX: should pass the comment on as docHandler.ignorable() ??
+    /**
+     * Skips input up to and including the next '>' and then resets the tokenizer.
+     *
+     * @exception IOException if reading the input fails
+     * @exception EmptyInputStream if the input ends before a '>' is seen
+     */
+    protected void gotComment()
+             throws IOException, EmptyInputStream
+    {
+        int skipped;
+        do
+        {
+            skipped = read_ex();
+        } while (skipped != '>');
+        toStart();
+    }
+
+
+ // Processing Instruction
+ /**
+ * Called on a processing instruction start; the PI is not parsed, the tokenizer is only reset via toStart().
+ *
+ * @exception Exception part of the declared signature; the active code only calls toStart()
+ */
+ protected void parsePI()
+ throws Exception
+ {
+ // ignore this
+
+ /*
+ * int i;
+ * String target;
+ * noChildren = false;
+ * inXMLDecl = false;
+ * i = buf.size();
+ * try {
+ * while (!isWS(read_ex())) ;
+ * target = buf.toString();
+ * target = target.substring(i, target.length() - 1);
+ * if ("XML".equals(target)) {
+ * inXMLDecl = true;
+ * state = ST_TAG_WS;
+ * return;
+ * }
+ * while (isWS(read_ex())) ;
+ * i = buf.size() - 1;
+ * while (true) {
+ * while (read_ex() != '?') ;
+ * if (read_ex() == '>') {
+ * String s = buf.toString();
+ * docHandler.processingInstruction(
+ * Atom.getAtom(target), s.substring(i, s.length()-2));
+ * /handler.gotPI(Atom.getAtom(target),
+ * / s.substring(i, s.length()-2));
+ * break;
+ * }
+ * }
+ * } catch (EmptyInputStream ex) {
+ * gotPCDATA(false);
+ * errHandler.warning("EOF while parsing PI", sysID, _line, _column);
+ * /err_continue("EOF while parsing PI");
+ * }
+ */
+ toStart();
+ }
+
+
+ // CDATA section
+ // XXX: should contents be amalgamated with surrounding PCDATA?
+ /**
+ * Handles a CDATA section: warns on bad markup or premature end of input, and always ends with toStart().
+ *
+ * @exception Exception part of the declared signature (callers must handle it)
+ */
+ protected void parseCDATA()
+ throws Exception
+ {
+ // NOTE(review): text is missing from the source here (the try and the if/while headers the braces below close) - restore it from the original file
+ {
+ ;
+ }
+ // docHandler.characters(buf.getCharArray(), i1, buf.size()-3-i1);
+ }
+ else
+ {
+ warning("Bad CDATA markup");
+ state = ST_PCDATA;
+ pcData.reset();
+ }
+ }
+ catch (EmptyInputStream ex)
+ {
+ warning("EOF while parsing CDATA section");
+ //gotPCDATA(false);
+ }
+ toStart();
+ }
+
+
+ /**
+ * Tells whether c is one of the four white space characters blank, tab, carriage return and line feed.
+ *
+ * @param c the character code to test
+ * @return true for ' ', '\t', '\r' and '\n', false for anything else
+ */
+ public boolean isWS(int c)
+ {
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+
+ /**
+ * Tells whether c is a control character (0-31, 127), '>' or a blank; tokenize() abandons an unquoted attribute value on such a character.
+ *
+ * @param c the character code to test
+ * @return true for the codes 0-31 and 127, for '>' and for ' '; false otherwise
+ */
+ public final static boolean isValueBreaker(int c)
+ {
+ switch (c)
+ {
+ // control characters (0-31 and 127):
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ case 20:
+ case 21:
+ case 22:
+ case 23:
+ case 24:
+ case 25:
+ case 26:
+ case 27:
+ case 28:
+ case 29:
+ case 30:
+ case 31:
+ case 127:
+
+ // the only other characters that end a value:
+ case '>':
+ case ' ':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+
+ /**
+ * Returns true if c is either an ASCII control character (0-31, 127), a blank, or one of
+ * the HTTP tspecials listed in the method; '/' is deliberately not counted.
+ *
+ * @param c the character code to test
+ * @return true if c is a control character, a blank or a listed tspecial
+ */
+ // private static final boolean[] isCtlOrTSpecial = new boolean[]
+// {
+// /* 0 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true ,
+// /* 14 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true ,
+// /* 28 */ true , true , true , true , true , false, true , false, false, false, false, false, true , true ,
+// /* 42 */ false, false, true , false, false, true , false, false, false, false, false, false, false, false,
+// /* 56 */ false, false, /*FIX: / no control char: true*/ false, true , true , true , true , true , true , false, false, false, false, false,
+// /* 70 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 84 */ false, false, false, false, false, false, false, true , true , true , false, false, false, false,
+// /* 98 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 112 */ false, false, false, false, false, false, false, false, false, false, false, true , false, true ,
+// /* 126 */ false, true , false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 210 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 238 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 252 */ false, false, false, false
+// };
+
+ public final static boolean isCtlOrTspecial(int c)
+ {
+ switch (c)
+ {
+ // control characters (0-31 and 127):
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ case 20:
+ case 21:
+ case 22:
+ case 23:
+ case 24:
+ case 25:
+ case 26:
+ case 27:
+ case 28:
+ case 29:
+ case 30:
+ case 31:
+ case 127:
+
+ // tspecials:
+ case '(':
+ case ')':
+ case '<':
+ case '>':
+ case '@':
+ case ',':
+ case ';':
+ case ':':
+ case '\\':
+ case '"':
+ /*
+ * case '/': left out on purpose - NOTE(review): presumably so a URL can start an unquoted value; confirm
+ */
+ case '[':
+ case ']':
+ case '?':
+ case '=':
+ case '{':
+ case '}':
+ case ' ':
+ // case '\t': not needed - '\t' is 9, already covered by case 9 above
+ return true;
+ default:
+ return false;
+ }
+ }
+
+
+ /*
+ * public static void main(String[])
+ * {
+ * System.out.println("private static final boolean[] isCtlOrTSpecial = \n{"); // bzw. isNameChar
+ * for(int i=0; i<256; i++)
+ * {
+ * if(i>0)
+ * System.out.print(", ");
+ * if(i % 14 == 0)
+ * {
+ * System.out.print("\n/* " + i + " *" + "/ ");
+ * }
+ * if(Tokenizer.isCtlOrTspecial(i)) // bzw. isNameChar(i)
+ * {
+ * System.out.print("true ");
+ * }
+ * else
+ * {
+ * System.out.print("false");
+ * }
+ * }
+ * System.out.print("};\n\n");
+ * }
+ */
+// public static final boolean isCtlOrTspecial(int c)
+// {
+// return (c < 256 ? isCtlOrTSpecial[c] : false);
+// }
+//
+// private static final boolean[] isNameChar =
+// {
+// /* 0 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 14 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 28 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 42 */ false, false, false, true , true , false, true , true , true , true , true , true , true , true ,
+// /* 56 */ true , true , false, false, false, false, false, false, false, true , true , true , true , true ,
+// /* 70 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true ,
+// /* 84 */ true , true , true , true , true , true , true , false, false, false, false, true , false, true ,
+// /* 98 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true ,
+// /* 112 */ true , true , true , true , true , true , true , true , true , true , true , false, false, false,
+// /* 126 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 210 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 238 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+// /* 252 */ false, false, false, false
+// };
+// public static final boolean isNameChar(int c)
+// {
+// return (c < 256 ? isNameChar[c] : false);
+// }
+//
+ /*
+  * I don't think this is a very standard definition of what can
+  * go into tag and attribute names.
+  */
+    /**
+     * Tells whether a character may be part of a tag or attribute name as
+     * this tokenizer defines it: ASCII letters, ASCII digits, '.', '-' and '_'.
+     *
+     * @param c  the character code to test
+     * @return   true if c is one of the accepted name characters
+     */
+    public final static boolean isNameChar(int c)
+    {
+        if (c == '.' || c == '-' || c == '_')
+        {
+            return true;
+        }
+        final boolean lowerCase = c >= 'a' && c <= 'z';
+        final boolean upperCase = c >= 'A' && c <= 'Z';
+        final boolean digit = c >= '0' && c <= '9';
+        return lowerCase || upperCase || digit;
+    }
+
+
+
+ /**
+  * Hook for reporting a non-fatal problem. The body is currently empty:
+  * the call that would forward the message to an error handler is
+  * commented out, so the text is discarded.
+  *
+  * @param s the warning text
+  * @exception Exception declared in the signature; the current body never throws
+  */
+ protected final void warning(String s)
+ throws Exception
+ {
+ //errHandler.warning(s, sysID, _line, _column);
+ }
+
+
+ /**
+  * Hook for reporting a fatal problem. The body is currently empty:
+  * the call that would forward the message to an error handler is
+  * commented out, so the text is discarded and processing continues.
+  *
+  * @param s the error text
+  * @exception Exception declared in the signature; the current body never throws
+  */
+ protected final void fatal(String s)
+ throws Exception
+ {
+ //errHandler.fatal(s, sysID, _line, _column);
+ }
+
+
+
+ /**
+ * The main program for the Tokenizer class
+ *
+ * @param argv The command line arguments
+ */
+ public static void main(String[] argv)
+ {
+ Tokenizer tok = new Tokenizer();
+ tok.setLinkHandler(
+ new LinkHandler()
+ {
+ int nr = 0;
+
+
+ public void handleLink(String link, boolean isFrame)
+ {
+ System.out.println("found link " + (++nr) + ": " + link);
+ }
+ public void handleTitle(String title)
+ {
+ System.out.println("found title " + (++nr) + ": " + title);
+ }
+
+
+ public void handleBase(String link)
+ {
+ System.out.println("found base " + (++nr) + ": " + link);
+ }
+ });
+ try
+ {
+ tok.parse(new FileReader("C:\\witest.htm"));
+ /*
+ * " " +
+ * "This is some Text\n" +
+ * "and this is... the link" +
+ * "
+ *
+ * Description:
+ *
+ * Copyright: Copyright (c)
+ *
+ * Company:
+ *
+ *
+ *
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.storage;
+import de.lanlab.larm.util.*;
+
+/**
+ * Sink for the documents delivered by a fetcher task.
+ *
+ * @author Clemens Marschner
+ */
+public interface DocumentStorage
+{
+    /**
+     * Called once when the storage is supposed to be initialized.
+     */
+    void open();
+
+    /**
+     * Called to store one web document.
+     *
+     * @param doc  the document to store
+     */
+    void store(WebDocument doc);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java
new file mode 100644
index 00000000000..2b6507195c3
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java
@@ -0,0 +1,165 @@
+package de.lanlab.larm.storage;
+
+import de.lanlab.larm.util.WebDocument;
+import de.lanlab.larm.util.SimpleLogger;
+import java.io.*;
+
+
+/**
+ * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
+ * Company:
+ *
+ * @author
+ * @created 11. Januar 2002
+ * @version 1.0
+ */
+
+
+
+/**
+ * Saves documents into consecutively numbered page files and records, for
+ * every document, a line in a {@link SimpleLogger}. When contents are
+ * stored, that line consists of the document info followed by the page
+ * file number and the offset of the document within that page file.
+ * A new page file is started once more than {@link #MAXLENGTH} bytes have
+ * been written to the current one, so a page file can exceed that limit by
+ * the length of the document that crossed it.
+ */
+
+public class LogStorage implements DocumentStorage
+{
+
+    /** receives one index line per stored document, plus error messages */
+    SimpleLogger log;
+
+    File pageFile;
+    /** stream of the page file that is currently written to */
+    FileOutputStream out;
+    /** number of the current page file; the first file has number 1 */
+    int pageFileCount;
+    String filePrefix;
+    /** number of bytes written to the current page file so far */
+    int offset;
+    /** false once a page file could not be opened; storing contents then stops */
+    boolean isValid = false;
+    /**
+     * Number of bytes after which the next write starts a new page file.
+     */
+    public final static int MAXLENGTH = 50000000;
+    boolean logContents = false;
+    /** name of the current page file, used in error messages */
+    String fileName;
+
+
+    /**
+     * Constructor for the LogStorage object. Opens the first page file right
+     * away if contents are to be stored.
+     *
+     * @param log          the logger where index information is saved to
+     * @param logContents  whether all docs are to be stored in page files or not
+     * @param filePrefix   the file name to which "_", the page file number
+     *                     and ".pfl" are appended
+     */
+    public LogStorage(SimpleLogger log, boolean logContents, String filePrefix)
+    {
+        this.log = log;
+        pageFileCount = 0;
+        this.filePrefix = filePrefix;
+        this.logContents = logContents;
+        if (logContents)
+        {
+            openPageFile();
+        }
+    }
+
+
+    /**
+     * Does nothing; the first page file is opened by the constructor.
+     */
+    public void open() { }
+
+
+    /**
+     * Starts the next page file and resets the offset to 0. On failure the
+     * error is logged and the storage is marked invalid.
+     */
+    public synchronized void openPageFile()
+    {
+        int id = ++pageFileCount;
+        fileName = filePrefix + "_" + id + ".pfl";
+        try
+        {
+            this.offset = 0;
+            out = new FileOutputStream(fileName);
+            isValid = true;
+        }
+        catch (IOException io)
+        {
+            log.logThreadSafe("**ERROR: IOException while opening pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
+            isValid = false;
+        }
+    }
+
+
+    /**
+     * Returns the stream of the current page file, rolling over to a new page
+     * file first if more than {@link #MAXLENGTH} bytes were written to the
+     * current one. Synchronized because a rollover replaces the stream and
+     * resets the counters that writers rely on.
+     *
+     * @return   the stream to write the next document to
+     */
+    public synchronized OutputStream getOutputStream()
+    {
+        if (offset > MAXLENGTH)
+        {
+            try
+            {
+                out.close();
+            }
+            catch (IOException io)
+            {
+                log.logThreadSafe("**ERROR: IOException while closing pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
+            }
+            openPageFile();
+        }
+        return out;
+    }
+
+
+    /**
+     * Appends the bytes to the current page file.
+     *
+     * @param bytes  the document contents
+     * @return       the offset within the page file at which the bytes start,
+     *               or -1 if writing failed (the error is logged)
+     */
+    public synchronized int writeToPageFile(byte[] bytes)
+    {
+        try
+        {
+            OutputStream out = getOutputStream();
+            int oldOffset = this.offset;
+            out.write(bytes);
+            this.offset += bytes.length;
+            return oldOffset;
+        }
+        catch (IOException io)
+        {
+            log.logThreadSafe("**ERROR: IOException while writing " + bytes.length + " bytes to pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
+        }
+        return -1;
+    }
+
+
+    /**
+     * Sets the logger attribute of the LogStorage object
+     *
+     * @param log  The new logger value
+     */
+    public void setLogger(SimpleLogger log)
+    {
+        this.log = log;
+    }
+
+
+    /**
+     * Logs the document info and, if storing is enabled and the page file is
+     * valid, writes the document bytes to the page file and appends page file
+     * number and offset to the logged line.
+     *
+     * @param doc  the document to store
+     */
+    public void store(WebDocument doc)
+    {
+        String docInfo = doc.getInfo();
+        byte[] bytes = doc.getDocumentBytes();
+        if (logContents && bytes != null)
+        {
+            // Write and read the page file number under one lock: without it
+            // another thread could roll over to the next page file between the
+            // write and the read, and the logged number would not belong to
+            // the logged offset.
+            synchronized (this)
+            {
+                if (isValid)
+                {
+                    int offset = writeToPageFile(bytes);
+                    docInfo = docInfo + "\t" + pageFileCount + "\t" + offset;
+                }
+            }
+        }
+        log.logThreadSafe(docInfo);
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java
new file mode 100644
index 00000000000..57037ce3d0f
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java
@@ -0,0 +1,26 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.storage;
+import de.lanlab.larm.util.*;
+
+/**
+ * A storage that discards everything: both operations are empty.
+ */
+public class NullStorage implements DocumentStorage
+{
+    /**
+     * Creates the storage; there is no state to set up.
+     */
+    public NullStorage()
+    {
+    }
+
+    /**
+     * Does nothing.
+     */
+    public void open()
+    {
+    }
+
+    /**
+     * Ignores the document.
+     *
+     * @param doc  the document, which is not used
+     */
+    public void store(WebDocument doc)
+    {
+    }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java
new file mode 100644
index 00000000000..522a8760d24
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java
@@ -0,0 +1,176 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.storage;
+import java.sql.*;
+import de.lanlab.larm.util.*;
+import java.util.*;
+
+/**
+ * saves the document into an sql table. At this time only in MS SQL (and probably Sybase)
+ * a table "Document" with the columns DO_URL(varchar), DO_MimeType(varchar) and
+ * DO_Data2(BLOB) is created after start
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.threads;
+
+/**
+ * Creates the {@link ServerThread} instances; the number passed in is handed
+ * on to the thread's constructor.
+ */
+public class ThreadFactory
+{
+    /**
+     * Creates a new server thread.
+     *
+     * @param count  the number passed to the ServerThread constructor
+     * @return       the newly created thread object
+     */
+    public ServerThread createServerThread(int count)
+    {
+        ServerThread thread = new ServerThread(count);
+        return thread;
+    }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java
new file mode 100644
index 00000000000..84c1ef57fa7
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java
@@ -0,0 +1,380 @@
+
+package de.lanlab.larm.threads;
+
+//import java.util.Vector;
+import java.util.*;
+
+/**
+ * if you have many tasks to accomplish, you can do this with one of the
+ * following strategies:
+ *
+ *
+ * Description:
+ *
+ * Copyright: Copyright (c)
+ *
+ * Company:
+ *
+ *
+ *
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.util;
+import java.io.*;
+import java.util.*;
+
+
+/**
+ * Unchecked wrapper for an exception that occurred while a queue block was
+ * written to or read from disk. The wrapped exception is kept both in
+ * {@link #origException} and as the cause of this exception.
+ */
+class StoreException extends RuntimeException
+{
+    /** the wrapped exception; may be null */
+    Exception origException;
+
+
+    /**
+     * Constructor for the StoreException object
+     *
+     * @param e  the exception that caused the failure
+     */
+    public StoreException(Exception e)
+    {
+        // register the cause with Throwable so that getCause() and the
+        // stream variants of printStackTrace() report it as well
+        super(e);
+        origException = e;
+    }
+
+
+    /**
+     * Gets the message of the wrapped exception
+     *
+     * @return   the wrapped exception's message, or null if there is no
+     *           wrapped exception
+     */
+    public String getMessage()
+    {
+        return origException == null ? null : origException.getMessage();
+    }
+
+
+    /**
+     * Prints a one-line reason followed by the stack trace of the wrapped
+     * exception to System.err. Without a wrapped exception the stack trace of
+     * this exception is printed instead.
+     */
+    public void printStackTrace()
+    {
+        System.err.println("StoreException occured with reason: " + getMessage());
+        if (origException != null)
+        {
+            origException.printStackTrace();
+        }
+        else
+        {
+            super.printStackTrace();
+        }
+    }
+}
+
+/**
+ * internal class that represents one block within a queue. A block holds its
+ * elements in a LinkedList and can be swapped out to a file and back in.
+ *
+ * @author    Clemens Marschner
+ * @created   3. Januar 2002
+ */
+class QueueBlock
+{
+
+
+    /**
+     * the elements of the block; set to null while the block is on disk.
+     * The elements must be Serializable
+     */
+    LinkedList elements;
+
+    /**
+     * number of elements in the block; a copy of elements.size() that stays
+     * available while the block is on disk
+     */
+    int size;
+
+    /**
+     * maximum block size
+     */
+    int maxSize;
+
+    /**
+     * if set, elements is null and the block was written to file
+     */
+    boolean onDisk;
+
+    /**
+     * block name; part of the file name
+     */
+    String name;
+
+
+    /**
+     * initializes the block
+     *
+     * @param name     the block name (must be unique, otherwise blocks
+     *                 collide on file level)
+     * @param maxSize  maximum block size. Overflows and underflows are
+     *                 signalled by exceptions
+     */
+    public QueueBlock(String name, int maxSize)
+    {
+        this.name = name;
+        this.onDisk = false;
+        this.elements = new LinkedList();
+        this.maxSize = maxSize;
+    }
+
+
+    /**
+     * serializes the block and stores it on disk. The in-memory list is only
+     * released after the file was written and closed successfully, so a
+     * failure leaves the block usable in memory.
+     *
+     * @exception StoreException  if the file could not be written
+     */
+    public void store()
+        throws StoreException
+    {
+        FileOutputStream file = null;
+        try
+        {
+            file = new FileOutputStream(getFileName());
+            ObjectOutputStream o = new ObjectOutputStream(file);
+            o.writeObject(elements);
+            o.close();
+            // closing the object stream closed the file as well
+            file = null;
+            elements = null;
+            onDisk = true;
+            //System.out.println("CachingQueue.store: Block stored");
+        }
+        catch (IOException e)
+        {
+            System.err.println("CachingQueue.store: IOException");
+            throw new StoreException(e);
+        }
+        finally
+        {
+            // only reached with an open file if something failed above
+            if (file != null)
+            {
+                try
+                {
+                    file.close();
+                }
+                catch (IOException ignored)
+                {
+                    // the original failure is already being reported
+                }
+            }
+        }
+    }
+
+
+    /**
+     * @return   the filename of the block
+     */
+    String getFileName()
+    {
+        // package protected!
+
+        return "cachingqueue/" + name + ".cqb";
+    }
+
+
+    /**
+     * loads the block from disk and deletes the file afterwards
+     *
+     * @exception StoreException  if the file could not be read or did not
+     *                            contain a LinkedList
+     */
+    public void load()
+        throws StoreException
+    {
+        FileInputStream file = null;
+        try
+        {
+            file = new FileInputStream(getFileName());
+            ObjectInputStream i = new ObjectInputStream(file);
+            LinkedList loaded = (LinkedList) i.readObject();
+            i.close();
+            // closing the object stream closed the file as well
+            file = null;
+            elements = loaded;
+            onDisk = false;
+            size = elements.size();
+            if (!(new File(getFileName()).delete()))
+            {
+                System.err.println("CachingQueue.load: file could not be deleted");
+            }
+            //System.out.println("CachingQueue.load: Block loaded");
+        }
+        catch (Exception e)
+        {
+            System.err.println("CachingQueue.load: Exception " + e.getClass().getName() + " occured");
+            throw new StoreException(e);
+        }
+        finally
+        {
+            // only reached with an open file if something failed above
+            if (file != null)
+            {
+                try
+                {
+                    file.close();
+                }
+                catch (IOException ignored)
+                {
+                    // the original failure is already being reported
+                }
+            }
+        }
+    }
+
+
+    /**
+     * inserts an object at the start of the queue, loading the block from
+     * disk first if necessary. Must be synchronized by the calling class to
+     * be thread safe
+     *
+     * @param o                      the object to insert
+     * @exception StoreException     if the block had to be loaded and loading failed
+     * @exception OverflowException  if the block already holds maxSize elements
+     */
+    public void insert(Object o)
+        throws StoreException
+    {
+        if (onDisk)
+        {
+            load();
+        }
+        if (size >= maxSize)
+        {
+            throw new OverflowException();
+        }
+        elements.addFirst(o);
+        size++;
+    }
+
+
+    /**
+     * removes the last element from the queue and returns it, loading the
+     * block from disk first if necessary. Must be synchronized by the calling
+     * class to be thread safe
+     *
+     * @return                        the removed element
+     * @exception UnderflowException  if the block is empty
+     * @exception StoreException      if the block had to be loaded and loading failed
+     */
+    public Object remove()
+        throws UnderflowException, StoreException
+    {
+        if (onDisk)
+        {
+            load();
+        }
+        if (size <= 0)
+        {
+            throw new UnderflowException();
+        }
+        size--;
+        return elements.removeLast();
+    }
+
+
+    /**
+     * @return   the number of elements in the block
+     */
+    public int size()
+    {
+        return size;
+    }
+
+
+    /**
+     * destructor. Assures that all files are deleted, even if the queue was not
+     * empty at the time when the program ended
+     */
+    public void finalize()
+    {
+        // System.err.println("finalize of " + name + " called");
+        if (onDisk)
+        {
+            // delete the temp file. Happens e.g. if an exception has occurred
+            // System.err.println("CachingQueue.finalize of block " + name + ": deleting file");
+            if (!(new File(getFileName()).delete()))
+            {
+                // a file error is possible after an exception: ignore it
+
+                // System.err.println("CachingQueue.finalize: file could not be deleted although onDisk was true");
+            }
+        }
+    }
+}
+
+
+/**
+ * this class holds a queue whose data is kept on disk whenever possible.
+ * It's a single ended queue, meaning data can only be added at the front and
+ * taken from the back. the queue itself is divided into blocks. Only the first
+ * and last blocks are kept in main memory, the rest is stored on disk. Only a
+ * LinkedList entry is kept in memory then.
+ * Blocks are swapped if an overflow (in case of insertions) or underflow (in case
+ * of removals) occur.
+ * Elements are always added to the end of the list, that is, always at the same place
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.util;
+
+/**
+ * Callback interface through which an {@link ObservableInputStream} reports
+ * what happens on the stream. Every callback receives the reporting stream
+ * and an elapsed time value supplied by that stream.
+ */
+public interface InputStreamObserver
+{
+    /** Reports that the stream was opened. */
+    void notifyOpened(ObservableInputStream in, long timeElapsed);
+
+    /** Reports that the stream was closed. */
+    void notifyClosed(ObservableInputStream in, long timeElapsed);
+
+    /** Reports a read of nrRead bytes; totalRead is the running total. */
+    void notifyRead(ObservableInputStream in, long timeElapsed, int nrRead, int totalRead);
+
+    /** Reports that reading has finished after totalRead bytes. */
+    void notifyFinished(ObservableInputStream in, long timeElapsed, int totalRead);
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java
new file mode 100644
index 00000000000..2564b661c14
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java
@@ -0,0 +1,19 @@
+/*
+ *
+ *
+ *
+ */
+package de.lanlab.larm.util;
+
+import java.io.*;
+
+/**
+ * Unfinished stub: the constructor is empty and the stream field is never
+ * assigned or used.
+ */
+public class Logger
+{
+ // never assigned in this class
+ private FileOutputStream out;
+
+ /**
+  * Does nothing; the file name is ignored.
+  *
+  * @param fileName not used
+  */
+ public Logger(String fileName)
+ {
+
+ }
+
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java
new file mode 100644
index 00000000000..d261d2bd75d
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java
@@ -0,0 +1,101 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.util;
+
+import java.io.*;
+
+/**
+ * An input stream filter that reports opening, reading progress, the end of
+ * the stream and closing to an {@link InputStreamObserver}. All times passed
+ * to the observer are milliseconds since this object was constructed.
+ */
+public class ObservableInputStream extends FilterInputStream
+{
+    private boolean reporting = true;
+    private long startTime;
+    /** number of bytes read so far while reporting was switched on */
+    private int totalRead = 0;
+    /** distance in bytes between two notifyRead calls */
+    private int step = 1;
+    /** value that totalRead has to exceed for the next notifyRead call */
+    private int nextStep = 0;
+
+    InputStreamObserver observer;
+
+    /**
+     * Wraps the stream and immediately reports it as opened.
+     *
+     * @param in             the stream to read from
+     * @param iso            the observer to report to
+     * @param reportingStep  number of bytes between two progress reports
+     */
+    public ObservableInputStream(InputStream in, InputStreamObserver iso, int reportingStep)
+    {
+        super(in);
+        startTime = System.currentTimeMillis();
+        observer = iso;
+        observer.notifyOpened(this, System.currentTimeMillis() - startTime);
+        nextStep = step = reportingStep;
+    }
+
+    /**
+     * Closes the underlying stream and reports the close to the observer.
+     */
+    public void close() throws IOException
+    {
+        super.close();
+        observer.notifyClosed(this, System.currentTimeMillis() - startTime);
+    }
+
+    /**
+     * Switches progress and end-of-stream reporting on or off. Opening and
+     * closing are reported regardless of this setting.
+     */
+    public void setReporting(boolean reporting)
+    {
+        this.reporting = reporting;
+    }
+
+    public boolean isReporting()
+    {
+        return reporting;
+    }
+
+    /**
+     * Changes the distance between progress reports. The threshold for the
+     * next report is not recomputed; the new step applies after that report.
+     */
+    public void setReportingStep(int step)
+    {
+        this.step = step;
+    }
+
+    /**
+     * Reads one byte and reports it, or reports the end of the stream.
+     */
+    public int read() throws IOException
+    {
+        int readByte = super.read();
+        if(reporting)
+        {
+            // -1 marks the end of the stream for notifyObserver
+            notifyObserver(readByte >= 0 ? 1 : -1);
+        }
+        return readByte;
+    }
+
+    /**
+     * Reads into the whole array. Delegates to the three-argument variant,
+     * which does the reporting. FilterInputStream.read(byte[]) dispatches to
+     * that overridden variant as well, so reporting here in addition counted
+     * and reported every chunk twice.
+     */
+    public int read(byte[] b) throws IOException
+    {
+        return read(b, 0, b.length);
+    }
+
+    /**
+     * Updates the byte count and informs the observer.
+     *
+     * @param nrRead  result of the read call: the number of bytes read,
+     *                a negative value at the end of the stream, or 0 if
+     *                nothing was read (which is not reported)
+     */
+    private void notifyObserver(int nrRead)
+    {
+        if(nrRead > 0)
+        {
+            totalRead += nrRead;
+            if(totalRead > nextStep)
+            {
+                nextStep += step;
+                observer.notifyRead(this, System.currentTimeMillis() - startTime, nrRead, totalRead);
+            }
+        }
+        else if(nrRead < 0)
+        {
+            observer.notifyFinished(this, System.currentTimeMillis() - startTime, totalRead);
+        }
+        // nrRead == 0 is a zero-length read, not the end of the stream
+    }
+
+    /**
+     * Reads up to size bytes into b starting at offs and reports the result.
+     */
+    public int read(byte[] b, int offs, int size) throws IOException
+    {
+        int nrRead = super.read(b, offs, size);
+        if(reporting)
+        {
+            notifyObserver(nrRead);
+        }
+        return nrRead;
+    }
+}
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java
new file mode 100644
index 00000000000..a81095094da
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java
@@ -0,0 +1,9 @@
+package de.lanlab.larm.util;
+
+
+/**
+ * Empty marker interface without any methods; according to the original
+ * author it is not used.
+ */
+public interface Observer
+{
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java
new file mode 100644
index 00000000000..a1f427e667a
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java
@@ -0,0 +1,15 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM
+ * Description:
+ * Copyright: Copyright (c) 2001
+ * Company: LMU-IP
+ * @author Clemens Marschner
+ * @version 1.0
+ */
+
+
+// Unchecked exception without message or state of its own.
+public class OverflowException extends RuntimeException
+{
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java
new file mode 100644
index 00000000000..26105c3c333
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java
@@ -0,0 +1,20 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+
+import java.util.Collection;
+
+/**
+ * Minimal queue contract: elements are inserted one by one or in bulk and
+ * taken out again with remove().
+ */
+public interface Queue
+{
+    /** Takes one element out of the queue and returns it. */
+    Object remove();
+
+    /** Puts one element into the queue. */
+    void insert(Object o);
+
+    /** Puts all elements of the collection into the queue. */
+    void insertMultiple(Collection c);
+
+    /** Returns the number of elements in the queue. */
+    int size();
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java
new file mode 100644
index 00000000000..2e1cfd4c903
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java
@@ -0,0 +1,285 @@
+/*
+ * @(#)SimpleCharArrayReader.java 1.35 00/02/02
+ *
+ */
+
+package de.lanlab.larm.util;
+import java.io.*;
+
+/**
+ * A
+ * Note that if bytes are simply read from
+ * the resulting input stream, elements
+ *
+ * @return the next byte of data, or
+ * This
+ */
+ // Marks the reader as closed by setting the isClosed flag; nothing else
+ // is released here and no IOException is actually thrown.
+ public void close() throws IOException
+ {
+ isClosed = true;
+ }
+
+ /**
+  * Placeholder for a check that the stream has not been closed.
+  * The body is empty, so calling it has no effect.
+  */
+ private void ensureOpen()
+ {
+ /* This method does nothing for now. Once throws clauses are added
+  * to the I/O methods in this class, it is meant to throw an
+  * IOException if the stream has been closed.
+  */
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLogger.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLogger.java
new file mode 100644
index 00000000000..60cd99b2b58
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLogger.java
@@ -0,0 +1,112 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+import java.io.*;
+import java.util.*;
+import java.text.*;
+
+/**
+ * this class is only used for SPEED. Its log function is not thread safe by
+ * default; use the logThreadSafe variants when several threads share a logger.
+ * It writes through a BufferedWriter to the file logs/&lt;name&gt;.log.
+ * It registers with a logger manager, which can be used to flush several loggers
+ * at once
+ * @todo: including the date slows down a lot
+ *
+ */
+public class SimpleLogger
+{
+    private SimpleDateFormat formatter = new SimpleDateFormat ("HH:mm:ss:SSSS");
+
+    /** the log file; stays null if the file could not be created */
+    Writer logFile;
+
+    /** reused for assembling each line; one reason why log() is not thread safe */
+    StringBuffer buffer = new StringBuffer(1000);
+
+    long startTime = System.currentTimeMillis();
+    /** whether each line is prefixed with the time of day and the elapsed milliseconds */
+    boolean includeDate;
+
+    /** whether the file is flushed after every entry */
+    boolean flushAtOnce = false;
+
+    /**
+     * Creates a logger that prefixes every line with time information.
+     *
+     * @param name  the log name; the file is logs/&lt;name&gt;.log
+     */
+    public SimpleLogger(String name)
+    {
+        init(name, true);
+    }
+
+    /**
+     * @param name         the log name; the file is logs/&lt;name&gt;.log
+     * @param includeDate  whether every line is prefixed with time information
+     */
+    public SimpleLogger(String name, boolean includeDate)
+    {
+        init(name, includeDate);
+    }
+
+    /**
+     * Sets the reference point for the elapsed milliseconds in the line prefix.
+     */
+    public void setStartTime(long startTime)
+    {
+        this.startTime = startTime;
+    }
+
+    public void setFlushAtOnce(boolean flush)
+    {
+        this.flushAtOnce = flush;
+    }
+
+    /** Same as {@link #log(String)}, but serialized on this logger. */
+    public synchronized void logThreadSafe(String text)
+    {
+        log(text);
+    }
+
+    /** Same as {@link #log(Throwable)}, but serialized on this logger. */
+    public synchronized void logThreadSafe(Throwable t)
+    {
+        log(t);
+    }
+
+    /**
+     * Writes one line to the log file. Failures are reported on System.out
+     * and otherwise ignored.
+     *
+     * @param text  the text of the line, without line terminator
+     */
+    public void log(String text)
+    {
+        if(logFile == null)
+        {
+            // the file could not be created in init(); report instead of failing with a NullPointerException
+            System.out.println("Couldn't write to logfile");
+            return;
+        }
+        try
+        {
+            buffer.setLength(0);
+            if(includeDate)
+            {
+                buffer.append(formatter.format(new Date())).append(": ").append(System.currentTimeMillis()-startTime).append(" ms: ");
+            }
+            buffer.append(text).append("\n");
+            logFile.write(buffer.toString());
+            if(flushAtOnce)
+            {
+                logFile.flush();
+            }
+        }
+        catch(IOException e)
+        {
+            System.out.println("Couldn't write to logfile");
+        }
+    }
+
+    /**
+     * Writes the stack trace of the throwable to the log file.
+     *
+     * @param t  the throwable to log
+     */
+    public void log(Throwable t)
+    {
+        if(logFile == null)
+        {
+            System.out.println("Couldn't write to logfile");
+            return;
+        }
+        PrintWriter traceWriter = new PrintWriter(logFile);
+        t.printStackTrace(traceWriter);
+        if(flushAtOnce)
+        {
+            // honour the flush setting for stack traces as well
+            traceWriter.flush();
+        }
+    }
+
+    /**
+     * Flushes buffered entries to the file; does nothing if the file could
+     * not be created.
+     */
+    public void flush() throws IOException
+    {
+        if(logFile != null)
+        {
+            logFile.flush();
+        }
+    }
+
+    /**
+     * Opens the log file and registers this logger with the logger manager.
+     * If the file cannot be created the error is printed, the logger is not
+     * registered, and later log calls only report the failure.
+     */
+    private void init(String name, boolean includeDate)
+    {
+        // this assignment was missing, so the flag passed to the constructors never took effect
+        this.includeDate = includeDate;
+        try
+        {
+            logFile = new BufferedWriter(new FileWriter("logs/" + name + ".log"));
+            SimpleLoggerManager.getInstance().register(this);
+        }
+        catch(IOException e)
+        {
+            System.out.println("IOException while creating logfile " + name + ":");
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java
new file mode 100644
index 00000000000..44717249305
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java
@@ -0,0 +1,65 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+
+import java.util.*;
+import java.io.IOException;
+
+/**
+ * this singleton manages all loggers. It can be used to flush all SimpleLoggers
+ * at once. Registration, flushing and creation of the singleton are safe to
+ * use from several threads.
+ */
+public class SimpleLoggerManager
+{
+    static SimpleLoggerManager instance = null;
+
+    /** the registered SimpleLogger objects; guarded by this */
+    ArrayList logs;
+
+    private SimpleLoggerManager()
+    {
+        logs = new ArrayList();
+    }
+
+    /**
+     * Adds a logger to the set that flush() works on.
+     *
+     * @param logger  the logger to register
+     */
+    public synchronized void register(SimpleLogger logger)
+    {
+        logs.add(logger);
+    }
+
+    /**
+     * Flushes all registered loggers. Every logger is flushed even if an
+     * earlier one fails.
+     *
+     * @exception IOException  the last exception thrown by a logger, if any
+     */
+    public void flush() throws IOException
+    {
+        // work on a snapshot so that loggers registering in the meantime
+        // cannot break the iteration, and no lock is held during file I/O
+        Object[] snapshot;
+        synchronized(this)
+        {
+            snapshot = logs.toArray();
+        }
+        IOException ex = null;
+        for(int i = 0; i < snapshot.length; i++)
+        {
+            try
+            {
+                ((SimpleLogger)snapshot[i]).flush();
+            }
+            catch(IOException e)
+            {
+                ex = e;
+            }
+        }
+        if(ex != null)
+        {
+            throw ex;
+        }
+    }
+
+    /**
+     * Returns the single manager instance, creating it on first use.
+     * Synchronized so that two threads asking at the same time cannot end up
+     * with two managers.
+     *
+     * @return   the manager
+     */
+    public static synchronized SimpleLoggerManager getInstance()
+    {
+        if(instance == null)
+        {
+            instance = new SimpleLoggerManager();
+        }
+        return instance;
+    }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java
new file mode 100644
index 00000000000..a24f9f2e181
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java
@@ -0,0 +1,21 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+package de.lanlab.larm.util;
+
+import java.util.Observable;
+
+/**
+ * An Observable whose setChanged() method is public, so that code outside
+ * the class can mark the object as changed.
+ */
+public class SimpleObservable extends Observable
+{
+
+ /**
+  * Delegates to Observable.setChanged(); overridden only to widen the
+  * visibility from protected to public.
+  */
+ public void setChanged()
+ {
+ super.setChanged();
+ }
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/State.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/State.java
new file mode 100644
index 00000000000..87ae48fe1b6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/State.java
@@ -0,0 +1,91 @@
+package de.lanlab.larm.util;
+
+import java.io.Serializable;
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+
+/**
+ * Thread-safe state information.
+ * The get methods are not synchronized; clone the state object before using them.
+ * If you use a state object in a class, always return a clone.
+ * This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. This map will be valid only during the invocation of the
+ * This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This applies to the current element, and can be called only
+ * during an invocation of This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. This is the main handler for basic document events; it provides
+ * information on roughly the same level as the ESIS in full SGML,
+ * concentrating on logical structure rather than lexical
+ * representation. If you do not set a document handler, then by default all of these
+ * events will simply be ignored. This is the first event called by a
+ * SAX-conformant parser, so you can use it to allocate and
+ * initialise new objects for the document. This is the last event called by a
+ * SAX-conformant parser, so you can use it to finalize and
+ * clean up objects for the document. This will appear only if the XML document contains a
+ * Please note that the information in the Please note that the contents of the array will be
+ * accurate only for the duration of this handler: if you need to
+ * use them elsewhere, you should make your own copy, possible
+ * by constructing a string: Please note that the contents of the array will be
+ * accurate only for the duration of this handler: if you need to
+ * use them elsewhere, you should make your own copy, possible
+ * by constructing a string: XML processing instructions have two parts: a target, which
+ * is a name, followed optionally by data. This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. If you do not set an entity handler, then a parser will
+ * resolve all entities to the suggested system ID, and will take no
+ * action for entity changes. Before loading any entity (including the document entity),
+ * SAX parsers will filter the system identifier through this
+ * callback, and you can return a different system identifier if you
+ * wish, or null to prevent the parser from reading any entity. Whenever the parser switches the entity (URI) that it is reading
+ * from, it will call this handler to report the change. This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. If you do not set an error handler, then a parser will report
+ * warnings to A SAX parser will use this callback to report a condition
+ * that is not serious enough to stop the parse (though you may
+ * still stop the parse if you wish). A SAX parser will use this callback to report a condition
+ * that is serious enough to invalidate the parse, and may not
+ * report all (or any) significant parse events after this. Ordinarily,
+ * you should stop immediately with an exception, but you can continue
+ * to try to collect more errors if you wish. This class is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. This class implements the default behaviour when no handler
+ * is specified (though parsers are not actually required to use
+ * this class). By default, simply return the system ID supplied. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, do nothing. By default, report the warning to System.err. By default, throw an instance of XmlException. This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. All SAX-conformant XML parsers (or their front-end SAX drivers)
+ * must implement this interface, together with a zero-argument
+ * constructor. You can plug three different kinds of callback interfaces into
+ * a basic SAX parser: one for entity handling, one for basic document
+ * events, and one for error reporting. It is not an error to start
+ * a parse without setting any handlers. If you begin a parse without setting an entity handler,
+ * the parser will by default resolve all entities to their
+ * default system IDs. You may begin the parse without setting a handler, but
+ * in that case no document events will be reported. If you begin a parse without setting an error handlers,
+ * warnings will be printed to System.err, and errors will
+ * throw an unspecified exception. Nothing exciting will happen unless you have set handlers. This interface is part of the Java implementation of SAX,
+ * the Simple API for XML. It is free for both commercial and
+ * non-commercial use, and is distributed with no warrantee, real
+ * or implied. This exception is not a required part of SAX, and it is not
+ * referenced in any of the core interfaces. It is used only in
+ * the optional HandlerBase base class, as a means of signalling
+ * parsing errors. Allows direct access to elements (as an alternative to using
+ * Enumerators) for speed.
+ *
+ * Can function as a bag, i.e. it can be created with a mode
+ * which allows the same key to map to multiple entries. In this case
+ * operations get() and remove() operate on the first pair in
+ * the map. Hence to get hold of all values associated with a key it is
+ * necessary to use the direct access to underlying arrays.
+ *
+ * @author Anders Kristensen
+ */
+public class AttrListImpl implements AttributeList {
+ protected Attribute[] elms;
+
+ /**
+ * Number of elements. The elements are held at indices 0 to n in elms.
+ */
+ protected int n = 0;
+
+ /**
+ * Creates an empty attribute list with an initial capacity of 2.
+ */
+ public AttrListImpl() {
+ this(2);
+ }
+
+ /**
+  * Creates an empty attribute list whose backing array starts out with room
+  * for the given number of attributes.
+  *
+  * @param size the initial capacity; must be 1 or greater
+  * @throws IllegalArgumentException if size is zero or negative
+  */
+ public AttrListImpl(int size) {
+     if (size < 1) {
+         throw new IllegalArgumentException("Initial size must be at least 1");
+     }
+     this.elms = new Attribute[size];
+ }
+
+ /**
+  * Looks up the attribute with the given name.
+  *
+  * @param attrName the name to search for
+  * @return the first attribute carrying that name, or null if there is none
+  */
+ public synchronized Attribute getAttribute(String attrName) {
+     int pos = getIndex(attrName);
+     if (pos < 0) {
+         return null;
+     }
+     return elms[pos];
+ }
+
+ /**
+  * Linear search for the first attribute whose name equals the argument.
+  *
+  * @param name the attribute name to look for
+  * @return the index of the first match within elms, or -1 if no entry matches
+  */
+ protected int getIndex(String name) {
+     int pos = 0;
+     while (pos < n) {
+         if (elms[pos].getName().equals(name)) {
+             return pos;
+         }
+         pos++;
+     }
+     return -1;
+ }
+
+ /**
+  * Stores an attribute under its own name. If an attribute with the same name
+  * is already present it is replaced in place, otherwise the new attribute is
+  * appended, growing the backing array when it is full.
+  *
+  * @param attr the attribute to store; its getName() is used as the key
+  * @return the attribute that was replaced, or null if the name was new
+  */
+ public synchronized Attribute setAttribute(Attribute attr) {
+     int i = getIndex(attr.getName());
+     if (i >= 0) {
+         Attribute old = elms[i];
+         elms[i] = attr;
+         return old;
+     }
+
+     if (elms.length == n) {
+         // Double the capacity. The new array must be an Attribute[]: growing
+         // into an AttrImpl[] (as the code used to) makes the copy, or any later
+         // store of a different Attribute implementation, fail with an
+         // ArrayStoreException.
+         Attribute[] grown = new Attribute[elms.length * 2];
+         System.arraycopy(elms, 0, grown, 0, n);
+         elms = grown;
+     }
+     elms[n] = attr;
+     n++;
+     return null;
+ }
+
+ /**
+  * Removes the first attribute with the given name and closes the gap by
+  * shifting all following entries one position to the left.
+  *
+  * @param attrName the name of the attribute to remove
+  * @return the removed attribute, or null if no attribute has that name
+  */
+ public synchronized Attribute remove(String attrName) {
+     int i = getIndex(attrName);
+     if (i < 0) return null;
+     Attribute val = elms[i];
+     System.arraycopy(elms, i+1, elms, i, n-i-1);
+     n--;
+     // After the shift the old last slot still references an attribute (the
+     // removed one if it was last, otherwise a duplicate of the new last
+     // entry); clear it so the array does not keep objects alive needlessly.
+     elms[n] = null;
+     return val;
+ }
+
+ /**
+  * Positional access to the stored attributes.
+  *
+  * @param index a position between 0 (inclusive) and getLength() (exclusive)
+  * @return the attribute stored at that position
+  * @throws IndexOutOfBoundsException if index is outside that range
+  */
+ public synchronized Attribute item(int index) {
+     if (index >= 0 && index < n) {
+         return elms[index];
+     }
+     throw new IndexOutOfBoundsException(""+index);
+ }
+
+ /**
+ * Returns the number of attributes currently stored in this list
+ * (the value of n, not the capacity of the backing array).
+ */
+ public synchronized int getLength() {
+ return n;
+ }
+
+ public synchronized String toString() {
+ StringBuffer sb = new StringBuffer();
+ boolean f = true;
+ int n = getLength();
+
+ sb.append("{ ");
+ for (int i = 0; i < n; i++) {
+ if (f) { f = false; }
+ else { sb.append(", "); }
+ Attribute attr = item(i);
+ sb.append(attr.getName() + '=' + attr);
+ }
+ sb.append(" }");
+ return sb.toString();
+ }
+
+ /**/
+ // for testing
+ public static void main(String[] args) throws Exception {
+ AttrListImpl alist;
+ Attribute attr;
+ java.io.BufferedReader r;
+ java.util.StringTokenizer tok;
+ String op;
+
+ if (args.length > 1) {
+ alist = new AttrListImpl(Integer.parseInt(args[0]));
+ } else {
+ alist = new AttrListImpl();
+ }
+
+ System.out.println(
+ "Enter operations... op's are one of\n"+
+ "put This class has very shallow (no) understanding of HTML. Correct
+ * handling of <p> tags requires some special code as does correct
+ * handling of <li>. This parser doesn't know that an "li" tag can
+ * be terminated by another "li" tag or a "ul" end tag. Hence "li" is
+ * treated as an empty tag here which means that in the generated parse
+ * tree the children of the "li" element are represented as siblings of it.
+ *
+ * @see Tokenizer
+ * @author Anders Kristensen
+ */
+public class Parser implements DocumentHandler {
+ // FIXME: add support for discriminate per-element whitespace handling
+
+ /**
+ * Set of elements which the parser will expect to be empty, i.e. it
+ * will not expect an end tag (e.g. IMG, META HTML elements).
+ * End tags for any of these are ignored...
+ */
+ protected Hashtable emptyElms = new Hashtable();
+
+ /**
+ * Maps element names to a list of names of other elements which
+ * terminate that element. So for example "dt" might be mapped to
+ * ("dt", "dd") and "p" might be mapped to all blocklevel HTML
+ * elements.
+ */
+ protected Hashtable terminators = new Hashtable();
+ protected Tokenizer tok;
+ protected DOM dom;
+ protected Document root;
+ protected Node current;
+
+ /**
+ * Non-fatal errors are written to this PrintStream. Fatal errors
+ * are reported as Exceptions.
+ */
+ PrintStream err = System.err;
+
+ /**
+  * Creates a parser with its own Tokenizer, registers the parser as that
+  * tokenizer's document handler and installs a DOMImpl as the DOM factory.
+  */
+ public Parser() {
+     this.tok = new Tokenizer();
+     this.tok.setDocumentHandler(this);
+     this.dom = new DOMImpl();
+ }
+
+ /**
+  * Replaces the DOM implementation used to create documents.
+  *
+  * @param dom the new DOM implementation
+  * @return the DOM implementation that was installed before this call
+  */
+ public DOM setDOM(DOM dom) {
+     // The parameter shadows the field: read the previous value through
+     // "this", otherwise the method just hands back its own argument.
+     DOM old = this.dom;
+     this.dom = dom;
+     return old;
+ }
+
+ /**
+ * Returns the tokenizer that feeds document events to this parser.
+ */
+ public Tokenizer getTokenizer() {
+ return tok;
+ }
+
+ /**
+ * Add the set of HTML empty elements to the set of tags recognized
+ * as empty tags.
+ */
+ public void addEmptyElms(String[] elms) {
+ for (int i = 0; i < elms.length; i++) {
+ emptyElms.put(elms[i], elms[i]);
+ }
+ }
+
+ /**
+ * Removes all names from the set of elements treated as empty.
+ */
+ public void clearEmptyElmSet() {
+ emptyElms.clear();
+ }
+
+ /**
+  * Tells whether the given element name has been registered as an empty
+  * element.
+  *
+  * @param elmName the element name to test
+  * @return true if the name is in the set of empty elements
+  */
+ public boolean isEmptyElm(String elmName) {
+     // a Hashtable never stores null values, so key presence is equivalent
+     // to a non-null lookup result
+     return emptyElms.containsKey(elmName);
+ }
+
+ public void setElmTerminators(String elmName, String[] elmTerms) {
+ terminators.put(elmName, putIds(new Hashtable(), elmTerms));
+ }
+
+ /**
+  * Adds one more terminating element name for the given element, creating
+  * the element's terminator set if it does not exist yet.
+  *
+  * @param elmName the element whose terminator set is extended
+  * @param elmTerm the name of an element that terminates it
+  */
+ public void addTerminator(String elmName, String elmTerm) {
+     Hashtable terms = (Hashtable) terminators.get(elmName);
+     if (terms == null) {
+         terms = new Hashtable();
+         terminators.put(elmName, terms);
+     }
+     terms.put(elmTerm, elmTerm);
+ }
+
+ public static final Dictionary putIds(Dictionary dict, String[] sary) {
+ for (int i = 0; i < sary.length; i++) {
+ dict.put(sary[i], sary[i]);
+ }
+ return dict;
+ }
+
+ /**
+ * Returns the document node created by the most recent call to parse().
+ */
+ protected Document root() {
+ return root;
+ }
+
+ public Document parse(InputStream in) throws Exception {
+ root = dom.createDocument(null);
+ current = root;
+ tok.parse(in);
+ return root();
+ }
+
+ // Document start/end events need no handling here; both callbacks are
+ // intentionally empty.
+ public void startDocument() {}
+ public void endDocument() {}
+
+ // FIXME: record in root DOCUMENT the id's of elements which have one
+
+ /**
+ * Callback for a document type declaration; the information is ignored.
+ */
+ public void doctype(String name, String publicID, String systemID) {
+ }
+
+ public void startElement(String name, AttributeMap attributes) {
+ //System.out.println("CURRENT: " + current);
+
+ // does this new element terminate the current element?
+ if (current != root) {
+ String tagName = ((Element) current).getTagName();
+ if (tagName != null) {
+ Hashtable terms = (Hashtable) terminators.get(tagName);
+ if (terms != null && terms.get(name) != null) {
+ current = current.getParentNode(); // FIXME: could be null
+ }
+ }
+ }
+
+ Element elm = root.createElement(name, getDOMAttrs(attributes));
+ // FIXME: Allows direct access to elements (as an alternative to using
+ * Enumerators) for speed.
+ *
+ * Can function as a bag, i.e. it can be created with a mode
+ * which allows the same key to map to multiple entries. In this case
+ * operations get() and remove() operate on the first pair in
+ * the map. Hence to get hold of all values associated with a key it is
+ * necessary to use the direct access to underlying arrays.
+ *
+ * @author Anders Kristensen
+ */
+public class SAXAttributeMap implements AttributeMap {
+
+ /** The list of keys. */
+ public String[] keys;
+
+ /** List of values associated with keys. */
+ public String[] elms;
+
+ /**
+ * Number of elements in the Dictionary.
+ * The elements are held at indices 0 to n in the keys and elms arrays.
+ */
+ public int n = 0;
+
+ /**
+ * Creates an empty map with an initial capacity of 5 entries.
+ */
+ public SAXAttributeMap() {
+ this(5);
+ }
+
+ /**
+  * Creates an empty map whose key and value arrays start out with the
+  * specified capacity.
+  *
+  * @param size the initial capacity; must be 1 or greater
+  * @throws IllegalArgumentException if size is zero or negative
+  */
+ public SAXAttributeMap(int size) {
+     if (size < 1) {
+         throw new IllegalArgumentException("Initial size must be at least 1");
+     }
+     this.keys = new String[size];
+     this.elms = new String[size];
+ }
+
+ /** Returns the number of key/value pairs currently stored (the value of n). */
+ public synchronized int size() {
+ return n;
+ }
+
+ /** Returns true if this map currently holds no key/value pairs. */
+ public synchronized boolean isEmpty() {
+ return size() == 0;
+ }
+
+ /**
+ * Returns an enumeration of the keys (attribute names) in this map.
+ * The enumeration is created from the keys array and the current entry
+ * count n.
+ * NOTE(review): unlike the other accessors this method is not synchronized,
+ * and the enumeration is handed the live keys array - confirm whether callers
+ * may modify the map while enumerating.
+ */
+ public Enumeration getAttributeNames() {
+ return new SAXAttributeEnum(keys, n);
+ }
+
+ /**
+  * Looks up the value stored for a key.
+  *
+  * @param key the attribute name to look up
+  * @return the value of the first entry with that key, or null if the key
+  *         is not present
+  */
+ public synchronized String getValue(String key) {
+     int pos = getIndex(key);
+     if (pos < 0) {
+         return null;
+     }
+     return elms[pos];
+ }
+
+ /**
+  * Linear search for the first stored key that equals the argument.
+  *
+  * @param key the key to search for
+  * @return the index of the first match in the keys array, or -1 if absent
+  */
+ protected int getIndex(String key) {
+     int pos = 0;
+     while (pos < n) {
+         if (keys[pos].equals(key)) {
+             return pos;
+         }
+         pos++;
+     }
+     return -1;
+ }
+
+ /**
+  * Maps the specified key to the specified value. Neither the key nor the
+  * value can be null. If the key is already present its value is replaced,
+  * otherwise a new entry is appended, doubling the arrays when they are full.
+  *
+  * The value can be retrieved by calling getValue with a key that is equal
+  * to the original key.
+  *
+  * @param key   the attribute name
+  * @param value the attribute value
+  * @return the previous value to which the key was mapped, or null if the
+  *         key did not have a previous mapping
+  * @throws NullPointerException if the key or value is null
+  */
+ public synchronized String put(String key, String value) {
+     if (value == null) throw new NullPointerException("value is null");
+     // Enforce the documented contract for keys as well: a stored null key
+     // would make every later getIndex() call fail inside keys[i].equals().
+     if (key == null) throw new NullPointerException("key is null");
+     int i = getIndex(key);
+     if (i >= 0) {
+         String old = elms[i];
+         elms[i] = value;
+         return old;
+     }
+     int len = keys.length;
+     if (len == n) {
+         // double the size of the keys and elms arrays
+         String[] k = new String[len * 2];
+         String[] e = new String[len * 2];
+         System.arraycopy(keys, 0, k, 0, len);
+         System.arraycopy(elms, 0, e, 0, len);
+         keys = k;
+         elms = e;
+     }
+     keys[n] = key;
+     elms[n] = value;
+     n++;
+     return null;
+ }
+
+ /**
+  * Empties the map by resetting the entry count; the arrays keep their
+  * capacity and their old contents are overwritten by later put() calls.
+  * Synchronized like every other method that reads or changes n, so that
+  * a clear cannot interleave with a running put() or lookup.
+  */
+ public synchronized void clear() {
+     n = 0;
+ }
+
+ // Attribute type queries: this implementation keeps no type information, so
+ // every boolean query answers false and every ID/name query answers null.
+ public boolean isEntity (String aname) { return false; }
+ public boolean isNotation (String aname) { return false; }
+ public boolean isId (String aname) { return false; }
+ public boolean isIdref (String aname) { return false; }
+ public String getEntityPublicID (String aname) { return null; }
+ public String getEntitySystemID (String aname) { return null; }
+ public String getNotationName (String aname) { return null; }
+ public String getNotationPublicID (String aname) { return null; }
+ public String getNotationSystemID (String aname) { return null; }
+
+ public synchronized String toString() {
+ StringBuffer sb = new StringBuffer();
+ boolean f = true;
+
+ sb.append("{ ");
+ for (Enumeration e = getAttributeNames(); e.hasMoreElements(); ) {
+ if (f) { f = false; }
+ else { sb.append(", "); }
+ String key = (String) e.nextElement();
+ sb.append("" + key + '=' + getValue(key));
+ }
+ sb.append(" }");
+ return sb.toString();
+ }
+
+ /*
+ // for testing
+ public static void main(String[] args) throws Exception {
+ SAXAttributeMap d;
+ java.io.BufferedReader r;
+ java.util.StringTokenizer tok;
+ String op;
+
+ if (args.length > 1) {
+ d = new SAXAttributeMap(Integer.parseInt(args[0]));
+ } else {
+ d = new SAXAttributeMap();
+ }
+
+ System.out.println(
+ "Enter operations... op's are one of\n"+
+ "put For an example use see UrlScanner.
+ *
+ * @see HtmlObserver
+ * @see UrlScanner
+ * @author Anders Kristensen
+ */
+public class HtmlScanner extends HandlerBase {
+ HtmlObserver observer;
+ URL contextURL;
+ Object data;
+ Tokenizer tok;
+ Reader in;
+
+ /**
+ * Creates a scanner for the document found at the given URL. The URL's
+ * stream is opened here and wrapped in a BufferedReader; nothing is read
+ * until parse() is called. The URL also serves as the context URL that is
+ * passed to the observer.
+ * NOTE(review): the InputStreamReader is created without a charset (platform
+ * default encoding), and nothing visible here closes the opened stream.
+ * @param url the URL to open and scan for links
+ * @param observer the callback object
+ * @throws Exception if the stream of the URL cannot be opened
+ */
+ public HtmlScanner(URL url, HtmlObserver observer ) throws Exception {
+ this(new BufferedReader(new InputStreamReader(url.openStream())), url, observer);
+ }
+
+ /**
+ * Creates a scanner that reads HTML from the given input stream; the stream
+ * is wrapped in a BufferedReader (platform default encoding) and no client
+ * data (null) is passed to the observer.
+ * @param in the input stream
+ * @param url the URL corresponding to this document
+ * @param observer the callback object
+ * @throws Exception declared by the constructor this one delegates to
+ * @deprecated use HtmlScanner(Reader, URL, HtmlObserver) instead
+ */
+ public HtmlScanner(InputStream in, URL url, HtmlObserver observer)
+ throws Exception
+ {
+ this(new BufferedReader(new InputStreamReader(in)), url, observer, null);
+ }
+
+ /**
+ * Creates a scanner that reads HTML from the given Reader and reports links
+ * to the observer; no client data (null) is passed back in callbacks.
+ * @param in the Reader
+ * @param url the URL corresponding to this document
+ * @param observer the callback object
+ * @throws Exception declared by the constructor this one delegates to
+ */
+ public HtmlScanner(Reader in, URL url, HtmlObserver observer)
+ throws Exception
+ {
+ this(in, url, observer, null);
+ }
+
+ /**
+ * Creates a scanner that reads HTML from the given input stream; the stream
+ * is wrapped in a BufferedReader using the platform default encoding.
+ * @param in the input stream
+ * @param url the URL corresponding to this document
+ * @param observer the callback object
+ * @param data client-specific data that is passed back in callbacks
+ * @throws Exception declared by the constructor this one delegates to
+ * @deprecated use HtmlScanner(Reader, URL, HtmlObserver, Object) instead
+ */
+ public HtmlScanner(InputStream in, URL url, HtmlObserver observer, Object data)
+ throws Exception
+ {
+ this(new BufferedReader(new InputStreamReader(in)), url, observer, data);
+ }
+
+ /**
+  * Sets up a scanner that will read HTML from the given Reader and invoke
+  * the observer as links are encountered. Nothing is read here; scanning
+  * starts when parse() is called. The tokenizer is created, this scanner is
+  * registered as its document handler, the HTML hacks are applied, and then
+  * entity and CDATA recognition are switched off and atomizing is switched on.
+  *
+  * @param in       the Reader delivering the document
+  * @param url      the URL corresponding to this document
+  * @param observer the callback object
+  * @param data     client-specific data; it is passed back to the client in
+  *                 callbacks and not used otherwise
+  * @throws Exception declared for callers; see parse() for the errors that
+  *                   occur while scanning
+  */
+ public HtmlScanner(Reader in, URL url, HtmlObserver observer, Object data)
+     throws Exception
+ {
+     this.data = data;
+     this.contextURL = url;
+     this.observer = observer;
+     this.in = in;
+
+     this.tok = new Tokenizer();
+     setDocumentHandler(this);
+     HTML.applyHacks(this.tok);
+
+     // these settings must come after applyHacks so that they win
+     this.tok.atomize = true;
+     this.tok.rcgnzCDATA = false;
+     this.tok.rcgnzEntities = false;
+ }
+
+ /**
+ * Installs the handler that receives the tokenizer's document events.
+ * The constructor registers this scanner itself through this method.
+ */
+ public void setDocumentHandler(DocumentHandler doc)
+ {
+ tok.setDocumentHandler(doc);
+ }
+
+ /**
+ * Passes the given entity handler on to the underlying tokenizer.
+ */
+ public void setEntityHandler(EntityHandler ent)
+ {
+ tok.setEntityHandler(ent);
+ }
+
+ /**
+ * Passes the given error handler on to the underlying tokenizer.
+ */
+ public void setErrorHandler(ErrorHandler err)
+ {
+ tok.setErrorHandler(err);
+ }
+
+ /**
+ * Runs the tokenizer over the Reader given at construction time; the
+ * registered handlers are called back while the input is read.
+ * @throws Exception whatever the tokenizer throws while parsing
+ */
+ public void parse() throws Exception
+ {
+ tok.parse(in);
+ }
+
+ public void startElement(String name, AttributeMap attributes) {
+ String val;
+
+ if (name == HTML.A) {
+ if ((val = attributes.getValue("href")) != null) {
+ observer.gotAHref(val, contextURL, data);
+ }
+ } else if (name == HTML.IMG) {
+ if ((val = attributes.getValue("src")) != null) {
+ observer.gotImgSrc(val, contextURL, data);
+ }
+ } else if (name == HTML.BASE) {
+ if ((val = attributes.getValue("href")) != null) {
+ observer.gotBaseHref(val, contextURL, data);
+ if (contextURL != null) {
+ try {
+ contextURL = new URL(contextURL, val);
+ } catch (MalformedURLException ex) {
+ System.err.println("Bad putMessages
+ * (use the latter whenever possible).
+ * The messages are passed to the filters in the order in which the filters were
+ * added to the handler.
+ * They can consume the message by returning null. Otherwise, they return a Message
+ * object, usually the one they got.
+ * The filters will run synchronously within the message handler thread
+ * This implements a chain of responsibility-style message handling
+ */
+public class MessageHandler implements Runnable
+{
+    // Threading model: producers call putMessage()/putMessages() from any
+    // thread; a single handler thread (started in the constructor) removes
+    // the messages and passes them through the listeners. The queue and the
+    // messagesWaiting flag are only touched while holding queueMonitor, and
+    // the handler thread only waits while the flag is false. Observers and
+    // listeners are always called without holding the monitor.
+
+    /**
+     * the queue where messages are put in.
+     * Holds max. 2 x 5000 = 10.000 messages in RAM
+     */
+    private CachingQueue messageQueue = new CachingQueue("fetcherURLMessageQueue", 5000);
+
+    /**
+     * list of MessageListeners (filters), in the order they were added
+     */
+    private LinkedList listeners = new LinkedList();
+
+    /**
+     * true as long as the thread is running
+     */
+    private boolean running = true;
+
+    /**
+     * the message handler thread
+     */
+    private Thread t;
+
+    /**
+     * flag for thread communication: true while the queue is believed to
+     * contain messages. Guarded by queueMonitor.
+     */
+    boolean messagesWaiting = false;
+
+    /**
+     * true when a message is processed by the filters; volatile because it is
+     * written by the handler thread and read by other threads through
+     * isWorkingOnMessage()
+     */
+    volatile boolean workingOnMessage = false;
+
+    /**
+     * monitor that guards the queue and the messagesWaiting flag and on which
+     * the handler thread waits for new messages
+     */
+    Object queueMonitor = new Object();
+
+    SimpleObservable messageQueueObservable = new SimpleObservable();
+    SimpleObservable messageProcessorObservable = new SimpleObservable();
+
+    /**
+     * @return true while the handler thread is not waiting for new messages
+     */
+    public boolean isWorkingOnMessage()
+    {
+        return workingOnMessage;
+    }
+
+    /**
+     * creates and starts the message handler thread
+     */
+    MessageHandler()
+    {
+        t = new Thread(this,"MessageHandler Thread");
+        t.setPriority(5); // higher priority to prevent starving when a lot of fetcher threads are used
+        t.start();
+    }
+
+    /**
+     * join messageHandler-Thread
+     */
+    public void finalize()
+    {
+        if(t != null)
+        {
+            try
+            {
+                t.join();
+                t = null;
+            }
+            catch(InterruptedException e) {}
+        }
+    }
+
+    /**
+     * registers a filter to the message handler. The listener is told about
+     * the registration before it is appended to the chain.
+     * @param m the listener
+     */
+    public void addListener(MessageListener m)
+    {
+        m.notifyAddedToMessageHandler(this);
+        listeners.addLast(m);
+    }
+
+    /**
+     * registers a MessageQueueObserver
+     * It will be notified whenever a message is put into the queue (parameter is Integer(1)) or
+     * removed (parameter is Integer(-1))
+     * @param o the Observer
+     */
+    public void addMessageQueueObserver(Observer o)
+    {
+        messageQueueObservable.addObserver(o);
+    }
+
+    /**
+     * adds a message processor observer
+     * It will be notified when a message is consumed. In this case the parameter
+     * is the filter that consumed the message
+     * @param o the Observer
+     */
+    public void addMessageProcessorObserver(Observer o)
+    {
+        messageProcessorObservable.addObserver(o);
+    }
+
+
+    /**
+     * writes a message into the queue and wakes up the handler thread
+     * @param msg the message to enqueue
+     */
+    public void putMessage(Message msg)
+    {
+        synchronized(queueMonitor)
+        {
+            // Insert and flag update form one atomic step. With the insert
+            // outside the monitor the handler could consume the message and
+            // clear the flag before it was set here, leaving the flag true on
+            // an empty queue (the UnderflowException case in run()).
+            messageQueue.insert(msg);
+            messagesWaiting = true;
+            queueMonitor.notify();
+        }
+        messageQueueObservable.setChanged();
+        messageQueueObservable.notifyObservers(new Integer(1));
+    }
+
+    /**
+     * add a collection of events to the message queue
+     * @param msgs a collection of Message objects
+     */
+    public void putMessages(Collection msgs)
+    {
+        synchronized(queueMonitor)
+        {
+            for(Iterator i = msgs.iterator(); i.hasNext();)
+            {
+                Message msg = (Message)i.next();
+                messageQueue.insert(msg);
+            }
+            // only signal when there really is something to fetch (an empty
+            // collection must not set the flag)
+            if(messageQueue.size() > 0)
+            {
+                messagesWaiting = true;
+                queueMonitor.notify();
+            }
+        }
+        // NOTE(review): observers are told Integer(1) once per call, not once
+        // per message, while every removal reports Integer(-1); an observer
+        // that sums these values will drift. Confirm what observers expect.
+        messageQueueObservable.setChanged();
+        messageQueueObservable.notifyObservers(new Integer(1));
+    }
+
+    /**
+     * the main messageHandler-Thread: waits until messages are available,
+     * then removes them one by one and hands each to the listeners.
+     */
+    public void run()
+    {
+        while(running)
+        {
+            synchronized(queueMonitor)
+            {
+                // wait for new messages. The flag is re-checked in a loop so
+                // that a notify() sent while this thread was still busy (or a
+                // spurious wakeup) can neither be lost nor misinterpreted.
+                workingOnMessage=false;
+                while(!messagesWaiting && running)
+                {
+                    try
+                    {
+                        queueMonitor.wait();
+                    }
+                    catch(InterruptedException e)
+                    {
+                        System.out.println("MessageHandler: Caught InterruptedException");
+                    }
+                }
+                workingOnMessage=true;
+            }
+            try
+            {
+                while(true)
+                {
+                    Message m;
+                    synchronized(this.queueMonitor)
+                    {
+                        if(!messagesWaiting)
+                        {
+                            break;
+                        }
+                        m = (Message)messageQueue.remove();
+                        if(messageQueue.size() == 0)
+                        {
+                            messagesWaiting = false;
+                        }
+                    }
+
+                    messageQueueObservable.setChanged();
+                    messageQueueObservable.notifyObservers(new Integer(-1)); // Message processed
+
+                    dispatch(m);
+                }
+            }
+            catch (ClassCastException e)
+            {
+                System.out.println("MessageHandler:run: ClassCastException: " + e.getMessage());
+            }
+            catch (UnderflowException e)
+            {
+                // defensive: the flag claimed there were messages although the
+                // queue was empty; reset it so the thread goes back to waiting
+                synchronized(queueMonitor)
+                {
+                    messagesWaiting = false;
+                }
+            }
+            catch (Exception e)
+            {
+                System.out.println("MessageHandler: " + e.getClass() + " " + e.getMessage());
+                e.printStackTrace();
+            }
+
+        }
+    }
+
+    /**
+     * passes one message through the listeners in the order in which they
+     * were registered. Each listener may return the message, a changed or new
+     * message, or null; null means the message was consumed and ends the chain.
+     * @param m the message taken from the queue
+     */
+    private void dispatch(Message m)
+    {
+        Iterator i = listeners.iterator();
+        while(i.hasNext())
+        {
+            try
+            {
+                MessageListener listener = (MessageListener)i.next();
+                m = (Message)listener.handleRequest(m);
+                if (m == null)
+                {
+                    messageProcessorObservable.setChanged();
+                    messageProcessorObservable.notifyObservers(listener);
+                    break; // the listener has consumed the message
+                }
+            }
+            catch(ClassCastException e)
+            {
+                System.out.println("MessageHandler:run: ClassCastException(2): " + e.getMessage());
+            }
+        }
+    }
+
+    /**
+     * @return the number of messages currently in the queue
+     */
+    public int getQueued()
+    {
+        return messageQueue.size();
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java
new file mode 100644
index 00000000000..f39681cbdbf
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java
@@ -0,0 +1,36 @@
+/*
+ * LARM - LANLab Retrieval Machine
+ *
+ * $history: $
+ *
+ *
+ */
+package de.lanlab.larm.fetcher;
+
+/**
+ * A filter that works on the messages of a message queue. A listener normally
+ * hands the message it received back so that processing can continue, but it
+ * may also return a changed or an entirely new message. Returning null means
+ * the message was consumed; the message handler then stops passing it on.
+ *
+ * @author Administrator
+ * @created 24. November 2001
+ */
+public interface MessageListener
+{
+    /**
+     * Processes a single message.
+     *
+     * @param message the message to be handled
+     * @return the message to continue with (usually the one passed in), or
+     *         null if this listener consumed the message
+     */
+    Message handleRequest(Message message);
+
+
+    /**
+     * Callback invoked when this listener is added to a message handler.
+     *
+     * @param handler the message handler the listener was added to
+     */
+    void notifyAddedToMessageHandler(MessageHandler handler);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java
new file mode 100644
index 00000000000..35158d4f53d
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java
@@ -0,0 +1,429 @@
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Note: this is experimental stuff. get into the source code to see how it works
+ * @param args args[0] must point to the store.log file
+ */
+ public static void main(String[] args)
+ {
+ // Syntax: DistanceCount
+ * notes: experimental; slow
+ */
+public class SQLServerStorage implements DocumentStorage
+{
+
+ private Vector freeCons;
+ private Vector busyCons;
+
+ private Vector freeStatements;
+ private Vector busyStatements;
+
+ private PreparedStatement addDoc;
+
+ public SQLServerStorage(String driver, String connectionString, String account, String password, int nrConnections)
+ {
+ try
+ {
+ Class.forName(driver);
+ freeCons = new Vector(nrConnections);
+ busyCons = new Vector(nrConnections);
+ freeStatements = new Vector(nrConnections);
+ busyStatements = new Vector(nrConnections);
+
+ Connection sqlConn;
+ PreparedStatement statement;
+ for(int i=0; i
+/**
+ * A pool of ServerThreads that run InterruptableTasks. A task is handed to an
+ * idle thread if there is one; otherwise it is put into a TaskQueue and given
+ * to the next thread that reports itself ready through taskReady(). Registered
+ * ThreadPoolObservers are told about every thread and queue status change.
+ *
+ * This thread pool is based on an article in Java-Magazin 06/2000.
+ *
+ * The methods that move threads between the idle and the busy list
+ * (createThread, restartThread, doTask, interrupt, stop, taskReady) are
+ * synchronized on the pool. waitForFinish() waits on the monitor of the idle
+ * thread list, which taskReady() notifies.
+ */
+public class ThreadPool implements ThreadingStrategy, TaskReadyListener {
+    /**
+     * number of threads that init() creates
+     */
+    private int maxThreads = MAX_THREADS;
+    /**
+     * references to all threads are stored here, keyed by thread number
+     */
+    private HashMap allThreads = new HashMap();
+    /**
+     * this vector takes all idle threads
+     */
+    private Vector idleThreads = new Vector();
+    /**
+     * this vector takes all threads that are in operation (busy)
+     */
+    private Vector busyThreads = new Vector();
+
+    /**
+     * if there are no idleThreads, tasks will go here
+     */
+    private TaskQueue queue = new TaskQueue();
+
+    /**
+     * thread pool observers will be notified of status changes
+     */
+    private Vector threadPoolObservers = new Vector();
+
+    /**
+     * set by stop(); threads that report ready afterwards are interrupted
+     */
+    private boolean isStopped = false;
+
+    /**
+     * default maximum number of threads, if not given by the user
+     */
+    public final static int MAX_THREADS = 5;
+
+    /**
+     * thread was created
+     */
+    public final static String THREAD_CREATE = "T_CREATE";
+    /**
+     * thread was started on a task
+     */
+    public final static String THREAD_START = "T_START";
+    /**
+     * thread is running
+     */
+    public final static String THREAD_RUNNING = "T_RUNNING";
+    /**
+     * thread was stopped
+     */
+    public final static String THREAD_STOP = "T_STOP";
+    /**
+     * thread was destroyed
+     */
+    public final static String THREAD_END = "T_END";
+    /**
+     * thread is idle
+     */
+    public final static String THREAD_IDLE = "T_IDLE";
+
+    /**
+     * a task was added to the queue, because all threads were busy
+     */
+    public final static String THREADQUEUE_ADD = "TQ_ADD";
+
+    /**
+     * a task was removed from the queue, because a thread had finished and was
+     * ready
+     */
+    public final static String THREADQUEUE_REMOVE = "TQ_REMOVE";
+
+    /**
+     * this factory will create the threads
+     */
+    ThreadFactory factory;
+
+
+    /**
+     * this constructor will create the pool with MAX_THREADS threads and the
+     * default factory
+     */
+    public ThreadPool() {
+        this(MAX_THREADS, new ThreadFactory());
+    }
+
+
+    /**
+     * this constructor will create the pool with the default factory
+     *
+     *@param max  the maximum number of threads
+     */
+    public ThreadPool(int max) {
+        this(max, new ThreadFactory());
+    }
+
+
+    /**
+     * constructor. No thread is created here; call init() for that.
+     *
+     *@param max      maximum number of threads
+     *@param factory  the thread factory with which the threads will be created
+     */
+    public ThreadPool(int max, ThreadFactory factory) {
+        maxThreads = max;
+        this.factory = factory;
+    }
+
+
+    /**
+     * this init method will create the threads, numbered 0 to max-1. It must
+     * be called by hand
+     */
+    public void init() {
+        for (int i = 0; i < maxThreads; i++) {
+            createThread(i);
+        }
+    }
+
+
+    /**
+     * Creates the thread with the given number through the factory, registers
+     * it as idle, subscribes the pool as its TaskReadyListener and starts it.
+     * Observers receive THREAD_CREATE before and THREAD_IDLE after the start.
+     *
+     *@param i  the thread number
+     */
+    public synchronized void createThread(int i) {
+        ServerThread s = factory.createServerThread(i);
+        idleThreads.add(s);
+        allThreads.put(new Integer(i), s);
+        s.addTaskReadyListener(this);
+        sendMessage(i, THREAD_CREATE, "");
+        s.start();
+        sendMessage(i, THREAD_IDLE, "");
+    }
+
+
+    /**
+     * Replaces the thread with the given number: the old thread is taken out
+     * of all lists, its task and the thread itself are interrupted, and a new
+     * thread with the same number is created. The method does not wait for
+     * the old thread to end.
+     *
+     * NOTE(review): if the interrupted thread still calls taskReady()
+     * afterwards it would be put back into the idle list - confirm against
+     * ServerThread.
+     *
+     *@param i  the thread number
+     *@throws IllegalArgumentException  if no thread is registered under i
+     */
+    public synchronized void restartThread(int i) {
+        Integer key = new Integer(i);
+        ServerThread t = (ServerThread) allThreads.get(key);
+        if (t == null) {
+            // fail before any list is touched or any observer is told
+            throw new IllegalArgumentException("ThreadPool: no thread with number " + i);
+        }
+        sendMessage(i, THREAD_STOP, "");
+        idleThreads.remove(t);
+        busyThreads.remove(t);
+        allThreads.remove(key);
+        t.interruptTask();
+        t.interrupt();
+        createThread(i);
+    }
+
+
+    /**
+     * Runs the task on the first idle thread. If no thread is idle the task is
+     * inserted into the queue and will be picked up in taskReady().
+     *
+     *@param t    the task to run
+     *@param key  not used by this implementation
+     */
+    public synchronized void doTask(InterruptableTask t, Object key) {
+        if (!idleThreads.isEmpty()) {
+            ServerThread s = (ServerThread) idleThreads.firstElement();
+            idleThreads.remove(s);
+            busyThreads.add(s);
+            sendMessage(s.getThreadNumber(), THREAD_START, t.getInfo());
+            s.runTask(t);
+            sendMessage(s.getThreadNumber(), THREAD_RUNNING, t.getInfo());
+        } else {
+            queue.insert(t);
+            sendMessage(-1, THREADQUEUE_ADD, t.getInfo());
+        }
+    }
+
+
+    /**
+     * this will interrupt all queued tasks, clear the queue and interrupt the
+     * tasks of all busy threads. Therefore the InterruptableTasks must attend
+     * on the interrupted-flag. Synchronized so that the lists cannot change
+     * under the iterators while worker threads call taskReady().
+     */
+    public synchronized void interrupt() {
+        Iterator tasks = queue.iterator();
+        while (tasks.hasNext()) {
+            InterruptableTask t = (InterruptableTask) tasks.next();
+            t.interrupt();
+            // hoping that everything works out...
+            sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo());
+        }
+        queue.clear();
+        Iterator threads = busyThreads.iterator();
+        while (threads.hasNext()) {
+            ((ServerThread) threads.next()).interruptTask();
+        }
+    }
+
+
+    /**
+     * this will interrupt the tasks and end all threads: the pool is marked
+     * as stopped, interrupt() is called, and the idle threads are interrupted
+     * and removed from the idle list. Busy threads are interrupted when they
+     * report ready (see taskReady()).
+     */
+    public synchronized void stop() {
+        isStopped = true;
+        interrupt();
+        Iterator threads = idleThreads.iterator();
+        while (threads.hasNext()) {
+            ((ServerThread) threads.next()).interruptTask();
+        }
+        idleThreads.clear();
+    }
+
+
+    /**
+     * called by a ServerThread when it has finished its task. If the pool was
+     * stopped the thread is interrupted; otherwise it gets the next queued
+     * task or, if the queue is empty, becomes idle. All threads waiting in
+     * waitForFinish() are woken up afterwards.
+     *
+     *@param s  the calling thread
+     */
+    public synchronized void taskReady(ServerThread s) {
+        if (isStopped) {
+            s.interrupt();
+            sendMessage(s.getThreadNumber(), THREAD_STOP, s.getTask().getInfo());
+            busyThreads.remove(s);
+        } else if (!queue.isEmpty()) {
+            InterruptableTask t = (InterruptableTask) queue.remove();
+            sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo());
+            sendMessage(s.getThreadNumber(), THREAD_START, "");
+            s.runTask(t);
+            sendMessage(s.getThreadNumber(), THREAD_RUNNING, s.getTask().getInfo());
+        } else {
+            sendMessage(s.getThreadNumber(), THREAD_IDLE, "");
+            idleThreads.add(s);
+            busyThreads.remove(s);
+        }
+        synchronized (idleThreads) {
+            // notifyAll: several threads may be blocked in waitForFinish()
+            idleThreads.notifyAll();
+        }
+    }
+
+
+    /**
+     * Blocks until no thread is busy any more. An interrupt does not end the
+     * wait; it is logged and the interrupt status of the calling thread is
+     * restored before the method returns.
+     */
+    public void waitForFinish() {
+        boolean interrupted = false;
+        synchronized (idleThreads) {
+            while (busyThreads.size() != 0) {
+                try {
+                    idleThreads.wait();
+                } catch (InterruptedException e) {
+                    System.out.println("Interrupted: " + e.getMessage());
+                    // re-interrupting here would make wait() throw at once,
+                    // so the flag is restored after the loop
+                    interrupted = true;
+                }
+            }
+        }
+        if (interrupted) {
+            Thread.currentThread().interrupt();
+        }
+    }
+
+
+    /**
+     * Registers an observer that will be informed through sendMessage().
+     *
+     *@param o  the observer to add
+     */
+    public void addThreadPoolObserver(ThreadPoolObserver o) {
+        threadPoolObservers.add(o);
+    }
+
+
+    /**
+     * Informs all observers. A thread number of -1 marks a queue event, which
+     * is delivered through queueUpdate(info, action); every other number is
+     * delivered through threadUpdate(threadNr, action, info).
+     *
+     *@param threadNr  the thread number, or -1 for a queue event
+     *@param action    one of the THREAD_* or THREADQUEUE_* constants
+     *@param info      additional text, e.g. the info string of the task
+     */
+    protected void sendMessage(int threadNr, String action, String info) {
+        Iterator observers = threadPoolObservers.iterator();
+        if (threadNr != -1) {
+            while (observers.hasNext()) {
+                ((ThreadPoolObserver) observers.next()).threadUpdate(threadNr, action, info);
+            }
+        } else {
+            while (observers.hasNext()) {
+                ((ThreadPoolObserver) observers.next()).queueUpdate(info, action);
+            }
+        }
+    }
+
+
+    /**
+     * Gets the number of tasks waiting in the queue
+     *
+     *@return   the size of the task queue
+     */
+    public synchronized int getQueueSize() {
+        return this.queue.size();
+    }
+
+
+    /**
+     * Gets the number of idle threads
+     *
+     *@return   the size of the idle thread list
+     */
+    public synchronized int getIdleThreadsCount() {
+        return this.idleThreads.size();
+    }
+
+
+    /**
+     * Gets the number of busy threads
+     *
+     *@return   the size of the busy thread list
+     */
+    public synchronized int getBusyThreadsCount() {
+        return this.busyThreads.size();
+    }
+
+
+    /**
+     * Gets the number of idle plus busy threads
+     *
+     *@return   the sum of both list sizes
+     */
+    public synchronized int getThreadCount() {
+        return this.idleThreads.size() + this.busyThreads.size();
+    }
+
+
+    /**
+     * Gets an iterator over all threads of the pool. The iterator works
+     * directly on the internal map of the pool.
+     *
+     *@return   iterator over the ServerThreads
+     */
+    public Iterator getThreadIterator() {
+        return allThreads.values().iterator();
+    }
+
+
+    /**
+     * Replaces the queue that takes tasks while all threads are busy.
+     *
+     *@param queue  the new task queue
+     */
+    public void setQueue(TaskQueue queue) {
+        this.queue = queue;
+    }
+
+    /**
+     * Gets the queue that takes tasks while all threads are busy.
+     *
+     *@return   the task queue
+     */
+    public TaskQueue getTaskQueue()
+    {
+        return queue;
+    }
+
+}
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java
new file mode 100644
index 00000000000..47e11156265
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java
@@ -0,0 +1,12 @@
+package de.lanlab.larm.threads;
+
+import de.lanlab.larm.util.Observer;
+
+/**
+ * An observer that is told about status changes of the thread pool.
+ * ThreadPool.sendMessage() delivers queue events through queueUpdate() and
+ * thread events through threadUpdate().
+ */
+public interface ThreadPoolObserver extends Observer
+{
+    /**
+     * A queue event occurred.
+     *
+     * @param info    additional text; ThreadPool passes the info string of the task
+     * @param action  the kind of event, e.g. ThreadPool.THREADQUEUE_ADD
+     */
+    void queueUpdate(String info, String action);
+
+    /**
+     * The status of a thread changed.
+     *
+     * @param threadNr  the number of the thread
+     * @param action    the kind of event, e.g. ThreadPool.THREAD_IDLE
+     * @param info      additional text; may be an empty string
+     */
+    void threadUpdate(int threadNr, String action, String info);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java
new file mode 100644
index 00000000000..ab78ae89dcb
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java
@@ -0,0 +1,8 @@
+package de.lanlab.larm.threads;
+
+/**
+ * Contract for something that executes InterruptableTasks. ThreadPool is the
+ * implementation in this package.
+ */
+public interface ThreadingStrategy
+{
+    /**
+     * Hands a task over for execution.
+     *
+     * @param t    the task
+     * @param key  an additional object for the strategy; ThreadPool ignores it
+     */
+    void doTask(InterruptableTask t, Object key);
+
+    /**
+     * Interrupts the tasks; in ThreadPool, the queued tasks and those of the
+     * busy threads.
+     */
+    void interrupt();
+
+    /**
+     * Stops the strategy; ThreadPool interrupts the tasks and ends all threads.
+     */
+    void stop();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java
new file mode 100644
index 00000000000..2cb43ba8831
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java
@@ -0,0 +1,721 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ *
+ * A queue that is organized in blocks. Objects are inserted into the first
+ * block and removed from the last block:
+ *
+ *          +---+---+---+---+-+
+ *   put -> | M | S | S | S |M| -> remove
+ *          +---+---+---+---+-+
+ *
+ * When the first block is full, a new first block is created and the old one
+ * is stored, unless it is also the last block. (In the picture M presumably
+ * marks the blocks held in main memory and S the stored ones.)
+ *
+ * the maximum number of entries can be specified with the blockSize parameter. Thus,
+ * the queue actually holds a maximum number of 2 x blockSize objects in main memory,
+ * plus a few bytes for each block.
+ * The objects contained in the blocks are stored with the standard Java
+ * serialization mechanism
+ * The files are named "cachingqueue\\Queuename_BlockNumber.cqb"
+ * insert(), remove() and size() are synchronized on the queue
+ * @author    Clemens Marschner
+ * @created   3. Januar 2002
+ */
+
+public class CachingQueue implements Queue
+{
+
+
+    /**
+     * the Blocks
+     */
+    LinkedList queueBlocks;
+
+    /**
+     * fast access to the first block
+     */
+    QueueBlock first = null;
+
+    /**
+     * fast access to the last block
+     */
+    QueueBlock last = null;
+
+    /**
+     * maximum block size
+     */
+    int blockSize;
+
+    /**
+     * "primary key" identity count for each block
+     */
+    int blockCount = 0;
+
+    /**
+     * active blocks
+     */
+    int numBlocks = 0;
+
+    /**
+     * queue name
+     */
+    String name;
+
+    /**
+     * total number of objects
+     */
+    int size;
+
+
+    /**
+     * init. Creates the directory "cachingqueue" below the working directory
+     * if necessary and reports on System.err when that is not possible.
+     *
+     * @param name       the name of the queue, used in files names
+     * @param blockSize  maximum number of objects stored in one block
+     */
+    public CachingQueue(String name, int blockSize)
+    {
+        queueBlocks = new LinkedList();
+        this.name = name;
+        this.blockSize = blockSize;
+        File cq = new File("cachingqueue");
+        // mkdir() also returns false if the directory is already there
+        if (!cq.mkdir() && !cq.isDirectory())
+        {
+            System.err.println("CachingQueue: could not create directory " + cq.getPath());
+        }
+    }
+
+
+    /**
+     * inserts an object to the front of the queue
+     *
+     * @param o                   the object to be inserted. must implement Serializable
+     * @exception StoreException  encapsulates Exceptions that occur when writing to hard disk
+     */
+    public synchronized void insert(Object o)
+        throws StoreException
+    {
+        if (last == null && first == null)
+        {
+            // empty queue: one block serves as first and last
+            first = last = newBlock();
+            queueBlocks.addFirst(first);
+            numBlocks++;
+        }
+        if (last == null && first != null)
+        {
+            // assert((last==null && first==null) || (last!= null && first!=null));
+            System.err.println("Error in CachingQueue: last!=first==null");
+        }
+
+        if (first.size() >= blockSize)
+        {
+            // save block and create a new one
+            QueueBlock newBlock = newBlock();
+            numBlocks++;
+            if (last != first)
+            {
+                // the last block is never stored, objects are removed from it
+                first.store();
+            }
+            queueBlocks.addFirst(newBlock);
+            first = newBlock;
+        }
+        first.insert(o);
+        size++;
+    }
+
+
+    /**
+     * returns the last object from the queue. The object count is only
+     * decreased after the object was actually taken out of its block, so a
+     * StoreException leaves size() unchanged.
+     *
+     * @return                        the object returned
+     *
+     * @exception StoreException      if the block could not deliver the object
+     * @exception UnderflowException  if the queue was empty
+     */
+    public synchronized Object remove()
+        throws StoreException, UnderflowException
+    {
+        if (last == null)
+        {
+            throw new UnderflowException();
+        }
+        if (last.size() <= 0)
+        {
+            // the last block is used up: drop it and go on with its predecessor
+            queueBlocks.removeLast();
+            numBlocks--;
+            if (numBlocks == 1)
+            {
+                last = first;
+            }
+            else if (numBlocks == 0)
+            {
+                first = last = null;
+                throw new UnderflowException();
+            }
+            else if (numBlocks < 0)
+            {
+                // assert(numBlocks >= 0)
+                System.err.println("CachingQueue.remove: numBlocks<0!");
+                throw new UnderflowException();
+            }
+            else
+            {
+                last = (QueueBlock) queueBlocks.getLast();
+            }
+        }
+        Object removed = last.remove();
+        --size;
+        return removed;
+    }
+
+
+    /**
+     * not supported
+     *
+     * @param c  ignored
+     * @exception UnsupportedOperationException  always
+     */
+    public void insertMultiple(java.util.Collection c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+
+    /**
+     * creates a new block named queue name + "_" + running block number
+     *
+     * @return   the new, empty block
+     */
+    private QueueBlock newBlock()
+    {
+        return new QueueBlock(name + "_" + blockCount++, blockSize);
+    }
+
+
+    /**
+     * total number of objects contained in the queue
+     *
+     * @return   the number of objects
+     */
+    public synchronized int size()
+    {
+        return size;
+    }
+
+
+    /**
+     * testing: runs the tests of CachingQueueTester and prints their results
+     *
+     * @param args  The command line arguments (not used)
+     */
+    public static void main(String[] args)
+    {
+        System.out.println("Test1: " + CachingQueueTester.testUnderflow());
+        System.out.println("Test2: " + CachingQueueTester.testInsert());
+        System.out.println("Test3: " + CachingQueueTester.testBufReadWrite());
+        System.out.println("Test4: " + CachingQueueTester.testBufReadWrite2());
+        System.out.println("Test5: " + CachingQueueTester.testUnderflow2());
+        System.out.println("Test6: " + CachingQueueTester.testBufReadWrite3());
+        System.out.println("Test7: " + CachingQueueTester.testExceptions());
+    }
+}
+
+/**
+ * Unchecked exception thrown by the test helper of CachingQueueTester when a
+ * checked condition does not hold.
+ * Test class. TODO: move out of this file and handle with JUnit
+ *
+ * @author    Administrator
+ * @created   3. Januar 2002
+ */
+class AssertionFailedException extends RuntimeException
+{
+    /**
+     * Creates the exception without a detail message.
+     */
+    AssertionFailedException()
+    {
+        super();
+    }
+}
+
+/**
+ * Test class. Contains some tests for the functionality of the CachingQueue.
+ * Each test returns true if it succeeded; a failed intermediate check throws
+ * an AssertionFailedException.
+ *
+ * The check helper is called assertTrue because "assert" is a reserved word
+ * since Java 1.4 and can no longer be used as a method name.
+ *
+ * @author    Administrator
+ * @created   3. Januar 2002
+ */
+class CachingQueueTester
+{
+
+
+    /**
+     * Removing from a new, empty queue must throw an UnderflowException.
+     *
+     * @return   true if the UnderflowException was thrown
+     */
+    public static boolean testUnderflow()
+    {
+        CachingQueue cq = new CachingQueue("testQueue1", 10);
+        try
+        {
+            cq.remove();
+        }
+        catch (UnderflowException e)
+        {
+            return true;
+        }
+        catch (Exception e)
+        {
+            e.printStackTrace();
+        }
+        return false;
+    }
+
+
+    /**
+     * Inserts one object and removes it again.
+     *
+     * @return   true if the removed object equals the inserted one
+     */
+    public static boolean testInsert()
+    {
+        CachingQueue cq = new CachingQueue("testQueue2", 10);
+        String test = "Test1";
+        assertTrue(cq.size() == 0);
+        cq.insert(test);
+        assertTrue(cq.size() == 1);
+        // equals instead of ==: an object that went through a stored block
+        // is a copy of the inserted one
+        return test.equals(cq.remove());
+    }
+
+
+    /**
+     * Inserts three objects into a queue with block size 2, so that a second
+     * block is needed, and removes them again.
+     *
+     * @return   true if the third removed object equals the third inserted one
+     */
+    public static boolean testBufReadWrite()
+    {
+        CachingQueue cq = new CachingQueue("testQueue3", 2);
+        String test1 = "Test1";
+        String test2 = "Test2";
+        String test3 = "Test3";
+        cq.insert(test1);
+        cq.insert(test2);
+        cq.insert(test3);
+        assertTrue(cq.size() == 3);
+        cq.remove();
+        cq.remove();
+        assertTrue(cq.size() == 1);
+        return test3.equals(cq.remove());
+    }
+
+
+    /**
+     * Inserts five objects into a queue with block size 2 and checks that
+     * they come back in insertion order.
+     *
+     * @return   true if the fifth removed object equals the fifth inserted one
+     */
+    public static boolean testBufReadWrite2()
+    {
+        CachingQueue cq = new CachingQueue("testQueue4", 2);
+        String test1 = "Test1";
+        String test2 = "Test2";
+        String test3 = "Test3";
+        String test4 = "Test4";
+        String test5 = "Test5";
+        cq.insert(test1);
+        cq.insert(test2);
+        cq.insert(test3);
+        cq.insert(test4);
+        cq.insert(test5);
+        assertTrue(cq.size() == 5);
+        String t = (String) cq.remove();
+        assertTrue(t.equals(test1));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test2));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test3));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test4));
+        t = (String) cq.remove();
+        assertTrue(cq.size() == 0);
+        return (t.equals(test5));
+    }
+
+
+    /**
+     * Check helper of the tests.
+     *
+     * @param expr  the condition that is expected to hold
+     * @exception AssertionFailedException  if expr is false
+     */
+    public static void assertTrue(boolean expr)
+    {
+        if (!expr)
+        {
+            throw new AssertionFailedException();
+        }
+    }
+
+
+    /**
+     * Inserts five objects into a queue with block size 2, removes all of
+     * them and expects an UnderflowException on the sixth remove.
+     *
+     * @return   true if the UnderflowException was thrown
+     */
+    public static boolean testUnderflow2()
+    {
+        CachingQueue cq = new CachingQueue("testQueue5", 2);
+        String test1 = "Test1";
+        String test2 = "Test2";
+        String test3 = "Test3";
+        String test4 = "Test4";
+        String test5 = "Test5";
+        cq.insert(test1);
+        cq.insert(test2);
+        cq.insert(test3);
+        cq.insert(test4);
+        cq.insert(test5);
+        assertTrue(cq.remove().equals(test1));
+        assertTrue(cq.remove().equals(test2));
+        assertTrue(cq.remove().equals(test3));
+        assertTrue(cq.remove().equals(test4));
+        assertTrue(cq.remove().equals(test5));
+        try
+        {
+            cq.remove();
+        }
+        catch (UnderflowException e)
+        {
+            return true;
+        }
+        return false;
+    }
+
+
+    /**
+     * Same as testBufReadWrite2, but with block size 1, so that every object
+     * gets its own block.
+     *
+     * @return   true if the fifth removed object equals the fifth inserted one
+     */
+    public static boolean testBufReadWrite3()
+    {
+        CachingQueue cq = new CachingQueue("testQueue4", 1);
+        String test1 = "Test1";
+        String test2 = "Test2";
+        String test3 = "Test3";
+        String test4 = "Test4";
+        String test5 = "Test5";
+        cq.insert(test1);
+        cq.insert(test2);
+        cq.insert(test3);
+        cq.insert(test4);
+        cq.insert(test5);
+        String t = (String) cq.remove();
+        assertTrue(t.equals(test1));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test2));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test3));
+        t = (String) cq.remove();
+        assertTrue(t.equals(test4));
+        t = (String) cq.remove();
+        return (t.equals(test5));
+    }
+
+
+    /**
+     * Fills a queue with block size 1, deletes two block files and expects
+     * a StoreException while the objects are removed.
+     *
+     * NOTE(review): the files are deleted in the working directory, while
+     * the CachingQueue documentation names the directory "cachingqueue" -
+     * verify the file names against QueueBlock.
+     *
+     * @return   true if a StoreException was thrown
+     */
+    public static boolean testExceptions()
+    {
+        System.gc();
+        CachingQueue cq = new CachingQueue("testQueue5", 1);
+        String test1 = "Test1";
+        String test2 = "Test2";
+        String test3 = "Test3";
+        String test4 = "Test4";
+        String test5 = "Test5";
+        cq.insert(test1);
+        cq.insert(test2);
+        cq.insert(test3);
+        cq.insert(test4);
+        cq.insert(test5);
+        try
+        {
+            if (!(new File("testQueue5_1.cqb").delete()))
+            {
+                System.err.println("CachingQueueTester.textExceptions: Store 1 nicht vorhanden. Filename geändert?");
+            }
+            if (!(new File("testQueue5_2.cqb").delete()))
+            {
+                System.err.println("CachingQueueTester.textExceptions: Store 2 nicht vorhanden. Filename geändert?");
+            }
+            String t = (String) cq.remove();
+            assertTrue(t.equals(test1));
+            t = (String) cq.remove();
+            assertTrue(t.equals(test2));
+            t = (String) cq.remove();
+            assertTrue(t.equals(test3));
+            t = (String) cq.remove();
+            assertTrue(t.equals(test4));
+            t = (String) cq.remove();
+            assertTrue(t.equals(test5));
+        }
+        catch (StoreException e)
+        {
+            return true;
+        }
+        finally
+        {
+            cq = null;
+            System.gc();
+            // the finalizers should get called now
+        }
+        return false;
+    }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java
new file mode 100644
index 00000000000..231c17d3f9f
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java
@@ -0,0 +1,273 @@
+package de.lanlab.larm.util;
+
+import java.lang.reflect.*;
+import java.io.*;
+import java.util.*;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+
+/**
+ * prints class information with the reflection api
+ * for debugging only
+ *
+ * main() loads a class or interface and prints the source skeleton of a class
+ * that extends or implements it, with one stub for every public, non-final
+ * method that is not declared in java.lang.Object.
+ */
+public class ClassInfo
+{
+
+    /**
+     * Modifier bits that must not show up in a generated stub: a method with a
+     * body is neither abstract nor native, and for methods the bits of
+     * VOLATILE and TRANSIENT are used for the bridge and varargs flags.
+     */
+    private static final int STUB_EXCLUDED_MODIFIERS =
+        Modifier.ABSTRACT | Modifier.NATIVE | Modifier.VOLATILE | Modifier.TRANSIENT;
+
+    public ClassInfo()
+    {
+    }
+
+    /**
+     * Prints the skeleton to System.out.
+     * Usage: java ClassInfo PackageName.BaseClassName PackageName.DerivedClassName
+     *
+     * @param args  args[0] the class or interface to derive from, args[1] the
+     *              name of the class to generate, optionally with a package
+     */
+    public static void main(String[] args)
+    {
+        if (args.length < 2)
+        {
+            System.err.println("Usage: java ClassInfo PackageName.BaseClassName PackageName.DerivedClassName");
+            return;
+        }
+        String name = args[0];
+        String derivedName = args[1];
+
+        try
+        {
+            Class cls = Class.forName(name);
+            name = cls.getName();
+            String clss = getClassName(name);
+
+            // imports can only be printed after all methods were visited, so
+            // both parts are collected in buffers first
+            StringWriter importsWriter = new StringWriter();
+            PrintWriter imports = new PrintWriter(importsWriter);
+            StringWriter outWriter = new StringWriter();
+            PrintWriter out = new PrintWriter(outWriter);
+
+            TreeSet importClasses = new TreeSet();
+            importClasses.add(getImportStatement(name));
+
+            // the package of derivedName goes into the package statement, the
+            // class declaration only takes the simple name
+            out.println("/**\n * (class description here)\n */\npublic class " + getClassName(derivedName) + " " + (cls.isInterface() ? "implements " : "extends ") + clss + "\n{");
+
+            Method[] m = cls.getMethods();
+            for(int i = 0; i < m.length; i++)
+            {
+                Method thism = m[i];
+                int methodModifiers = thism.getModifiers();
+                if((methodModifiers & Modifier.PRIVATE) == 0 && (methodModifiers & Modifier.FINAL) == 0
+                    && !"java.lang.Object".equals(thism.getDeclaringClass().getName()))
+                {
+                    out.println("    /**");
+                    out.println("     * (method description here)");
+                    out.println("     * defined in " + thism.getDeclaringClass().getName());
+
+                    Class[] parameters = thism.getParameterTypes();
+                    for(int j = 0; j < parameters.length; j++)
+                    {
+                        // primitives have no package and need no import
+                        if(getPackageName(parameters[j].getName()).length() > 0)
+                        {
+                            importClasses.add(getImportStatement(parameters[j].getName()));
+                        }
+                        out.println("     * @param p" + j + " (parameter description here)");
+                    }
+
+                    String returnTypeName = thism.getReturnType().getName();
+                    if(!"void".equals(returnTypeName))
+                    {
+                        if(getPackageName(returnTypeName).length() > 0)
+                        {
+                            importClasses.add(getImportStatement(returnTypeName));
+                        }
+                        out.println("     * @return (return value description here)");
+                    }
+
+                    out.println("     */");
+
+                    out.print("    " + getModifierString(methodModifiers & ~STUB_EXCLUDED_MODIFIERS) + getClassName(returnTypeName) + " ");
+                    out.print(thism.getName() + "(");
+
+                    for(int j = 0; j < parameters.length; j++)
+                    {
+                        if(j > 0)
+                        {
+                            out.print(", ");
+                        }
+                        out.print(getClassName(parameters[j].getName()) + " p" + j);
+                    }
+                    out.print(")");
+                    Class[] exceptions = thism.getExceptionTypes();
+
+                    if (exceptions.length > 0)
+                    {
+                        out.print(" throws ");
+                    }
+
+                    for(int k = 0; k < exceptions.length; k++)
+                    {
+                        if(k > 0)
+                        {
+                            out.print(", ");
+                        }
+                        String exCompleteName = exceptions[k].getName();
+                        String exName = getClassName(exCompleteName);
+                        importClasses.add(getImportStatement(exCompleteName));
+
+                        out.print(exName);
+                    }
+                    out.print("\n" +
+                              "    {\n" +
+                              "        /**@todo: Implement this " + thism.getName() + "() method */\n" +
+                              "        throw new UnsupportedOperationException(\"Method " + thism.getName() + "() not yet implemented.\");\n" +
+                              "    }\n\n");
+
+
+                }
+            }
+            out.println("}");
+
+            Iterator importIterator = importClasses.iterator();
+            while(importIterator.hasNext())
+            {
+                String importName = (String)importIterator.next();
+                String importPackage = getPackageName(importName);
+                // only classes directly in java.lang are imported implicitly
+                // (java.lang.reflect etc. are not); classes without a package
+                // cannot be imported at all
+                if(importPackage.length() > 0 && !importPackage.equals("java.lang"))
+                {
+                    imports.println("import " + importName + ";");
+                }
+            }
+
+            out.flush();
+            imports.flush();
+
+            if(getPackageName(derivedName).length() > 0)
+            {
+                System.out.println("package " + getPackageName(derivedName) + ";\n");
+            }
+            System.out.println( "/**\n" +
+                                " * Title:        \n" +
+                                " * Description:\n" +
+                                " * Copyright:    Copyright (c)\n" +
+                                " * Company:\n" +
+                                " * @author\n" +
+                                " * @version 1.0\n" +
+                                " */\n");
+            System.out.println(importsWriter.getBuffer());
+            System.out.print(outWriter.getBuffer());
+        }
+        catch(Throwable t)
+        {
+            t.printStackTrace();
+        }
+    }
+
+    /**
+     * Extracts the package from a class name as returned by Class.getName().
+     * For array names ("[Ljava.lang.String;", "[[I") the package of the
+     * element type is returned.
+     *
+     * @param className  the binary class name
+     * @return           the package name, "" for primitives, arrays of
+     *                   primitives and classes without a package
+     */
+    public static String getPackageName(String className)
+    {
+        if(className.length() > 1 && className.charAt(0) == '[')
+        {
+            switch(className.charAt(1))
+            {
+                case 'L':
+                    return getPackageName(className.substring(2,className.length()-1));
+                case '[':
+                    return getPackageName(className.substring(1));
+                default:
+                    return "";
+            }
+        }
+        int lastDot = className.lastIndexOf(".");
+        return lastDot != -1 ? className.substring(0, lastDot) : "";
+    }
+
+    /**
+     * Extracts the name without package from a class name as returned by
+     * Class.getName(). Array names are turned into source notation, e.g.
+     * "[Ljava.lang.String;" into "String[]" and "[[I" into "int[][]".
+     *
+     * @param className  the binary class name
+     * @return           the simple name as it would be written in source code
+     */
+    public static String getClassName(String className)
+    {
+        if(className.length() > 1 && className.charAt(0) == '[')
+        {
+            switch(className.charAt(1))
+            {
+                case 'L':
+                    return getClassName(className.substring(2,className.length()-1)) + "[]";
+                case '[':
+                    return getClassName(className.substring(1)) + "[]";
+                case 'C':
+                    return "char[]";
+                case 'I':
+                    return "int[]";
+                case 'B':
+                    return "byte[]";
+                case 'Z':
+                    return "boolean[]";
+                case 'S':
+                    return "short[]";
+                case 'J':
+                    return "long[]";
+                case 'F':
+                    return "float[]";
+                case 'D':
+                    return "double[]";
+            }
+        }
+        int lastDot = className.lastIndexOf(".");
+        return (lastDot > -1) ? className.substring(lastDot + 1) : className;
+    }
+
+    /**
+     * Builds the name to be used in an import statement: package, ".", and
+     * the simple class name without any array brackets.
+     *
+     * @param className  the binary class name
+     * @return           the qualified name of the (element) class
+     */
+    static String getImportStatement(String className)
+    {
+        String pack = getPackageName(className);
+        String clss = getClassName(className);
+        // multi-dimensional arrays carry more than one pair of brackets
+        while(clss.endsWith("[]"))
+        {
+            clss = clss.substring(0, clss.length() - 2);
+        }
+        return pack + "." + clss;
+    }
+
+    /**
+     * Converts modifier bits into the corresponding keywords, each followed
+     * by a blank. The keywords appear in alphabetical order.
+     *
+     * @param modifiers  a set of java.lang.reflect.Modifier bits
+     * @return           the keywords, "" if no known bit is set
+     */
+    public static String getModifierString(int modifiers)
+    {
+        StringBuffer mods = new StringBuffer();
+        if((modifiers & Modifier.ABSTRACT) != 0)
+        {
+            mods.append("abstract ");
+        }
+        if((modifiers & Modifier.FINAL) != 0)
+        {
+            mods.append("final ");
+        }
+        if((modifiers & Modifier.INTERFACE) != 0)
+        {
+            mods.append("interface ");
+        }
+        if((modifiers & Modifier.NATIVE) != 0)
+        {
+            mods.append("native ");
+        }
+        if((modifiers & Modifier.PRIVATE) != 0)
+        {
+            mods.append("private ");
+        }
+        if((modifiers & Modifier.PROTECTED) != 0)
+        {
+            mods.append("protected ");
+        }
+        if((modifiers & Modifier.PUBLIC) != 0)
+        {
+            mods.append("public ");
+        }
+        if((modifiers & Modifier.STATIC) != 0)
+        {
+            mods.append("static ");
+        }
+        if((modifiers & Modifier.STRICT) != 0)
+        {
+            mods.append("strictfp ");
+        }
+        if((modifiers & Modifier.SYNCHRONIZED) != 0)
+        {
+            mods.append("synchronized ");
+        }
+        if((modifiers & Modifier.TRANSIENT) != 0)
+        {
+            mods.append("transient ");
+        }
+        if((modifiers & Modifier.VOLATILE) != 0)
+        {
+            mods.append("volatile ");
+        }
+        return mods.toString();
+    }
+
+
+}
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java
new file mode 100644
index 00000000000..6b0d16fb6d1
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java
@@ -0,0 +1,319 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title:
+ * Description:
+ * Copyright: Copyright (c)
+ * Company:
+ * @author
+ * @version 1.0
+ */
+
+import java.util.*;
+
+/**
+ * simple hashed linked list. It allows for inserting and removing elements like
+ * in a hash table (in fact, it uses a HashMap), while still being able to easily
+ * traverse the collection like a list. In addition, the iterator is circular. It
+ * always returns a next element as long as there are elements in the list. In
+ * contrast to the iterator of Sun's collection classes, this class can cope with
+ * inserts and removals while traversing the list.
+ * All operations should work in near constant time as the list grows. Only the
+ * trade-off costs of a hash (memory versus speed) have to be considered.
+ * The List doesn't accept null keys
+ * @todo put the traversal function into an Iterator
+ * @todo implement the class as a derivate from a Hash
+ */
+public class HashedCircularLinkedList
+{
+
+
+    /**
+     * Entry class. One node of the doubly linked ring.
+     */
+    private static class Entry
+    {
+        Object key;
+        Object element;
+        Entry next;
+        Entry previous;
+
+        Entry(Object element, Entry next, Entry previous, Object key)
+        {
+            this.element = element;
+            this.next = next;
+            this.previous = previous;
+            this.key = key;
+        }
+    }
+
+    /**
+     * the list. contains objects. The header is a sentinel without element;
+     * header.next is the first and header.previous the last entry
+     */
+    private transient Entry header = new Entry(null, null, null, null);
+
+    /**
+     * the hash. maps keys to entries, which by themselves map to objects
+     */
+    HashMap keys;
+
+    private transient int size = 0;
+
+    /** the current entry in the traversal; null before the first next() */
+    Entry current = null;
+
+    /**
+     * Constructs an empty list.
+     *
+     * @param initialCapacity  initial capacity of the internal HashMap
+     * @param loadFactor       load factor of the internal HashMap
+     */
+    public HashedCircularLinkedList(int initialCapacity, float loadFactor)
+    {
+        header.next = header.previous = header;
+        keys = new HashMap(initialCapacity, loadFactor);
+    }
+
+    /**
+     * Returns the number of elements in this list.
+     *
+     * @return the number of elements in this list.
+     */
+    public int size()
+    {
+        return size;
+    }
+
+    /**
+     * Removes the element that was stored under the given key. If the removed
+     * entry is the current entry of the traversal, the traversal steps back by
+     * one entry, so that the following next() returns the successor of the
+     * removed element.
+     *
+     * @param o key of the element to be removed from this list, if present.
+     * @return true if the list contained an element under this key.
+     */
+    public boolean removeByKey(Object o)
+    {
+        Entry e = (Entry)keys.get(o);
+        if(e == null)
+        {
+            return false;
+        }
+        if(e == current)
+        {
+            // never leave the traversal on an entry that is no longer linked
+            current = (size > 1) ? previousEntry(current) : null;
+        }
+        removeEntryFromList(e);
+        keys.remove(o);
+        size--;
+        return true;
+    }
+
+    /**
+     * Removes all of the elements from this list.
+     */
+    public void clear()
+    {
+        // list
+        header.next = header.previous = header;
+
+        // hash
+        keys.clear();
+
+        size = 0;
+        current = null;
+    }
+
+
+    /**
+     * Links a new entry into the ring directly before e.
+     */
+    private Entry addEntryBefore(Object key, Object o, Entry e)
+    {
+        Entry newEntry = new Entry(o, e, e.previous, key);
+        newEntry.previous.next = newEntry;
+        newEntry.next.previous = newEntry;
+        return newEntry;
+    }
+
+    /**
+     * Unlinks e from the ring; the header itself cannot be removed.
+     */
+    private void removeEntryFromList(Entry e)
+    {
+        if(e != null)
+        {
+            if (e == header)
+            {
+                throw new NoSuchElementException();
+            }
+
+            e.previous.next = e.next;
+            e.next.previous = e.previous;
+        }
+    }
+
+
+    /**
+     * Appends value as the last element and registers it under key. Nothing
+     * happens if key is null or already in use.
+     *
+     * @param key    the key, must not be null
+     * @param value  the element to store
+     * @return true if the element was added
+     */
+    public boolean put(Object key, Object value)
+    {
+        if(key != null && !keys.containsKey(key))
+        {
+            Entry e = addEntryBefore(key, value, header);  // add it as the last element
+            keys.put(key, e);                              // link key to entry
+            size++;
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+
+    /**
+     * The traversal is circular, so there is a next element whenever the list
+     * is not empty.
+     *
+     * @return true if the list contains at least one element
+     */
+    public boolean hasNext()
+    {
+        return (size > 0);
+    }
+
+    /**
+     * Returns the entry after e, skipping the header; null stands for "before
+     * the first entry". Returns null if the list is empty.
+     */
+    private Entry nextEntry(Entry e)
+    {
+        if(size > 1)
+        {
+            if(e == null)
+            {
+                e = header;
+            }
+            Entry next = e.next;
+            if(next == header)
+            {
+                next = next.next;
+            }
+            return next;
+        }
+        else if(size == 1)
+        {
+            return header.next;
+        }
+        else
+        {
+            return null;
+        }
+    }
+
+
+
+    /**
+     * Returns the entry before e, skipping the header; null stands for "after
+     * the last entry". Returns null if the list is empty.
+     */
+    private Entry previousEntry(Entry e)
+    {
+        if(size > 1)
+        {
+            if(e == null)
+            {
+                e = header;
+            }
+            Entry previous = e.previous;
+            if(previous == header)
+            {
+                previous = previous.previous;
+            }
+            return previous;
+        }
+        else if(size == 1)
+        {
+            return header.previous;
+        }
+        else
+        {
+            return null;
+        }
+    }
+
+    /**
+     * Advances the circular traversal by one entry.
+     *
+     * @return the element of the new current entry, null if the list is empty
+     */
+    public Object next()
+    {
+        current = nextEntry(current);
+        if(current != null)
+        {
+            return current.element;
+        }
+        else
+        {
+            return null;
+        }
+    }
+
+    /**
+     * Removes the element that the last call of next() returned. The size is
+     * decreased and the traversal steps back by one entry, exactly as in
+     * removeByKey().
+     *
+     * @exception IllegalStateException if there is no current element
+     */
+    public void removeCurrent()
+    {
+        if(current == null)
+        {
+            throw new IllegalStateException("no current element to remove");
+        }
+        removeByKey(current.key);
+    }
+
+
+    /**
+     * Looks up the element stored under key.
+     *
+     * @param key the key
+     * @return the element, or null if the key is unknown
+     */
+    public Object get(Object key)
+    {
+        Entry e = ((Entry)keys.get(key));
+        if(e != null)
+        {
+            return e.element;
+        }
+        else
+        {
+            return null;
+        }
+    }
+
+    /**
+     * testing; the expected values are printed in square brackets
+     */
+    public static void main(String[] args)
+    {
+        HashedCircularLinkedList h = new HashedCircularLinkedList(20, 0.75f);
+        h.put("1", "a");
+        h.put("2", "b");
+        h.put("3", "c");
+        String t;
+        System.out.println("size [3]: " + h.size());
+        t = (String)h.next();
+        System.out.println("2nd element via get [b]: " + h.get("2"));
+
+        System.out.println("next element [a]: " + t);
+        t = (String)h.next();
+        System.out.println("next element [b]: " + t);
+        t = (String)h.next();
+        System.out.println("next element [c]: " + t);
+        t = (String)h.next();
+        System.out.println("1st element after circular traversal [a]: " + t);
+        h.removeByKey("1");
+        System.out.println("1st element after remove [null]: " + h.get("1"));
+        System.out.println("size after removal [2]: " + h.size());
+        t = (String)h.next();
+        System.out.println("next element [b]: " + t);
+        t = (String)h.next();
+        System.out.println("next element [c]: " + t);
+        t = (String)h.next();
+        System.out.println("next element [b]: " + t);
+        h.removeCurrent();
+        t = (String)h.next();
+        System.out.println("next element after 1 removal [c]: " + t);
+        t = (String)h.next();
+        System.out.println("next element: [c]: " + t);
+        h.removeByKey("3");
+        System.out.println("size after 3 removals [0]: " + h.size());
+        t = (String)h.next();
+        System.out.println("next element [null]: " + t);
+    }
+}
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java
new file mode 100644
index 00000000000..c16940ffac5
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java
@@ -0,0 +1,18 @@
+
+/**
+ * Title: LARM Lanlab Retrieval Machine
+ *
+ * A <code>SimpleCharArrayReader</code> contains
+ * an internal buffer that contains characters that
+ * may be read from the stream. An internal
+ * counter keeps track of the next character to
+ * be supplied by the <code>read</code> method.
+ *
+ * In contrast to the original <code>CharArrayReader</code>, this
+ * version is not thread safe. The monitor on the read()-function caused programs
+ * to slow down much, because this function is called for every character. This
+ * class can thus only be used if only one thread is accessing the stream
+ * @author Clemens Marschner
+ * @version 1.00
+ * @see java.io.ByteArrayInputStream
+ */
+public
+class SimpleCharArrayReader extends Reader
+{
+
+ /**
+ * A flag that is set to true when this stream is closed.
+ */
+ private boolean isClosed = false;
+
+ /**
+ * An array of characters that was provided
+ * by the creator of the stream. Elements <code>buf[0]</code>
+ * through <code>buf[count-1]</code> are the
+ * only characters that can ever be read from the
+ * stream; element <code>buf[pos]</code> is
+ * the next character to be read.
+ */
+ protected char buf[];
+
+ /**
+ * The index of the next character to read from the input stream buffer.
+ * This value should always be nonnegative
+ * and not larger than the value of <code>count</code>.
+ * The next character to be read from the input stream buffer
+ * will be <code>buf[pos]</code>.
+ */
+ protected int pos;
+
+ /**
+ * The currently marked position in the stream.
+ * SimpleCharArrayReader objects are marked at position zero by
+ * default when constructed. They may be marked at another
+ * position within the buffer by the <code>mark()</code> method.
+ * The current buffer position is set to this point by the
+ * <code>reset()</code> method.
+ *
+ * @since JDK1.1
+ */
+ protected int mark = 0;
+
+ /**
+ * The index one greater than the last valid character in the input
+ * stream buffer.
+ * This value should always be nonnegative
+ * and not larger than the length of <code>buf</code>.
+ * It is one greater than the position of
+ * the last character within <code>buf</code> that
+ * can ever be read from the input stream buffer.
+ */
+ protected int count;
+
+ /**
+  * Creates a <code>SimpleCharArrayReader</code>
+  * so that it uses <code>buf</code> as its
+  * buffer array.
+  * The buffer array is not copied.
+  * The initial value of <code>pos</code>
+  * is <code>0</code> and the initial value
+  * of <code>count</code> is the length of
+  * <code>buf</code>.
+  *
+  * @param buf the input buffer.
+  */
+ public SimpleCharArrayReader(char buf[])
+ {
+     // Yields pos = 0, mark = 0 and count = buf.length.
+     this(buf, 0, buf.length);
+ }
+
+ /**
+ * Creates SimpleCharArrayReader
+ * that uses buf
as its
+ * buffer array. The initial value of pos
+ * is offset
and the initial value
+ * of count
is offset+len
.
+ * The buffer array is not copied.
+ * buf[pos]
+ * through buf[pos+len-1]
will
+ * be read; however, if a reset
+ * operation is performed, then bytes buf[0]
+ * through buf[pos-1]
will then
+ * become available for input.
+ *
+ * @param buf the input buffer.
+ * @param offset the offset in the buffer of the first byte to read.
+ * @param length the maximum number of bytes to read from the buffer.
+ */
+ public SimpleCharArrayReader(char buf[], int offset, int length)
+ {
+ this.buf = buf;
+ this.pos = offset;
+ this.count = Math.min(offset + length, buf.length);
+ this.mark = offset;
+ }
+
+ /**
+  * Reads the next character from this reader's buffer. The character
+  * is returned as an <code>int</code> in the range
+  * <code>0</code> to <code>65535</code>. If no character is available
+  * because the end of the buffer has been reached, the value
+  * <code>-1</code> is returned.
+  * <p>
+  * This method only touches the in-memory buffer and never blocks.
+  *
+  * @return the next character, or <code>-1</code> if the end of the
+  *         stream has been reached.
+  */
+ public int read()
+ {
+     if (pos >= count)
+     {
+         return -1;
+     }
+     // Return the full 16-bit char. Masking with 0xff (as a byte stream would)
+     // truncates every character above 255 and disagrees with
+     // read(char[], int, int), which copies the characters unmodified.
+     return buf[pos++];
+ }
+
+ /**
+ * Reads up to len
bytes of data into an array of bytes
+ * from this input stream.
+ * If pos
equals count
,
+ * then -1
is returned to indicate
+ * end of file. Otherwise, the number k
+ * of bytes read is equal to the smaller of
+ * len
and count-pos
.
+ * If k
is positive, then bytes
+ * buf[pos]
through buf[pos+k-1]
+ * are copied into b[off]
through
+ * b[off+k-1]
in the manner performed
+ * by System.arraycopy
. The
+ * value k
is added into pos
+ * and k
is returned.
+ * read
method cannot block.
+ *
+ * @param b the buffer into which the data is read.
+ * @param off the start offset of the data.
+ * @param len the maximum number of bytes read.
+ * @return the total number of bytes read into the buffer, or
+ * -1
if there is no more data because the end of
+ * the stream has been reached.
+ */
+ public int read(char b[], int off, int len)
+ {
+ if (b == null)
+ {
+ throw new NullPointerException();
+ }
+ else if ((off < 0) || (off > b.length) || (len < 0) ||
+ ((off + len) > b.length) || ((off + len) < 0))
+ {
+ throw new IndexOutOfBoundsException();
+ }
+ if (pos >= count)
+ {
+ return -1;
+ }
+ if (pos + len > count)
+ {
+ len = count - pos;
+ }
+ if (len <= 0)
+ {
+ return 0;
+ }
+ System.arraycopy(buf, pos, b, off, len);
+ pos += len;
+ return len;
+ }
+
+ /**
+  * Skips <code>n</code> characters of input. Fewer
+  * characters are skipped if the end of the buffer is reached.
+  * The actual number <code>k</code>
+  * of characters skipped is equal to the smaller
+  * of <code>n</code> and <code>count-pos</code>.
+  * The value <code>k</code> is added into <code>pos</code>
+  * and <code>k</code> is returned.
+  * A negative <code>n</code> (or a position already at or past the end)
+  * skips nothing and returns <code>0</code>.
+  *
+  * @param n the number of characters to be skipped.
+  * @return the actual number of characters skipped.
+  */
+ public long skip(long n)
+ {
+     // Compare against the remaining length instead of computing pos + n:
+     // for very large n (e.g. Long.MAX_VALUE) that sum overflows, the clamp is
+     // skipped and the narrowing "pos += n" leaves pos at an arbitrary value.
+     final long remaining = (long) count - pos;
+     if (n > remaining)
+     {
+         n = remaining;
+     }
+     if (n < 0)
+     {
+         return 0;
+     }
+     pos += n;
+     return n;
+ }
+
+ /**
+  * Returns the number of characters that can still be read from the
+  * buffer, which is <code>count - pos</code>.
+  *
+  * @return the number of characters remaining to be read from the
+  *         input buffer.
+  */
+ public int available()
+ {
+     final int remaining = count - pos;
+     return remaining;
+ }
+
+ /**
+ * Tests if SimpleCharArrayReader supports mark/reset.
+ *
+ * @return always <code>true</code>; <code>mark</code> and <code>reset</code>
+ * are implemented by this class.
+ * @since JDK1.1
+ */
+ public boolean markSupported()
+ {
+ return true;
+ }
+
+ /**
+ * Set the current marked position in the stream.
+ * SimpleCharArrayReader objects are marked at position zero by
+ * default when constructed. They may be marked at another
+ * position within the buffer by this method.
+ *
+ * @param readAheadLimit not used by this implementation; the mark is
+ * simply the current value of <code>pos</code>.
+ * @since JDK1.1
+ */
+ public void mark(int readAheadLimit)
+ {
+ mark = pos;
+ }
+
+ /**
+ * Resets the buffer to the marked position. The marked position
+ * is the one set at construction unless another position was marked.
+ * The value of <code>pos</code> is set to the value of <code>mark</code>.
+ */
+ public void reset()
+ {
+
+ pos = mark;
+ }
+
+ /**
+ * Closes this input stream and releases any system resources
+ * associated with the stream.
+ * public class MyClass {
+ * State state = new State("Running");
+ * public State getState() { return state.cloneState() }
+ *
+ * note on serialization: if you deserialize a state, the state string will be newly created.
+ * that means you then have to compare the states via equal() and not ==
+ */
+ public class State implements Cloneable, Serializable
+ {
+
+     private String state;
+     private long stateSince;
+     private Object info;
+
+     /**
+      * Creates a state with the given name, no info object, and a time stamp
+      * taken when the state is set.
+      *
+      * @param state the state name
+      */
+     public State(String state)
+     {
+         setState(state);
+     }
+
+
+     /** Copy-style constructor without an info object. */
+     private State(String state, long stateSince)
+     {
+         this(state, stateSince, null);
+     }
+
+     /** Copy-style constructor that takes every field verbatim; used by {@link #clone()}. */
+     private State(String state, long stateSince, Object info)
+     {
+         this.state = state;
+         this.stateSince = stateSince;
+         this.info = info;
+     }
+
+     /**
+      * Switches to a new state and clears the info object.
+      *
+      * @param state the new state name
+      */
+     public void setState(String state)
+     {
+         setState(state, null);
+     }
+
+     /**
+      * Switches to a new state, stamps it with the current time and stores the
+      * accompanying info object. All three fields are updated under this
+      * object's monitor.
+      *
+      * @param state the new state name
+      * @param info  an arbitrary object attached to the state, may be null
+      */
+     public synchronized void setState(String state, Object info)
+     {
+         this.state = state;
+         this.stateSince = System.currentTimeMillis();
+         this.info = info;
+     }
+
+     /** @return the current state name (read without synchronization) */
+     public String getState()
+     {
+         return this.state;
+     }
+
+     /** @return the <code>System.currentTimeMillis()</code> value recorded when the state was last set */
+     public long getStateSince()
+     {
+         return this.stateSince;
+     }
+
+     /** @return the info object stored with the current state, or null */
+     public Object getInfo()
+     {
+         return this.info;
+     }
+
+     /**
+      * Creates a snapshot holding the same state name, time stamp and info
+      * reference. The info object itself is shared, not copied.
+      *
+      * @return a new <code>State</code> instance
+      */
+     public synchronized Object clone()
+     {
+         State snapshot = new State(state, stateSince, info);
+         return snapshot;
+     }
+
+     /**
+      * Typed convenience wrapper around {@link #clone()}.
+      *
+      * @return the snapshot produced by <code>clone()</code>
+      */
+     public State cloneState()
+     {
+         Object copy = clone();
+         return (State) copy;
+     }
+
+ }
\ No newline at end of file
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/URLUtils.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/URLUtils.java
new file mode 100644
index 00000000000..1956e81886a
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/URLUtils.java
@@ -0,0 +1,60 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
+ * Company:
+ *
+ * @author
+ * @version 1.0
+ */
+import java.net.URL;
+
+/**
+ * Description of the Class
+ *
+ * @author Administrator
+ * @created 27. Januar 2002
+ */
+public class URLUtils
+{
+ /**
+ * does the same as URL.toExternalForm(), but leaves out the Ref part (which we would
+ * cut off anyway) and handles the String Buffer so that no call of expandCapacity() will
+ * be necessary
+ * only meaningful if the default URLStreamHandler is used (as is the case with http, https, or shttp)
+ *
+ * @param u the URL to be converted
+ * @return the URL as String
+ */
+ public static String toExternalFormNoRef(URL u)
+ {
+ String protocol = u.getProtocol();
+ String authority = u.getAuthority();
+ String file = u.getFile();
+
+ StringBuffer result = new StringBuffer(
+ (protocol == null ? 0 : protocol.length()) +
+ (authority == null ? 0 : authority.length()) +
+ (file == null ? 1 : file.length()) + 3
+ );
+
+ result.append(protocol);
+ result.append(":");
+ if (u.getAuthority() != null && u.getAuthority().length() > 0)
+ {
+ result.append("//");
+ result.append(u.getAuthority());
+ }
+ if (u.getFile() != null && u.getFile().length() > 0)
+ {
+ result.append(u.getFile());
+ }
+ else
+ {
+ result.append("/");
+ }
+
+ return result.toString();
+ }
+
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java
new file mode 100644
index 00000000000..e07b63ff58e
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java
@@ -0,0 +1,15 @@
+package de.lanlab.larm.util;
+
+/**
+ * Title: LARM
+ * Description:
+ * Copyright: Copyright (c) 2001
+ * Company: LMU-IP
+ * @author Clemens Marschner
+ * @version 1.0
+ */
+
+
+ // Unchecked exception (it extends RuntimeException) that carries no state and
+ // no message; only the implicit no-argument constructor is available.
+ // NOTE(review): the name suggests it signals a read from an empty container --
+ // confirm at the throw sites.
+ public class UnderflowException extends RuntimeException
+ {
+ }
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java
new file mode 100644
index 00000000000..3287fd51f6b
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java
@@ -0,0 +1,94 @@
+package de.lanlab.larm.util;
+
+
+import java.net.URL;
+import de.lanlab.larm.fetcher.URLMessage;
+
+/**
+ * a web document of whatever type. generated by a fetcher task
+ */
+ public class WebDocument extends URLMessage
+ {
+     /** MIME type of the document. */
+     protected String mimeType;
+     /** Raw document content. */
+     protected byte[] document;
+     /** Result code of the fetch. NOTE(review): presumably the HTTP status -- confirm. */
+     protected int resultCode;
+     /** Document size as given by the creator; not derived from <code>document</code>. */
+     protected int size;
+     /** Document title; may be null. */
+     protected String title;
+
+     /**
+      * Creates a document record. <code>url</code> and <code>referer</code>
+      * are handed to the <code>URLMessage</code> constructor (with
+      * <code>false</code> as its third argument); the remaining values are
+      * stored as given.
+      */
+     public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title)
+     {
+         super(url, referer, false);
+         this.url = url;
+         this.mimeType = mimeType;
+         this.document = document;
+         this.resultCode = resultCode;
+         this.size = size;
+         this.title = title;
+     }
+
+     public String getTitle()
+     {
+         return title;
+     }
+
+     public URL getUrl()
+     {
+         return url;
+     }
+
+     public int getSize()
+     {
+         return this.size;
+     }
+
+     public void setSize(int size)
+     {
+         this.size = size;
+     }
+
+
+     public void setDocument(byte[] document)
+     {
+         this.document = document;
+     }
+
+     public int getResultCode()
+     {
+         return resultCode;
+     }
+
+     public void setResultCode(int resultCode)
+     {
+         this.resultCode = resultCode;
+     }
+
+     /** Returns the stored byte array itself (no copy is made). */
+     public byte[] getDocumentBytes()
+     {
+         return this.document;
+     }
+
+     public void setUrl(URL url)
+     {
+         this.url = url;
+     }
+
+     public void setMimeType(String mimeType)
+     {
+         this.mimeType = mimeType;
+     }
+
+     public String getMimeType()
+     {
+         return mimeType;
+     }
+
+     /**
+      * Returns the superclass info line extended by tab-separated result code,
+      * MIME type, size and the quoted title.
+      * Inside the title, double quotes are replaced by the character 0xff and
+      * line breaks by blanks, so the record stays on one line with intact
+      * quoting. A null title is written as an empty quoted string instead of
+      * failing with a NullPointerException.
+      */
+     public String getInfo()
+     {
+         String base = super.getInfo();
+         String safeTitle = (this.title == null)
+                 ? ""
+                 : this.title.replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ');
+         return base + "\t" +
+                this.resultCode + "\t" +
+                this.mimeType + "\t" +
+                this.size + "\t" +
+                "\"" + safeTitle + "\"";
+     }
+
+
+ }
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java b/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java
new file mode 100644
index 00000000000..73387d14ec4
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java
@@ -0,0 +1,294 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file LICENSE.TXT.
+ */
+
+package hplb.misc;
+
+import java.io.*;
+import java.net.*;
+
+/**
+ * This class is a container for algorithms working on byte arrays - some
+ * of the algorithms are analogous to those in java.lang.String.
+ * @author Anders Kristensen
+ */
+public class ByteArray {
+
+ /** Returns copy of characters in s as a new byte array. */
+ public static final byte[] getBytes(String s) {
+ int len = s.length();
+ byte b[] = new byte[len];
+ s.getBytes(0, len, b, 0);
+ return b;
+ }
+
+ /** Returns contents of file as byte array. */
+ public static byte[] loadFromFile(String filename) throws IOException {
+ return loadFromFile(new File(filename));
+ }
+
+ /** Returns contents of file file as byte array. */
+ public static byte[] loadFromFile(File file) throws IOException {
+ int n, nread = 0, len = (int) file.length();
+ FileInputStream fin = new FileInputStream(file);
+ byte[] content = new byte[len];
+
+ while (nread < len) {
+ if ((n = fin.read(content, nread, len - nread)) == -1)
+ throw new IOException("Error loading Compound from file");
+ nread += n;
+ }
+
+ return content;
+ }
+
+ /**
+ * Reads n bytes from the specified input stream. It will return
+ * fewer bytes if fewer bytes are available on the stream.
+ * Hence the application should check the resulting arrays length.
+ */
+ public static byte[] readn(InputStream in, int n) throws IOException {
+ byte[] buf = new byte[n];
+ int ntotal = 0;
+ int nread;
+
+ while (ntotal < n) {
+ nread = in.read(buf, ntotal, n - ntotal);
+ if (nread < 0) {
+ // we got less than expected - return what we got
+ byte[] newbuf = new byte[ntotal];
+ System.arraycopy(buf, 0, newbuf, 0, ntotal);
+ return newbuf;
+ }
+ ntotal += nread;
+ }
+ return buf;
+ }
+
+ /**
+ * Return contents of a WWW resource identified by a URL.
+ * @param url the resource to retrieve
+ * @return the resource contents as a byte array
+ */
+ public static byte[] getContent(URL url) throws IOException {
+ URLConnection conn = url.openConnection();
+ InputStream in = conn.getInputStream();
+ int length;
+
+ /*
+ * N.B. URLConnection.getContentLength() is buggy for "http" resources
+ * (at least in JDK1.0.2) and won't work for "file" URLs either.
+ */
+ length = length = conn.getContentLength();
+ if (length == -1)
+ length = conn.getHeaderFieldInt("Content-Length", -1);
+ if (length == -1)
+ return readAll(in);
+ return readn(in, length);
+ }
+
+ /**
+ * Read all input from an InputStream and return as a byte array.
+ * This method will not return before the end of the stream is reached.
+ * @return contents of the stream
+ */
+ public static byte[] readAll(InputStream in) throws IOException {
+ byte[] buf = new byte[1024];
+ int nread, ntotal = 0;
+
+ while ((nread = in.read(buf, ntotal, buf.length - ntotal)) > -1) {
+ ntotal += nread;
+ if (ntotal == buf.length) {
+ // extend buffer
+ byte[] newbuf = new byte[buf.length * 2];
+ System.arraycopy(buf, 0, newbuf, 0, buf.length);
+ buf = newbuf;
+ }
+ }
+ if (ntotal < buf.length) {
+ // we cannot have excess space
+ byte[] newbuf = new byte[ntotal];
+ System.arraycopy(buf, 0, newbuf, 0, ntotal);
+ buf = newbuf;
+ }
+ return buf;
+ }
+
+ /**
+ * Copies data from the specified input stream to the output stream
+ * until end of file is met.
+ * @return the total number of bytes written to the output stream
+ */
+ public static int cpybytes(InputStream in, OutputStream out)
+ throws IOException
+ {
+ byte[] buf = new byte[1024];
+ int n, ntotal = 0;
+ while ((n = in.read(buf)) > -1) {
+ out.write(buf, 0, n);
+ ntotal += n;
+ }
+ return ntotal;
+ }
+
+ /**
+ * Copies data from the specified input stream to the output stream
+ * until n bytes has been copied or end of file is met.
+ * @return the total number of bytes written to the output stream
+ */
+ public static int cpybytes(InputStream in, OutputStream out, int n)
+ throws IOException
+ {
+ int sz = n < 1024 ? n : 1024;
+ byte[] buf = new byte[sz];
+ int chunk, nread, ntotal = 0;
+
+ chunk = sz;
+
+ while (ntotal < n && (nread = in.read(buf, 0, chunk)) > -1) {
+ out.write(buf, 0, nread);
+ ntotal += nread;
+ chunk = (n - ntotal < sz) ? n - ntotal : sz;
+ }
+ return ntotal;
+ }
+
+ /**
+ * Returns the index within this String of the first occurrence of the
+ * specified character or -1 if the character is not found.
+ * @params buf the buffer to search
+ * @params ch the character to search for
+ */
+ public static final int indexOf(byte[] buf,
+ int ch) {
+ return indexOf(buf, ch, 0, buf.length);
+ }
+
+ /**
+ * Returns the index within this String of the first occurrence of the
+ * specified character, starting the search at fromIndex. This method
+ * returns -1 if the character is not found.
+ * @params buf the buffer to search
+ * @params ch the character to search for
+ * @params fromIndex the index to start the search from
+ * @params toIndex the highest possible index returned plus 1
+ */
+ public static final int indexOf(byte[] buf,
+ int ch,
+ int fromIndex,
+ int toIndex) {
+ int i;
+
+ for (i = fromIndex; i < toIndex && buf[i] != ch; i++)
+ ; // do nothing
+
+ if (i < toIndex)
+ return i;
+ else
+ return -1;
+ }
+
+ /**
+ * Returns the index of the first occurrence of s in the specified
+ * buffer or -1 if this is not found.
+ */
+ public static final int indexOf(byte[] buf, String s) {
+ return indexOf(buf, s, 0);
+ }
+
+ /**
+ * Returns the index of the first occurrence of s in the specified
+ * buffer. The search starts from fromIndex. This method returns -1
+ * if the index is not found.
+ */
+ public static final int indexOf(byte[] buf, String s, int fromIndex) {
+ int i; // index into buf
+ int j; // index into s
+ int max_i = buf.length;
+ int max_j = s.length();
+
+ for (i = fromIndex; i + max_j <= max_i; i++) {
+ for (j = 0; j < max_j; j++) {
+ if (buf[j + i] != s.charAt(j))
+ break;
+ }
+ if (j == max_j) return i;
+ }
+ return -1;
+ }
+
+/*
+ // for testing indexOf(byte[], String, int)
+ public static void main(String[] args) {
+ byte[] buf = getBytes(args[0]);
+ System.out.println("IndexOf(arg0, arg1, 0) = " + indexOf(buf, args[1], 3));
+ }
+*/
+
+ public static final boolean isSpace(int ch) {
+ if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;
+ else return false;
+ }
+
+ public static final int skipSpaces(byte[] buf, int fromIndex, int toIndex) {
+ int i;
+ for (i = fromIndex; i < toIndex && isSpace(buf[i]); i++)
+ ;
+ return i;
+ }
+ /**
+ * Find byte pattern ptrn in buffer buf.
+ * @return index of first occurrence of ptrn in buf, -1 if no occurence
+ */
+ public static final int findBytes(byte buf[],
+ int off,
+ int len,
+ byte ptrn[]) {
+ // Note: This code is completely incomprehensible without a drawing...
+
+ int buf_len = off + len;
+ int ptrn_len = ptrn.length;
+ int i; // index into buf
+ int j; // index into ptrn;
+ byte b = ptrn[0]; // next byte of interest
+
+ for (i = off; i < buf_len; ) {
+ j = 0;
+ while (i < buf_len && j < ptrn_len && buf[i] == ptrn[j]) {
+ i++;
+ j++;
+ }
+ if (i == buf_len || j == ptrn_len)
+ return i - j;
+ else {
+ // We have to go back a bit as there may be an overlapping
+ // match starting a bit later in buf...
+ i = i - j + 1;
+ }
+ }
+ return -1;
+ }
+
+/*
+ // for testing findBytes(byte[], int, int, byte[])
+ public static void main(String args[]) {
+ if (args.length < 4) {
+ System.err.println("Usage: s1 off len s2");
+ System.exit(1);
+ }
+ byte b1[] = new byte[args[0].length()];
+ byte b2[] = new byte[args[3].length()];
+ args[0].getBytes(0, args[0].length(), b1, 0);
+ args[3].getBytes(0, args[3].length(), b2, 0);
+ int off = Integer.parseInt(args[1]);
+ int len = Integer.parseInt(args[2]);
+ System.out.println("Index = " + findBytes(b1, off, len, b2));
+ }
+*/
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Attribute.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Attribute.java
new file mode 100644
index 00000000000..3d7a4dcbfc2
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Attribute.java
@@ -0,0 +1,20 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface Attribute {
+
+ public String getName();
+ public Node getValue();
+ public void setValue(Node arg);
+
+ public boolean getSpecified();
+ public void setSpecified(boolean arg);
+
+ public String toString();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/AttributeList.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/AttributeList.java
new file mode 100644
index 00000000000..5339b89c18d
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/AttributeList.java
@@ -0,0 +1,16 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface AttributeList {
+ public Attribute getAttribute(String attrName);
+ public Attribute setAttribute(Attribute attr);
+ public Attribute remove(String attrName);
+ public Attribute item(int index);
+ public int getLength();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Comment.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Comment.java
new file mode 100644
index 00000000000..17d54913e37
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Comment.java
@@ -0,0 +1,13 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ * Represents the content of comments: <!-- ... -->
+ */
+public interface Comment extends Node {
+ public String getData();
+ public void setData(String arg);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java
new file mode 100644
index 00000000000..75608773cc0
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java
@@ -0,0 +1,13 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface DOM {
+ public Document createDocument(String type);
+ public boolean hasFeature(String feature);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java
new file mode 100644
index 00000000000..7c71b5e18e6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java
@@ -0,0 +1,28 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface Document extends DocumentFragment {
+ public Node getDocumentType();
+ public void setDocumentType(Node arg);
+
+ public Element getDocumentElement();
+ public void setDocumentElement(Element arg);
+
+ public DocumentContext getContextInfo();
+ public void setContextInfo(DocumentContext arg);
+
+ public DocumentContext createDocumentContext();
+ public Element createElement(String tagName, AttributeList attributes);
+ public Text createTextNode(String data);
+ public Comment createComment(String data);
+ public PI createPI(String name, String data);
+ public Attribute createAttribute(String name, Node value);
+ public AttributeList createAttributeList();
+ public NodeIterator getElementsByTagName();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java
new file mode 100644
index 00000000000..508c6292249
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java
@@ -0,0 +1,14 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface DocumentContext {
+
+ public Document getDocument();
+ public void setDocument(Document arg);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java
new file mode 100644
index 00000000000..3cae0af68ed
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java
@@ -0,0 +1,13 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface DocumentFragment extends Node {
+ public Document getMasterDoc();
+ public void setMasterDoc(Document arg);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java
new file mode 100644
index 00000000000..8240ffa5e98
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java
@@ -0,0 +1,16 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface Element extends Node {
+ public String getTagName();
+ public AttributeList attributes();
+ public void setAttribute(Attribute newAttr);
+ public void normalize();
+ public NodeIterator getElementsByTagName();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile
new file mode 100644
index 00000000000..946af9eb603
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile
@@ -0,0 +1,38 @@
+ # This Makefile generated by hplb.util.jmkmf
+ # Java package is org.w3c.dom
+ # NOTE(review): the sources in this directory declare "package hplb.org.w3c.dom";
+ # JPACKAGE (used only by the doc target) may need the "hplb." prefix -- confirm.
+
+ .SUFFIXES: .java .class .jj
+ JPACKAGE = org.w3c.dom
+ JAVA = java
+ JAVAC = javac
+ JAVACC = java COM.sun.labs.javacc.Main
+ JFLAGS =
+ OBJS = \
+ 	Attribute.class \
+ 	AttributeList.class \
+ 	Comment.class \
+ 	DOM.class \
+ 	Document.class \
+ 	DocumentContext.class \
+ 	DocumentFragment.class \
+ 	Element.class \
+ 	Node.class \
+ 	NodeIterator.class \
+ 	PI.class \
+ 	Text.class \
+ 	TreeIterator.class
+ JAVADOCFLAGS = -d ../../../doc/api -author -noindex -notree
+
+ # These targets do not name files.
+ .PHONY: all doc clean
+
+ all: $(OBJS)
+
+ doc:
+ 	javadoc $(JAVADOCFLAGS) $(JPACKAGE)
+
+ # Suffix rules must not list prerequisites: GNU make would otherwise treat
+ # ".jj.java" and ".java.class" as ordinary file targets and the implicit
+ # source-to-target conversion would never apply.
+ .jj.java:
+ 	$(JAVACC) $<
+
+ .java.class:
+ 	$(JAVAC) $(JFLAGS) $<
+
+ clean:
+ 	rm -f *.class *~
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java
new file mode 100644
index 00000000000..7587fce2830
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java
@@ -0,0 +1,29 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface Node {
+ // NodeType
+ public static final int DOCUMENT = 1;
+ public static final int ELEMENT = 2;
+ public static final int ATTRIBUTE = 3;
+ public static final int PI = 4;
+ public static final int COMMENT = 5;
+ public static final int TEXT = 6;
+
+ public int getNodeType();
+ public Node getParentNode();
+ public NodeIterator getChildNodes();
+ public boolean hasChildNodes();
+ public Node getFirstChild();
+ public Node getPreviousSibling();
+ public Node getNextSibling();
+ public Node insertBefore(Node newChild, Node refChild);
+ public Node replaceChild(Node newChild, Node oldChild);
+ public Node removeChild(Node oldChild);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java
new file mode 100644
index 00000000000..9194fb74d31
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java
@@ -0,0 +1,19 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface NodeIterator {
+ public int getLength();
+ public Node getCurrent();
+ public Node toNext();
+ public Node toPrevious();
+ public Node toFirst();
+ public Node toLast();
+ public Node toNth(int Nth);
+ public Node toNode(Node destNode);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java
new file mode 100644
index 00000000000..af63d9f94d6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java
@@ -0,0 +1,16 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ * Processing Instruction
+ */
+public interface PI extends Node {
+ public String getName();
+ public void setName(String arg);
+
+ public String getData();
+ public void setData(String arg);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java
new file mode 100644
index 00000000000..2490c9ecabe
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java
@@ -0,0 +1,19 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface Text extends Node {
+ public String getData();
+ public void setData(String arg);
+
+ public void append(String data);
+ public void insert(int offset, String data);
+ public void delete(int offset, int count);
+ public void replace(int offset, int count, String data);
+ public void splice(Element element, int offset, int count);
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java
new file mode 100644
index 00000000000..bdb2339c286
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java
@@ -0,0 +1,20 @@
+/*
+ * $Id$
+ */
+
+package hplb.org.w3c.dom;
+
+/**
+ *
+ */
+public interface TreeIterator extends NodeIterator {
+ public int numChildren();
+ public int numPreviousSiblings();
+ public int numNextSiblings();
+ public Node toParent();
+ public Node toPreviousSibling();
+ public Node toNextSibling();
+ public Node toFirstChild();
+ public Node toLastChild();
+ public Node toNthChild();
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java
new file mode 100644
index 00000000000..ef71ebaccd6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java
@@ -0,0 +1,146 @@
+// $Id$
+
+package hplb.org.xml.sax;
+
+import java.util.Enumeration;
+
+/**
+ * A map of attributes for the current element.
+ * startElement
callback: if you need to use attribute
+ * information elsewhere, you will need to make your own copies.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.startElement
.DOCTYPE
declaration.attributes
+ * parameter will be accurate only for the duration of this handler:
+ * if you need to use the information elsewhere, you should copy
+ * it.
+ * String data = new String(ch, start, length);
+ *
+ * @param ch An array of characters.
+ * @param start The starting position in the array.
+ * @param length The number of characters to use in the array.
+ * @exception java.lang.Exception You may throw any exception.
+ */
+ public void characters (char ch[], int start, int length) // the data is the length chars of ch beginning at index start
+ throws Exception;
+
+
+ /**
+ * Handle ignorable whitespace.
+ * The whitespace can be obtained as a String from the arguments like this:
+ * String whitespace = new String(ch, start, length);
+ *
+ * @param ch An array of whitespace characters.
+ * @param start The starting position in the array.
+ * @param length The number of characters to use in the array.
+ * @exception java.lang.Exception You may throw any exception.
+ */
+ public void ignorable (char ch[], int start, int length)
+ throws Exception;
+
+
+ /**
+ * Handle a processing instruction.
+ * System.err
, and will throw an (unspecified)
+ * exception for fatal errors.
gets written as
- the following line changes
+ // this tp
which is even worse - we should distinguish between
+ // those two types of empty elements.
+ current.insertBefore(elm, null);
+ if (!isEmptyElm(name)) current = elm;
+ }
+
+ public void endElement(String name) { // name: the tag name of the end tag being processed
+ // Walk up the tree from the current node until a node whose tag name
+ // equals this end tag is found. Elements passed over on the way up are
+ // thereby closed implicitly, i.e. without having had their own end tag.
+
+ //System.out.println("CURRENT: " + current);
+
+ Node node = current; // begin at the node new children are currently inserted into
+ for (;;) {
+ if (node == root) { // reached the root without finding a matching element
+ err.println("Stray end tag ignored: " + name +
+ " line " + tok.line + " column " + tok.column);
+ return; // current is left unchanged
+ } else if (name.equals(((Element) node).getTagName())) {
+ current = node.getParentNode(); // match: continue in the parent of the closed element
+ return;
+ } else {
+ node = node.getParentNode(); // no match: try the next ancestor
+ }
+ }
+ }
+
+ public void characters(char[] ch, int start, int length) { // adds the given chars as a text node under current
+ current.insertBefore( // null reference node: presumably appends as last child - confirm
+ root.createTextNode(new String(ch, start, length)), null);
+ }
+
+ public void ignorable (char ch[], int start, int length) { // the whitespace is only printed, nothing is added to the tree
+ System.out.println("Ignorable ws: " + new String(ch, start, length)); // NOTE(review): goes to System.out, not err - confirm this is intended
+ }
+
+ public void processingInstruction(String target, String remainder) { // adds a PI node built from target and remainder under current
+ // FIXME: the DOM says 2nd arg should be everything between "<?" and "?>"
+ current.insertBefore(root.createPI(target, remainder), null);
+ }
+
+ public AttributeList getDOMAttrs(AttributeMap attrs) { // builds a new AttributeList holding every attribute named in attrs
+ String name;
+ Node value;
+ Enumeration e;
+ AttributeList domAttrs = root.createAttributeList(); // created through root, as are all nodes below
+
+ for (e = attrs.getAttributeNames(); e.hasMoreElements(); ) {
+ name = (String) e.nextElement(); // attribute names are enumerated as Strings
+ value = root.createTextNode(attrs.getValue(name)); // the attribute value is wrapped in a text node
+ domAttrs.setAttribute(root.createAttribute(name, value));
+ }
+ return domAttrs;
+ }
+
+ // for debugging: parses System.in and hands the resulting Document to Utils.pp with System.out
+ public static void main(String[] args) throws Exception { // args is not used
+ Parser parser = new Parser();
+ Document doc = parser.parse(System.in);
+ Utils.pp(doc, System.out);
+ }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java
new file mode 100644
index 00000000000..69bee4117e4
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java
@@ -0,0 +1,229 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file LICENSE.TXT.
+ */
+
+package hplb.xml;
+
+import hplb.org.xml.sax.AttributeMap;
+import java.util.Enumeration;
+
+/**
+ * An ordered Dictionary. keys() and elements() return Enumerations
+ * which enumerate over elements in the order they were inserted.
+ * Elements are stored linearly. Operations put(), get(), and remove()
+ * are linear in the number of elements in the Dictionary.
+ *
+ *
+ * java hplb.www.client.UrlScan [-t] [-v] [-h proxy-host] [-p proxy-port] URL
+ * where -t means test validity of embedded URLs and
+ * -v means be verbose
+ *
+ *
+ * @author Anders Kristensen
+ */
+public class UrlScanner implements HtmlObserver {
+
+ // should use getenv and/or getProperty for these:
+ static String proxyHost;
+ static String proxyPort;
+ static boolean test;
+ static boolean verbose;
+
+ public static void usage() {
+ PrintStream out = System.out;
+ out.println("Usage: UrlScan [-v] [-t]