diff --git a/sandbox/contributions/webcrawler-LARM/build.sh b/sandbox/contributions/webcrawler-LARM/build.sh new file mode 100755 index 00000000000..384c3ab9e68 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/build.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +#clean +echo cleaning +rm -r build +rm -r classes +rm -r cachingqueue +rm -r logs + +#build +echo making build directory +mkdir build +cd build +echo extracting http client +jar xvf ../lib/HTTPClient.zip >/dev/null +cd .. +cp -r src/* build +mkdir classes +echo compiling +javac -g -d classes -sourcepath build build/HTTPClient/*.java +javac -g -classpath ./lib/jakarta-oro-2.0.5.jar -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java + + diff --git a/sandbox/contributions/webcrawler-LARM/clean.sh b/sandbox/contributions/webcrawler-LARM/clean.sh new file mode 100755 index 00000000000..65c222feba1 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/clean.sh @@ -0,0 +1,5 @@ +#!/bin/sh +./cleanlastrun.sh +rm -r build +rm -r classes + diff --git a/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh b/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh new file mode 100755 index 00000000000..730d2165b55 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/cleanlastrun.sh @@ -0,0 +1,4 @@ +#!/bin/sh +rm -r logs +rm -r cachingqueue + diff --git a/sandbox/contributions/webcrawler-LARM/og-build.sh b/sandbox/contributions/webcrawler-LARM/og-build.sh new file mode 100755 index 00000000000..5ce5c7dd214 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/og-build.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +#clean +echo cleaning +rm -r build +rm -r classes +rm -r cachingqueue +rm -r logs + +#build +echo making build directory +mkdir build +cd build +#echo extracting http client +#jar xvf ../lib/HTTPClient.zip >/dev/null +cd .. 
+cp -r src/* build +mkdir classes +echo compiling +#javac -g -d classes -sourcepath build build/HTTPClient/*.java +javac -g -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java + + diff --git a/sandbox/contributions/webcrawler-LARM/run.sh b/sandbox/contributions/webcrawler-LARM/run.sh new file mode 100755 index 00000000000..4af92d2fed6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/run.sh @@ -0,0 +1,4 @@ +#!/bin/sh +rm -r logs +mkdir logs +java -server -Xmx400m -classpath classes:lib/jakarta-oro-2.0.5.jar de.lanlab.larm.fetcher.FetcherMain -start http://www.cis.uni-muenchen.de/ -restrictto 'http://[^/]*\.uni-muenchen\.de.*' -threads 15 diff --git a/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java new file mode 100644 index 00000000000..994caec61f6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/ContentEncodingModule.java @@ -0,0 +1,278 @@ +/* + * @(#)ContentEncodingModule.java 0.3-3 06/05/2001 + * + * This file is part of the HTTPClient package + * Copyright (C) 1996-2001 Ronald Tschalär + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307, USA + * + * For questions, suggestions, bug-reports, enhancement-requests etc. + * I may be contacted at: + * + * ronald@innovation.ch + * + * The HTTPClient's home page is located at: + * + * http://www.innovation.ch/java/HTTPClient/ + * + */ +package HTTPClient; + +import java.io.IOException; +import java.util.Vector; +import java.util.zip.InflaterInputStream; +import java.util.zip.GZIPInputStream; + +/** + * This module handles the Content-Encoding response header. It currently + * handles the "gzip", "deflate", "compress" and "identity" tokens. + * + * @author Ronald Tschalär + * @created December 29, 2001 + * @version 0.3-3 06/05/2001 + */ +public class ContentEncodingModule implements HTTPClientModule +{ + // Methods + + /** + * Adds the encodings this module can decode to the request's Accept-Encoding header. + * + * @param req the request whose headers are read and updated + * @param resp not used by this module + * @return always REQ_CONTINUE + * @exception ModuleException if the Accept-Encoding header cannot be parsed or "*" has an invalid q value + */ + public int requestHandler(Request req, Response[] resp) + throws ModuleException + { + // parse Accept-Encoding header + + int idx, pidx; // idx indexes hdrs; pidx indexes the "*" params, kept separate so hdrs[idx] below stays valid + NVPair[] hdrs = req.getHeaders(); + for (idx = 0; idx < hdrs.length; idx++) + { + if (hdrs[idx].getName().equalsIgnoreCase("Accept-Encoding")) + { + break; + } + } + + Vector pae; + if (idx == hdrs.length) + { + hdrs = Util.resizeArray(hdrs, idx + 1); + req.setHeaders(hdrs); + pae = new Vector(); + } + else + { + try + { + pae = Util.parseHeader(hdrs[idx].getValue()); + } + catch (ParseException pe) + { + throw new ModuleException(pe.toString()); + } + } + + // done if "*;q=1.0" present + + HttpHeaderElement all = Util.getElement(pae, "*"); + if (all != null) + { + NVPair[] params = all.getParams(); + for (pidx = 0; pidx < params.length; pidx++) + { + if 
(params[pidx].getName().equalsIgnoreCase("q")) + { + break; + } + } + + if (pidx == params.length) + { + // no qvalue, i.e. q=1.0 + return REQ_CONTINUE; + } + + if (params[pidx].getValue() == null || + params[pidx].getValue().length() == 0) + { + throw new ModuleException("Invalid q value for \"*\" in " + + "Accept-Encoding header: "); + } + + try + { + if (Float.valueOf(params[pidx].getValue()).floatValue() > 0.) + { + return REQ_CONTINUE; + } + } + catch (NumberFormatException nfe) + { + throw new ModuleException("Invalid q value for \"*\" in " + + "Accept-Encoding header: " + nfe.getMessage()); + } + } + + // Add gzip, deflate and compress tokens to the Accept-Encoding header + + if (!pae.contains(new HttpHeaderElement("deflate"))) + { + pae.addElement(new HttpHeaderElement("deflate")); + } + if (!pae.contains(new HttpHeaderElement("gzip"))) + { + pae.addElement(new HttpHeaderElement("gzip")); + } + if (!pae.contains(new HttpHeaderElement("x-gzip"))) + { + pae.addElement(new HttpHeaderElement("x-gzip")); + } + if (!pae.contains(new HttpHeaderElement("compress"))) + { + pae.addElement(new HttpHeaderElement("compress")); + } + if (!pae.contains(new HttpHeaderElement("x-compress"))) + { + pae.addElement(new HttpHeaderElement("x-compress")); + } + + hdrs[idx] = new NVPair("Accept-Encoding", Util.assembleHeader(pae)); + + return REQ_CONTINUE; + } + + + /** + * Does nothing; this module takes no action in response phase 1. + * + * @param resp the response (unused) + * @param req the request (unused) + */ + public void responsePhase1Handler(Response resp, RoRequest req) + { + } + + + /** + * Does nothing in response phase 2 beyond letting processing continue. + * + * @param resp the response (unused) + * @param req the request (unused) + * @return always RSP_CONTINUE + */ + public int responsePhase2Handler(Response resp, Request req) + { + return RSP_CONTINUE; + } + + + /** + * Invoked by the HTTPClient. 
+ * + * @param resp the response whose Content-Encoding header and input stream are processed + * @param req the request that produced the response + * @exception IOException if a decoding stream cannot be created + * @exception ModuleException if the Content-Encoding header cannot be parsed + */ + public void responsePhase3Handler(Response resp, RoRequest req) + throws IOException, ModuleException + { + String ce = resp.getHeader("Content-Encoding"); + if (ce == null || req.getMethod().equals("HEAD") || + resp.getStatusCode() == 206) + { + return; + } + + Vector pce; + try + { + pce = Util.parseHeader(ce); + } + catch (ParseException pe) + { + throw new ModuleException(pe.toString()); + } + + if (pce.size() == 0) + { + return; + } + + String encoding = ((HttpHeaderElement) pce.lastElement()).getName(); // codings are listed in the order applied, so undo the last one first + if (encoding.equalsIgnoreCase("gzip") || + encoding.equalsIgnoreCase("x-gzip")) + { + Log.write(Log.MODS, "CEM: pushing gzip-input-stream"); + + resp.inp_stream = new GZIPInputStream(resp.inp_stream); + pce.removeElementAt(pce.size() - 1); + resp.deleteHeader("Content-length"); + } + else if (encoding.equalsIgnoreCase("deflate")) + { + Log.write(Log.MODS, "CEM: pushing inflater-input-stream"); + + resp.inp_stream = new InflaterInputStream(resp.inp_stream); + pce.removeElementAt(pce.size() - 1); + resp.deleteHeader("Content-length"); + } + else if (encoding.equalsIgnoreCase("compress") || + encoding.equalsIgnoreCase("x-compress")) + { + Log.write(Log.MODS, "CEM: pushing uncompress-input-stream"); + + resp.inp_stream = new UncompressInputStream(resp.inp_stream); + pce.removeElementAt(pce.size() - 1); + resp.deleteHeader("Content-length"); + } + else if (encoding.equalsIgnoreCase("identity")) + { + Log.write(Log.MODS, "CEM: ignoring 'identity' token"); + pce.removeElementAt(pce.size() - 1); + } + else + { + Log.write(Log.MODS, "CEM: Unknown content encoding '" + + encoding + "'"); + } + + if (pce.size() > 0) + { + resp.setHeader("Content-Encoding", Util.assembleHeader(pce)); + } + else + { + resp.deleteHeader("Content-Encoding"); + } + } + + + /** 
+ * Invoked by the HTTPClient. + * + * @param resp Description of the Parameter + * @param req Description of the Parameter + */ + public void trailerHandler(Response resp, RoRequest req) + { + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java new file mode 100644 index 00000000000..ba9309cb84c --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPConnection.java @@ -0,0 +1,4489 @@ +/* + * @(#)HTTPConnection.java 0.3-3 06/05/2001 + * + * This file is part of the HTTPClient package + * Copyright (C) 1996-2001 Ronald Tschalär + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307, USA + * + * For questions, suggestions, bug-reports, enhancement-requests etc. 
+ * I may be contacted at: + * + * ronald@innovation.ch + * + * The HTTPClient's home page is located at: + * + * http://www.innovation.ch/java/HTTPClient/ + * + */ +package HTTPClient; + +import java.io.OutputStream; +import java.io.DataOutputStream; +import java.io.FilterOutputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InterruptedIOException; +import java.net.URL; +import java.net.Socket; +import java.net.InetAddress; +import java.net.SocketException; +import java.net.ConnectException; +import java.net.UnknownHostException; +import java.net.NoRouteToHostException; +import java.util.Vector; +import java.applet.Applet; + +/** + * This class implements http protocol requests; it contains most of HTTP/1.1 + * and ought to be unconditionally compliant. Redirections are automatically + * handled, and authorizations requests are recognized and dealt with via an + * authorization handler. Only full HTTP/1.0 and HTTP/1.1 requests are + * generated. HTTP/1.1, HTTP/1.0 and HTTP/0.9 responses are recognized.

+ * + * Using the HTTPClient should be quite simple. First add the import statement ' + * import HTTPClient.*;' to your file(s). Requests can then be sent + * using one of the methods Head() , Get() , Post() + * , etc in HTTPConnection . These methods all return an + * instance of HTTPResponse which has methods for accessing the + * response headers (getHeader() , getHeaderAsInt() , + * etc), various response info (getStatusCode() , + * getReasonLine() , etc) and the response data (getData() , + * getText() , and getInputStream() ). Following are some + * examples.

+ * + * If this is in an applet you can retrieve files from your server as follows: + *

+ *     try
+ *     {
+ *         HTTPConnection con = new HTTPConnection(this);
+ *         HTTPResponse   rsp = con.Get("/my_file");
+ *         if (rsp.getStatusCode() >= 300)
+ *         {
+ *             System.err.println("Received Error: "+rsp.getReasonLine());
+ *             System.err.println(rsp.getText());
+ *         }
+ *         else
+ *             data = rsp.getData();
+ *
+ *         rsp = con.Get("/another_file");
+ *         if (rsp.getStatusCode() >= 300)
+ *         {
+ *             System.err.println("Received Error: "+rsp.getReasonLine());
+ *             System.err.println(rsp.getText());
+ *         }
+ *         else
+ *             other_data = rsp.getData();
+ *     }
+ *     catch (IOException ioe)
+ *     {
+ *         System.err.println(ioe.toString());
+ *     }
+ *     catch (ModuleException me)
+ *     {
+ *         System.err.println("Error handling request: " + me.getMessage());
+ *     }
+ * 
This will get the files "/my_file" and "/another_file" and put their + * contents into byte[]'s accessible via getData(). Note that you + * only need to create a new HTTPConnection when sending a request to + * a new server (different host or port); although you may create a new + * HTTPConnection for every request to the same server this is not + * recommended, as various information about the server is cached + * after the first request (to optimize subsequent requests) and persistent + * connections are used whenever possible.

+ * + * To POST form data you would use something like this (assuming you have two + * fields called name and e-mail , whose contents are + * stored in the variables name and email ):

+ *     try
+ *     {
+ *         NVPair form_data[] = new NVPair[2];
+ *         form_data[0] = new NVPair("name", name);
+ *         form_data[1] = new NVPair("e-mail", email);
+ *
+ *         HTTPConnection con = new HTTPConnection(this);
+ *         HTTPResponse   rsp = con.Post("/cgi-bin/my_script", form_data);
+ *         if (rsp.getStatusCode() >= 300)
+ *         {
+ *             System.err.println("Received Error: "+rsp.getReasonLine());
+ *             System.err.println(rsp.getText());
+ *         }
+ *         else
+ *             stream = rsp.getInputStream();
+ *     }
+ *     catch (IOException ioe)
+ *     {
+ *         System.err.println(ioe.toString());
+ *     }
+ *     catch (ModuleException me)
+ *     {
+ *         System.err.println("Error handling request: " + me.getMessage());
+ *     }
+ * 
Here the response data is read at leisure via an InputStream + * instead of all at once into a byte[] .

+ * + * As another example, if you have a URL you're trying to send a request to you + * would do something like the following:

+ *     try
+ *     {
+ *         URL url = new URL("http://www.mydomain.us/test/my_file");
+ *         HTTPConnection con = new HTTPConnection(url);
+ *         HTTPResponse   rsp = con.Put(url.getFile(), "Hello World");
+ *         if (rsp.getStatusCode() >= 300)
+ *         {
+ *             System.err.println("Received Error: "+rsp.getReasonLine());
+ *             System.err.println(rsp.getText());
+ *         }
+ *         else
+ *             text = rsp.getText();
+ *     }
+ *     catch (IOException ioe)
+ *     {
+ *         System.err.println(ioe.toString());
+ *     }
+ *     catch (ModuleException me)
+ *     {
+ *         System.err.println("Error handling request: " + me.getMessage());
+ *     }
+ * 

+ * + * There are a whole number of methods for each request type; however the + * general forms are ([...] means that the enclosed is optional): + *

+ * + * + * @author Ronald Tschalär + * @created 29. Dezember 2001 + * @version 0.3-3 06/05/2001 + */ +public class HTTPConnection implements GlobalConstants, HTTPClientModuleConstants +{ + /** + * The current version of this package. + */ + public final static String version = "RPT-HTTPClient/0.3-3"; + + /** + * The default context + */ + private final static Object dflt_context = new Object(); + + /** + * The current context + */ + private Object Context = null; + + /** + * The protocol used on this connection + */ + private int Protocol; + + /** + * The server's protocol version; M.m stored as (M<<16 | m) + */ + int ServerProtocolVersion; + + /** + * Have we gotten the server's protocol version yet? + */ + boolean ServProtVersKnown; + + /** + * The protocol version we send in a request; this is always HTTP/1.1 unless + * we're talking to a broken server in which case it's HTTP/1.0 + */ + private String RequestProtocolVersion; + + /** + * The remote host this connection is associated with + */ + private String Host; + + /** + * The remote port this connection is attached to + */ + private int Port; + + /** + * The local address this connection is associated with + */ + private InetAddress LocalAddr; + + /** + * The local port this connection is attached to + */ + private int LocalPort; + + /** + * The current proxy host to use (if any) + */ + private String Proxy_Host = null; + + /** + * The current proxy port + */ + private int Proxy_Port; + + /** + * The default proxy host to use (if any) + */ + private static String Default_Proxy_Host = null; + + /** + * The default proxy port + */ + private static int Default_Proxy_Port; + + /** + * The list of hosts for which no proxy is to be used + */ + private static CIHashtable non_proxy_host_list = new CIHashtable(); + private static Vector non_proxy_dom_list = new Vector(); + private static Vector non_proxy_addr_list = new Vector(); + private static Vector non_proxy_mask_list = new Vector(); + + /** + * The socks 
server to use + */ + private SocksClient Socks_client = null; + + /** + * The default socks server to use + */ + private static SocksClient Default_Socks_client = null; + + /** + * the current stream demultiplexor + */ + private StreamDemultiplexor input_demux = null; + + /** + * a list of active stream demultiplexors + */ + LinkedList DemuxList = new LinkedList(); + + /** + * a list of active requests + */ + private LinkedList RequestList = new LinkedList(); + + /** + * does the server support keep-alive's? + */ + private boolean doesKeepAlive = false; + + /** + * have we been able to determine the above yet? + */ + private boolean keepAliveUnknown = true; + + /** + * the maximum number of requests over a HTTP/1.0 keep-alive connection + */ + private int keepAliveReqMax = -1; + + /** + * the number of requests over a HTTP/1.0 keep-alive connection left + */ + private int keepAliveReqLeft; + + /** + * hack to force buffering of data instead of using chunked T-E + */ + private static boolean no_chunked = false; + + /** + * hack to force HTTP/1.0 requests + */ + private static boolean force_1_0 = false; + + /** + * hack to be able to disable pipelining + */ + private static boolean neverPipeline = false; + + /** + * hack to be able to disable keep-alives + */ + private static boolean noKeepAlives = false; + + /** + * hack to work around M$ bug + */ + private static boolean haveMSLargeWritesBug = false; + + /** + * hack to only enable defered handling of streamed requests when configured + * to do so. 
+ */ + static boolean deferStreamed = false; + + /** + * the default timeout to use for new connections + */ + private static int DefaultTimeout = 0; + + /** + * the timeout to use for reading responses + */ + private int Timeout; + + /** + * The list of default http headers + */ + private NVPair[] DefaultHeaders = new NVPair[0]; + + /** + * The default list of modules (as a Vector of Class objects) + */ + private static Vector DefaultModuleList; + + /** + * The list of modules (as a Vector of Class objects) + */ + private Vector ModuleList; + + /** + * controls whether modules are allowed to interact with user + */ + private static boolean defaultAllowUI = true; + + /** + * controls whether modules are allowed to interact with user + */ + private boolean allowUI; + + static + { + /* + * Let's try and see if we can figure out whether any proxies are + * being used. + */ + try + { + // JDK 1.1 naming + + String host = System.getProperty("http.proxyHost"); + if (host == null) + { + throw new Exception(); + } + // try JDK 1.0.x naming + int port = Integer.getInteger("http.proxyPort", -1).intValue(); + + Log.write(Log.CONN, "Conn: using proxy " + host + ":" + port); + setProxyServer(host, port); + } + catch (Exception e) + { + try + { + // JDK 1.0.x naming + + if (Boolean.getBoolean("proxySet")) + { + String host = System.getProperty("proxyHost"); + int port = Integer.getInteger("proxyPort", -1).intValue(); + Log.write(Log.CONN, "Conn: using proxy " + host + ":" + port); + setProxyServer(host, port); + } + } + catch (Exception ee) + { + Default_Proxy_Host = null; + } + } + + /* + * now check for the non-proxy list + */ + try + { + String hosts = System.getProperty("HTTPClient.nonProxyHosts"); + if (hosts == null) + { + hosts = System.getProperty("http.nonProxyHosts"); + } + + String[] list = Util.splitProperty(hosts); + dontProxyFor(list); + } + catch (Exception e) + { + } + + /* + * we can't turn the JDK SOCKS handling off, so we don't use the + * properties 
'socksProxyHost' and 'socksProxyPort'. Instead we + * define 'HTTPClient.socksHost', 'HTTPClient.socksPort' and + * 'HTTPClient.socksVersion'. + */ + try + { + String host = System.getProperty("HTTPClient.socksHost"); + if (host != null && host.length() > 0) + { + int port = Integer.getInteger("HTTPClient.socksPort", -1).intValue(); + int version = Integer.getInteger("HTTPClient.socksVersion", -1).intValue(); + Log.write(Log.CONN, "Conn: using SOCKS " + host + ":" + port); + if (version == -1) + { + setSocksServer(host, port); + } + else + { + setSocksServer(host, port, version); + } + } + } + catch (Exception e) + { + Default_Socks_client = null; + } + + // Set up module list + + String modules = "HTTPClient.RetryModule|" + + "HTTPClient.CookieModule|" + + "HTTPClient.RedirectionModule|" + + "HTTPClient.AuthorizationModule|" + + "HTTPClient.DefaultModule|" + + "HTTPClient.TransferEncodingModule|" + + "HTTPClient.ContentMD5Module|" + + "HTTPClient.ContentEncodingModule"; + + boolean in_applet = false; + try + { + modules = System.getProperty("HTTPClient.Modules", modules); + } + catch (SecurityException se) + { + in_applet = true; + } + + DefaultModuleList = new Vector(); + String[] list = Util.splitProperty(modules); + for (int idx = 0; idx < list.length; idx++) + { + try + { + DefaultModuleList.addElement(Class.forName(list[idx])); + Log.write(Log.CONN, "Conn: added module " + list[idx]); + } + catch (ClassNotFoundException cnfe) + { + if (!in_applet) + { + throw new NoClassDefFoundError(cnfe.getMessage()); + } + + /* + * Just ignore it. This allows for example applets to just + * load the necessary modules - if you don't need a module + * then don't provide it, and it won't be added to the + * list. The disadvantage is that if you accidently misstype + * a module name this will lead to a "silent" error. 
+ */ + } + } + + /* + * Hack: disable pipelining + */ + try + { + neverPipeline = Boolean.getBoolean("HTTPClient.disable_pipelining"); + if (neverPipeline) + { + Log.write(Log.CONN, "Conn: disabling pipelining"); + } + } + catch (Exception e) + { + } + + /* + * Hack: disable keep-alives + */ + try + { + noKeepAlives = Boolean.getBoolean("HTTPClient.disableKeepAlives"); + if (noKeepAlives) + { + Log.write(Log.CONN, "Conn: disabling keep-alives"); + } + } + catch (Exception e) + { + } + + /* + * Hack: force HTTP/1.0 requests + */ + try + { + force_1_0 = Boolean.getBoolean("HTTPClient.forceHTTP_1.0"); + if (force_1_0) + { + Log.write(Log.CONN, "Conn: forcing HTTP/1.0 requests"); + } + } + catch (Exception e) + { + } + + /* + * Hack: prevent chunking of request data + */ + try + { + no_chunked = Boolean.getBoolean("HTTPClient.dontChunkRequests"); + if (no_chunked) + { + Log.write(Log.CONN, "Conn: never chunking requests"); + } + } + catch (Exception e) + { + } + + /* + * M$ bug: large writes hang the stuff + */ + try + { + if (System.getProperty("os.name").indexOf("Windows") >= 0 && + System.getProperty("java.version").startsWith("1.1")) + { + haveMSLargeWritesBug = true; + } + if (haveMSLargeWritesBug) + { + Log.write(Log.CONN, "Conn: splitting large writes into 20K chunks (M$ bug)"); + } + } + catch (Exception e) + { + } + + /* + * Deferring the handling of responses to requests which used an output + * stream is new in V0.3-3. Because it can cause memory leaks for apps + * which aren't expecting this, we only enable this feature if + * explicitly requested to do so. + */ + try + { + deferStreamed = Boolean.getBoolean("HTTPClient.deferStreamed"); + if (deferStreamed) + { + Log.write(Log.CONN, "Conn: enabling defered handling of " + + "responses to streamed requests"); + } + } + catch (Exception e) + { + } + } + + + // Constructors + + /** + * Constructs a connection to the host from where the applet was loaded. 
+ * Note that current security policies only let applets connect home. + * + * @param applet the current applet + * @exception ProtocolNotSuppException Description of the Exception + */ + public HTTPConnection(Applet applet) + throws ProtocolNotSuppException + { + this(applet.getCodeBase().getProtocol(), + applet.getCodeBase().getHost(), + applet.getCodeBase().getPort()); + } + + + /** + * Constructs a connection to the specified host on port 80 + * + * @param host the host + */ + public HTTPConnection(String host) + { + Setup(HTTP, host, 80, null, -1); + } + + + /** + * Constructs a connection to the specified host on the specified port + * + * @param host the host + * @param port the port + */ + public HTTPConnection(String host, int port) + { + Setup(HTTP, host, port, null, -1); + } + + + /** + * Constructs a connection to the specified host on the specified port, + * using the specified protocol (currently only "http" is supported). + * + * @param prot the protocol + * @param host the host + * @param port the port, or -1 for the default port + * @exception ProtocolNotSuppException if the protocol is not HTTP + */ + public HTTPConnection(String prot, String host, int port) + throws ProtocolNotSuppException + { + this(prot, host, port, null, -1); + } + + + /** + * Constructs a connection to the specified host on the specified port, + * using the specified protocol (currently only "http" is supported), local + * address, and local port. 
+ * + * @param prot the protocol + * @param host the host + * @param port the port, or -1 for the default port + * @param localAddr the local address to bind to + * @param localPort Description of the Parameter + * @exception ProtocolNotSuppException if the protocol is not HTTP + */ + public HTTPConnection(String prot, String host, int port, + InetAddress localAddr, int localPort) + throws ProtocolNotSuppException + { + prot = prot.trim().toLowerCase(); + + //if (!prot.equals("http") && !prot.equals("https")) + if (!prot.equals("http")) + { + throw new ProtocolNotSuppException("Unsupported protocol '" + prot + "'"); + } + + if (prot.equals("http")) + { + Setup(HTTP, host, port, localAddr, localPort); + } + else if (prot.equals("https")) + { + Setup(HTTPS, host, port, localAddr, localPort); + } + else if (prot.equals("shttp")) + { + Setup(SHTTP, host, port, localAddr, localPort); + } + else if (prot.equals("http-ng")) + { + Setup(HTTP_NG, host, port, localAddr, localPort); + } + } + + + /** + * Constructs a connection to the host (port) as given in the url. + * + * @param url the url + * @exception ProtocolNotSuppException if the protocol is not HTTP + */ + public HTTPConnection(URL url) + throws ProtocolNotSuppException + { + this(url.getProtocol(), url.getHost(), url.getPort()); + } + + + /** + * Constructs a connection to the host (port) as given in the uri. + * + * @param uri the uri + * @exception ProtocolNotSuppException if the protocol is not HTTP + */ + public HTTPConnection(URI uri) + throws ProtocolNotSuppException + { + this(uri.getScheme(), uri.getHost(), uri.getPort()); + } + + + /** + * Sets the class variables. Must not be public. 
+ * + * @param prot the protocol + * @param host the host + * @param port the port + * @param localAddr the local address to bind to; if null, it's ignored + * @param localPort the local port to bind to + */ + private void Setup(int prot, String host, int port, InetAddress localAddr, + int localPort) + { + Protocol = prot; + Host = host.trim().toLowerCase(); + Port = port; + LocalAddr = localAddr; + LocalPort = localPort; + + if (Port == -1) + { + Port = URI.defaultPort(getProtocol()); + } + + if (Default_Proxy_Host != null && !matchNonProxy(Host)) + { + setCurrentProxy(Default_Proxy_Host, Default_Proxy_Port); + } + else + { + setCurrentProxy(null, 0); + } + + Socks_client = Default_Socks_client; + Timeout = DefaultTimeout; + ModuleList = (Vector) DefaultModuleList.clone(); + allowUI = defaultAllowUI; + if (noKeepAlives) + { + setDefaultHeaders(new NVPair[]{new NVPair("Connection", "close")}); + } + } + + + /** + * Determines if the given host matches any entry in the non-proxy list. + * + * @param host the host to match - must be trim()'d and lowercase + * @return true if a match is found, false otherwise + * @see #dontProxyFor(java.lang.String) + */ + private boolean matchNonProxy(String host) + { + // Check host name list + + if (non_proxy_host_list.get(host) != null) + { + return true; + } + + // Check domain name list + + for (int idx = 0; idx < non_proxy_dom_list.size(); idx++) + { + if (host.endsWith((String) non_proxy_dom_list.elementAt(idx))) + { + return true; + } + } + + // Check IP-address and subnet list + + if (non_proxy_addr_list.size() == 0) + { + return false; + } + + InetAddress[] host_addr; + try + { + host_addr = InetAddress.getAllByName(host); + } + catch (UnknownHostException uhe) + { + return false; + } + // maybe the proxy has better luck + + for (int idx = 0; idx < non_proxy_addr_list.size(); idx++) + { + byte[] addr = (byte[]) non_proxy_addr_list.elementAt(idx); + byte[] mask = (byte[]) non_proxy_mask_list.elementAt(idx); + + ip_loop : + 
for (int idx2 = 0; idx2 < host_addr.length; idx2++) + { + byte[] raw_addr = host_addr[idx2].getAddress(); + if (raw_addr.length != addr.length) + { + continue; + } + + for (int idx3 = 0; idx3 < raw_addr.length; idx3++) + { + if ((raw_addr[idx3] & mask[idx3]) != (addr[idx3] & mask[idx3])) + { + continue ip_loop; + } + } + return true; + } + } + + return false; + } + + + // Methods + + /** + * Sends the HEAD request. This request is just like the corresponding GET + * except that it only returns the headers and no data. + * + * @param file the absolute path of the file + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see #Get(java.lang.String) + */ + public HTTPResponse Head(String file) + throws IOException, ModuleException + { + return Head(file, (String) null, null); + } + + + /** + * Sends the HEAD request. This request is just like the corresponding GET + * except that it only returns the headers and no data. + * + * @param file the absolute path of the file + * @param form_data an array of Name/Value pairs + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see #Get(java.lang.String, HTTPClient.NVPair[]) + */ + public HTTPResponse Head(String file, NVPair form_data[]) + throws IOException, ModuleException + { + return Head(file, form_data, null); + } + + + /** + * Sends the HEAD request. This request is just like the corresponding GET + * except that it only returns the headers and no data. 
+ * + * @param file the absolute path of the file + * @param form_data an array of Name/Value pairs + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see #Get(java.lang.String, HTTPClient.NVPair[], + * HTTPClient.NVPair[]) + */ + public HTTPResponse Head(String file, NVPair[] form_data, NVPair[] headers) + throws IOException, ModuleException + { + String File = stripRef(file); + String + query = Codecs.nv2query(form_data); + if (query != null && query.length() > 0) + { + File += "?" + query; + } + + return setupRequest("HEAD", File, headers, null, null); + } + + + /** + * Sends the HEAD request. This request is just like the corresponding GET + * except that it only returns the headers and no data. + * + * @param file the absolute path of the file + * @param query the query string; it will be urlencoded + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see #Get(java.lang.String, java.lang.String) + */ + public HTTPResponse Head(String file, String query) + throws IOException, ModuleException + { + return Head(file, query, null); + } + + + /** + * Sends the HEAD request. This request is just like the corresponding GET + * except that it only returns the headers and no data. + * + * @param file the absolute path of the file + * @param query the query string; it will be urlencoded + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. 
+ * @exception IOException Description of the Exception + * @see #Get(java.lang.String, java.lang.String, + * HTTPClient.NVPair[]) + */ + public HTTPResponse Head(String file, String query, NVPair[] headers) + throws IOException, ModuleException + { + String File = stripRef(file); + if (query != null && query.length() > 0) + { + File += "?" + Codecs.URLEncode(query); + } + + return setupRequest("HEAD", File, headers, null, null); + } + + + /** + * GETs the file. + * + * @param file the absolute path of the file + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Get(String file) + throws IOException, ModuleException + { + return Get(file, (String) null, null); + } + + + /** + * GETs the file with a query consisting of the specified form-data. The + * data is urlencoded, turned into a string of the form + * "name1=value1&name2=value2" and then sent as a query string. + * + * @param file the absolute path of the file + * @param form_data an array of Name/Value pairs + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Get(String file, NVPair form_data[]) + throws IOException, ModuleException + { + return Get(file, form_data, null); + } + + + /** + * GETs the file with a query consisting of the specified form-data. The + * data is urlencoded, turned into a string of the form + * "name1=value1&name2=value2" and then sent as a query string. + * + * @param file the absolute path of the file + * @param form_data an array of Name/Value pairs + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. 
+ * @exception IOException Description of the Exception + */ + public HTTPResponse Get(String file, NVPair[] form_data, NVPair[] headers) + throws IOException, ModuleException + { + String File = stripRef(file); + String + query = Codecs.nv2query(form_data); + if (query != null && query.length() > 0) + { + File += "?" + query; + } + + return setupRequest("GET", File, headers, null, null); + } + + + /** + * GETs the file using the specified query string. The query string is first + * urlencoded. + * + * @param file the absolute path of the file + * @param query the query + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Get(String file, String query) + throws IOException, ModuleException + { + return Get(file, query, null); + } + + + /** + * GETs the file using the specified query string. The query string is first + * urlencoded. + * + * @param file the absolute path of the file + * @param query the query string + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Get(String file, String query, NVPair[] headers) + throws IOException, ModuleException + { + String File = stripRef(file); + if (query != null && query.length() > 0) + { + File += "?" + Codecs.URLEncode(query); + } + + return setupRequest("GET", File, headers, null, null); + } + + + /** + * POSTs to the specified file. No data is sent. + * + * @param file the absolute path of the file + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. 
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Post(String file)
+        throws IOException, ModuleException
+    {
+        return Post(file, (byte[]) null, null);
+    }
+
+
+    /**
+     * POSTs form-data to the specified file. The data is first urlencoded and
+     * then turned into a string of the form "name1=value1&name2=value2". A
+     * Content-type header with the value
+     * application/x-www-form-urlencoded is added.
+     *
+     * @param file                 the absolute path of the file
+     * @param form_data            an array of Name/Value pairs
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Post(String file, NVPair form_data[])
+        throws IOException, ModuleException
+    {
+        NVPair[] headers =
+                {new NVPair("Content-type", "application/x-www-form-urlencoded")};
+
+        return Post(file, Codecs.nv2query(form_data), headers);
+    }
+
+
+    /**
+     * POST's form-data to the specified file using the specified headers. The
+     * data is first urlencoded and then turned into a string of the form
+     * "name1=value1&name2=value2". If no Content-type header is
+     * given then one is added with a value of
+     * application/x-www-form-urlencoded .
+     *
+     * @param file                 the absolute path of the file
+     * @param form_data            an array of Name/Value pairs
+     * @param headers              additional headers
+     * @return                     a HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+ * @exception IOException Description of the Exception + */ + public HTTPResponse Post(String file, NVPair form_data[], NVPair headers[]) + throws IOException, ModuleException + { + int idx; + for (idx = 0; idx < headers.length; idx++) + { + if (headers[idx].getName().equalsIgnoreCase("Content-type")) + { + break; + } + } + if (idx == headers.length) + { + headers = Util.resizeArray(headers, idx + 1); + headers[idx] = + new NVPair("Content-type", "application/x-www-form-urlencoded"); + } + + return Post(file, Codecs.nv2query(form_data), headers); + } + + + /** + * POSTs the data to the specified file. The data is converted to an array + * of bytes using the default character converter. The request is sent using + * the content-type "application/octet-stream". + * + * @param file the absolute path of the file + * @param data the data + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see java.lang.String#getBytes() + */ + public HTTPResponse Post(String file, String data) + throws IOException, ModuleException + { + return Post(file, data, null); + } + + + /** + * POSTs the data to the specified file using the specified headers. + * + * @param file the absolute path of the file + * @param data the data + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see java.lang.String#getBytes() + */ + public HTTPResponse Post(String file, String data, NVPair[] headers) + throws IOException, ModuleException + { + byte tmp[] = null; + + if (data != null && data.length() > 0) + { + tmp = data.getBytes(); + } + + return Post(file, tmp, headers); + } + + + /** + * POSTs the raw data to the specified file. 
The request is sent using the + * content-type "application/octet-stream" + * + * @param file the absolute path of the file + * @param data the data + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Post(String file, byte data[]) + throws IOException, ModuleException + { + return Post(file, data, null); + } + + + /** + * POSTs the raw data to the specified file using the specified headers. + * + * @param file the absolute path of the file + * @param data the data + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Post(String file, byte data[], NVPair[] headers) + throws IOException, ModuleException + { + if (data == null) + { + data = new byte[0]; + } + // POST must always have a CL + return setupRequest("POST", stripRef(file), headers, data, null); + } + + + /** + * POSTs the data written to the output stream to the specified file. The + * request is sent using the content-type "application/octet-stream" + * + * @param file the absolute path of the file + * @param stream the output stream on which the data is + * written + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Post(String file, HttpOutputStream stream) + throws IOException, ModuleException + { + return Post(file, stream, null); + } + + + /** + * POSTs the data written to the output stream to the specified file using + * the specified headers. 
+ * + * @param file the absolute path of the file + * @param stream the output stream on which the data is + * written + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Post(String file, HttpOutputStream stream, + NVPair[] headers) + throws IOException, ModuleException + { + return setupRequest("POST", stripRef(file), headers, null, stream); + } + + + /** + * PUTs the data into the specified file. The data is converted to an array + * of bytes using the default character converter. The request ist sent + * using the content-type "application/octet-stream". + * + * @param file the absolute path of the file + * @param data the data + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see java.lang.String#getBytes() + */ + public HTTPResponse Put(String file, String data) + throws IOException, ModuleException + { + return Put(file, data, null); + } + + + /** + * PUTs the data into the specified file using the additional headers for + * the request. + * + * @param file the absolute path of the file + * @param data the data + * @param headers additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + * @see java.lang.String#getBytes() + */ + public HTTPResponse Put(String file, String data, NVPair[] headers) + throws IOException, ModuleException + { + byte tmp[] = null; + + if (data != null && data.length() > 0) + { + tmp = data.getBytes(); + } + + return Put(file, tmp, headers); + } + + + /** + * PUTs the raw data into the specified file. 
The request is sent using the + * content-type "application/octet-stream". + * + * @param file the absolute path of the file + * @param data the data + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Put(String file, byte data[]) + throws IOException, ModuleException + { + return Put(file, data, null); + } + + + /** + * PUTs the raw data into the specified file using the additional headers. + * + * @param file the absolute path of the file + * @param data the data + * @param headers any additional headers + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Put(String file, byte data[], NVPair[] headers) + throws IOException, ModuleException + { + if (data == null) + { + data = new byte[0]; + } + // PUT must always have a CL + return setupRequest("PUT", stripRef(file), headers, data, null); + } + + + /** + * PUTs the data written to the output stream into the specified file. The + * request is sent using the content-type "application/octet-stream". + * + * @param file the absolute path of the file + * @param stream the output stream on which the data is + * written + * @return an HTTPResponse structure containing the + * response + * @exception ModuleException if an exception is encountered in any module. + * @exception IOException Description of the Exception + */ + public HTTPResponse Put(String file, HttpOutputStream stream) + throws IOException, ModuleException + { + return Put(file, stream, null); + } + + + /** + * PUTs the data written to the output stream into the specified file using + * the additional headers. 
+     *
+     * @param file                 the absolute path of the file
+     * @param stream               the output stream on which the data is
+     *      written
+     * @param headers              any additional headers
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Put(String file, HttpOutputStream stream,
+                            NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        return setupRequest("PUT", stripRef(file), headers, null, stream);
+    }
+
+
+    /**
+     * Request OPTIONS from the server. If {@code file} is "*" then the
+     * request applies to the server as a whole; otherwise it applies only to
+     * that resource.
+     *
+     * @param file                 the absolute path of the resource, or "*"
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Options(String file)
+        throws IOException, ModuleException
+    {
+        return Options(file, null, (byte[]) null);
+    }
+
+
+    /**
+     * Request OPTIONS from the server. If {@code file} is "*" then the
+     * request applies to the server as a whole; otherwise it applies only to
+     * that resource.
+     *
+     * @param file                 the absolute path of the resource, or "*"
+     * @param headers              the headers containing optional info.
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Options(String file, NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        return Options(file, headers, (byte[]) null);
+    }
+
+
+    /**
+     * Request OPTIONS from the server. If file is "*" then the
+     * request applies to the server as a whole; otherwise it applies only to
+     * that resource.
+     *
+     * @param file                 the absolute path of the resource, or "*"
+     * @param headers              the headers containing optional info.
+     * @param data                 any data to be sent in the optional body
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Options(String file, NVPair[] headers, byte[] data)
+        throws IOException, ModuleException
+    {
+        return setupRequest("OPTIONS", stripRef(file), headers, data, null);
+    }
+
+
+    /**
+     * Request OPTIONS from the server. If {@code file} is "*" then the
+     * request applies to the server as a whole; otherwise it applies only to
+     * that resource.
+     *
+     * @param file                 the absolute path of the resource, or "*"
+     * @param headers              the headers containing optional info.
+     * @param stream               an output stream for sending the optional
+     *      body
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Options(String file, NVPair[] headers,
+                                HttpOutputStream stream)
+        throws IOException, ModuleException
+    {
+        return setupRequest("OPTIONS", stripRef(file), headers, null, stream);
+    }
+
+
+    /**
+     * Requests that {@code file} be DELETEd from the server.
+     *
+     * @param file                 the absolute path of the resource
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Delete(String file)
+        throws IOException, ModuleException
+    {
+        return Delete(file, null);
+    }
+
+
+    /**
+     * Requests that file be DELETEd from the server.
+     *
+     * @param file                 the absolute path of the resource
+     * @param headers              additional headers
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Delete(String file, NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        return setupRequest("DELETE", stripRef(file), headers, null, null);
+    }
+
+
+    /**
+     * Requests a TRACE. Headers of particular interest here are "Via" and
+     * "Max-Forwards".
+     *
+     * @param file                 the absolute path of the resource
+     * @param headers              additional headers
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Trace(String file, NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        return setupRequest("TRACE", stripRef(file), headers, null, null);
+    }
+
+
+    /**
+     * Requests a TRACE.
+     *
+     * @param file                 the absolute path of the resource
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse Trace(String file)
+        throws IOException, ModuleException
+    {
+        return Trace(file, null);
+    }
+
+
+    /**
+     * This is here to allow an arbitrary, non-standard request to be sent. I'm
+     * assuming you know what you are doing...
+     *
+     * @param method               the extension method
+     * @param file                 the absolute path of the resource, or null
+     * @param data                 optional data, or null
+     * @param headers              optional headers, or null
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse ExtensionMethod(String method, String file,
+                                        byte[] data, NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        // the method name is trimmed, so it must not be null
+        return setupRequest(method.trim(), stripRef(file), headers, data, null);
+    }
+
+
+    /**
+     * This is here to allow an arbitrary, non-standard request to be sent. I'm
+     * assuming you know what you are doing...
+     *
+     * @param method               the extension method
+     * @param file                 the absolute path of the resource, or null
+     * @param headers              optional headers, or null
+     * @param os                   the output stream handed on to the request
+     *      in place of a data array
+     * @return                     an HTTPResponse structure containing the
+     *      response
+     * @exception ModuleException  if an exception is encountered in any module.
+     * @exception IOException      if an I/O error occurs
+     */
+    public HTTPResponse ExtensionMethod(String method, String file,
+                                        HttpOutputStream os, NVPair[] headers)
+        throws IOException, ModuleException
+    {
+        // the method name is trimmed, so it must not be null
+        return setupRequest(method.trim(), stripRef(file), headers, null, os);
+    }
+
+
+    /**
+     * Aborts all the requests currently in progress on this connection and
+     * closes all associated sockets. You usually do not need to invoke
+     * this - it only meant for when you need to abruptly stop things, such as
+     * for example the stop button in a browser.

+ * + * Note: there is a small window where a request method such as Get() + * may have been invoked but the request has not been built and added to the + * list. Any request in this window will not be aborted. + * + * @since V0.2-3 + */ + public void stop() + { + for (Request req = (Request) RequestList.enumerate(); req != null; + req = (Request) RequestList.next()) + { + req.aborted = true; + } + + for (StreamDemultiplexor demux = + (StreamDemultiplexor) DemuxList.enumerate(); + demux != null; demux = (StreamDemultiplexor) DemuxList.next()) + { + demux.abort(); + } + } + + + /** + * Sets the default http headers to be sent with each request. The actual + * headers sent are determined as follows: for each header specified in + * multiple places a value given as part of the request takes priority over + * any default values set by this method, which in turn takes priority over + * any built-in default values. A different way of looking at it is that we + * start off with a list of all headers specified with the request, then add + * any default headers set by this method which aren't already in our list, + * and finally add any built-in headers which aren't yet in the list. There + * is one exception to this rule: the "Content-length" header is always + * ignored; and when posting form-data any default "Content-type" is ignored + * in favor of the built-in "application/x-www-form-urlencoded" (however it + * will be overriden by any content-type header specified as part of the + * request).

+ * + * Typical headers you might want to set here are "Accept" and its + * "Accept-*" relatives, "Connection", "From", "User-Agent", etc. + * + * @param headers an array of header-name/value pairs (do not give the + * separating ':'). + */ + public void setDefaultHeaders(NVPair[] headers) + { + int length = (headers == null ? 0 : headers.length); + NVPair[] def_hdrs = new NVPair[length]; + + // weed out undesired headers + int sidx; + + // weed out undesired headers + int didx; + for (sidx = 0, didx = 0; sidx < length; sidx++) + { + if (headers[sidx] == null) + { + continue; + } + + String name = headers[sidx].getName().trim(); + if (name.equalsIgnoreCase("Content-length")) + { + continue; + } + + def_hdrs[didx++] = headers[sidx]; + } + + if (didx < length) + { + def_hdrs = Util.resizeArray(def_hdrs, didx); + } + + synchronized (DefaultHeaders) + { + DefaultHeaders = def_hdrs; + } + } + + + /** + * Gets the current list of default http headers. + * + * @return an array of header/value pairs. + */ + public NVPair[] getDefaultHeaders() + { + synchronized (DefaultHeaders) + { + return (NVPair[]) DefaultHeaders.clone(); + } + } + + + /** + * Returns the protocol this connection is talking. + * + * @return a string containing the (lowercased) protocol + */ + public String getProtocol() + { + switch (Protocol) + { + case HTTP: + return "http"; + case HTTPS: + return "https"; + case SHTTP: + return "shttp"; + case HTTP_NG: + return "http-ng"; + default: + throw new Error("HTTPClient Internal Error: invalid protocol " + + Protocol); + } + } + + + /** + * Returns the host this connection is talking to. + * + * @return a string containing the (lowercased) host name. + */ + public String getHost() + { + return Host; + } + + + /** + * Returns the port this connection connects to. This is always the actual + * port number, never -1. 
+ * + * @return the port number + */ + public int getPort() + { + return Port; + } + + + /** + * Returns the host of the proxy this connection is using. + * + * @return a string containing the (lowercased) host name. + */ + public String getProxyHost() + { + return Proxy_Host; + } + + + /** + * Returns the port of the proxy this connection is using. + * + * @return the port number + */ + public int getProxyPort() + { + return Proxy_Port; + } + + + /** + * See if the given uri is compatible with this connection. Compatible means + * that the given uri can be retrieved using this connection object. + * + * @param uri the URI to check + * @return true if they're compatible, false otherwise + * @since V0.3-2 + */ + public boolean isCompatibleWith(URI uri) + { + if (!uri.getScheme().equals(getProtocol()) || + !uri.getHost().equalsIgnoreCase(Host)) + { + return false; + } + + int port = uri.getPort(); + if (port == -1) + { + port = URI.defaultPort(uri.getScheme()); + } + return port == Port; + } + + + /** + * Sets/Resets raw mode. In raw mode all modules are bypassed, meaning the + * automatic handling of authorization requests, redirections, cookies, etc. + * is turned off.

+ * + * The default is false. + * + * @param raw if true removes all modules (except for the retry module) + * @deprecated This is not really needed anymore; in V0.2 request were + * synchronous and therefore to do pipelining you needed to disable the + * processing of responses. + * @see #removeModule(java.lang.Class) + */ + public void setRawMode(boolean raw) + { + // Don't remove the retry module + String[] modules = {"HTTPClient.CookieModule", + "HTTPClient.RedirectionModule", + "HTTPClient.AuthorizationModule", + "HTTPClient.DefaultModule", + "HTTPClient.TransferEncodingModule", + "HTTPClient.ContentMD5Module", + "HTTPClient.ContentEncodingModule"}; + + for (int idx = 0; idx < modules.length; idx++) + { + try + { + if (raw) + { + removeModule(Class.forName(modules[idx])); + } + else + { + addModule(Class.forName(modules[idx]), -1); + } + } + catch (ClassNotFoundException cnfe) + { + } + } + } + + + /** + * Sets the default timeout value to be used for each new HTTPConnection. + * The default is 0. + * + * @param time the timeout in milliseconds. + * @see #setTimeout(int) + */ + public static void setDefaultTimeout(int time) + { + DefaultTimeout = time; + } + + + /** + * Gets the default timeout value to be used for each new HTTPConnection. + * + * @return the timeout in milliseconds. + * @see #setTimeout(int) + */ + public static int getDefaultTimeout() + { + return DefaultTimeout; + } + + + /** + * Sets the timeout to be used for creating connections and reading + * responses. When a timeout expires the operation will throw an + * InterruptedIOException. The operation may be restarted again afterwards. + * If the operation is not restarted and it is a read operation (i.e + * HTTPResponse.xxxx()) then resp.getInputStream().close() + * should be invoked.

+ * + * When creating new sockets the timeout will limit the time spent doing the + * host name translation and establishing the connection with the server. + *

+ * + * The timeout also influences the reading of the response headers. However, + * it does not specify a how long, for example, getStatusCode() may take, as + * might be assumed. Instead it specifies how long a read on the socket may + * take. If the response dribbles in slowly with packets arriving quicker + * than the timeout then the method will complete normally. I.e. the + * exception is only thrown if nothing arrives on the socket for the + * specified time. Furthermore, the timeout only influences the reading of + * the headers, not the reading of the body.

+     *
+     * Read Timeouts are associated with responses, so that you may change this
+     * value before each request and it won't affect the reading of responses to
+     * previous requests.
+     *
+     * @param time  the time in milliseconds. A time of 0 means wait
+     *      indefinitely.
+     * @see         #stop()
+     */
+    public void setTimeout(int time)
+    {
+        Timeout = time;
+    }
+
+
+    /**
+     * Gets the timeout used for reading response data.
+     *
+     * @return   the current timeout value
+     * @see      #setTimeout(int)
+     */
+    public int getTimeout()
+    {
+        return Timeout;
+    }
+
+
+    /**
+     * Controls whether modules are allowed to prompt the user or pop up dialogs
+     * if necessary.
+     *
+     * @param allow  if true allows modules to interact with user.
+     */
+    public void setAllowUserInteraction(boolean allow)
+    {
+        allowUI = allow;
+    }
+
+
+    /**
+     * Returns whether modules are allowed to prompt or popup dialogs if
+     * necessary.
+     *
+     * @return   true if modules are allowed to interact with user.
+     */
+    public boolean getAllowUserInteraction()
+    {
+        return allowUI;
+    }
+
+
+    /**
+     * Sets the default allow-user-action.
+     *
+     * @param allow  if true allows modules to interact with user.
+     */
+    public static void setDefaultAllowUserInteraction(boolean allow)
+    {
+        defaultAllowUI = allow;
+    }
+
+
+    /**
+     * Gets the default allow-user-action.
+     *
+     * @return   true if modules are allowed to interact with user.
+     */
+    public static boolean getDefaultAllowUserInteraction()
+    {
+        return defaultAllowUI;
+    }
+
+
+    /**
+     * Returns the default list of modules.
+     *
+     * @return   an array of classes
+     */
+    public static Class[] getDefaultModules()
+    {
+        return getModules(DefaultModuleList);
+    }
+
+
+    /**
+     * Adds a module to the default list. It must implement the
+     * HTTPClientModule interface. If the module is already in the list
+     * then this method does nothing. This method only affects instances of
+     * HTTPConnection created after this method has been invoked; it does not
+     * affect existing instances.

+ * + * Example:

+     * HTTPConnection.addDefaultModule(Class.forName("HTTPClient.CookieModule"), 1);
+     * 
adds the cookie module as the second module in the list.

+     *
+     * The default list is created at class initialization time from the
+     * property HTTPClient.Modules . This must contain a "|"
+     * separated list of classes in the order they're to be invoked. If this
+     * property is not set it defaults to: "HTTPClient.RetryModule |
+     * HTTPClient.CookieModule | HTTPClient.RedirectionModule |
+     * HTTPClient.AuthorizationModule | HTTPClient.DefaultModule |
+     * HTTPClient.TransferEncodingModule | HTTPClient.ContentMD5Module |
+     * HTTPClient.ContentEncodingModule"
+     *
+     * @param module  the module's Class object
+     * @param pos     the position of this module in the list; if pos
+     *      >= 0 then this is the absolute position in the list (0 is the first
+     *      position); if pos < 0 then this is the position relative
+     *      to the end of the list (-1 means the last element, -2 the second to
+     *      last element, etc).
+     * @return        true if module was successfully added; false if the module
+     *      is already in the list.
+     * @see           HTTPClientModule
+     */
+    public static boolean addDefaultModule(Class module, int pos)
+    {
+        return addModule(DefaultModuleList, module, pos);
+    }
+
+
+    /**
+     * Removes a module from the default list. If the module is not in the list
+     * it does nothing. This method only affects instances of HTTPConnection
+     * created after this method has been invoked; it does not affect existing
+     * instances.
+     *
+     * @param module  the module's Class object
+     * @return        true if module was successfully removed; false otherwise
+     */
+    public static boolean removeDefaultModule(Class module)
+    {
+        return removeModule(DefaultModuleList, module);
+    }
+
+
+    /**
+     * Returns the list of modules used currently.
+     *
+     * @return   an array of classes
+     */
+    public Class[] getModules()
+    {
+        return getModules(ModuleList);
+    }
+
+
+    /**
+     * Adds a module to the current list. It must implement the
+     * HTTPClientModule interface. If the module is already in the list
+     * then this method does nothing.
+ * + * @param module the module's Class object + * @param pos the position of this module in the list; if pos + * >= 0 then this is the absolute position in the list (0 is the first + * position); if pos < 0 then this is the position relative + * to the end of the list (-1 means the last element, -2 the second to + * last element, etc). + * @return true if module was successfully added; false if the module + * is already in the list. + * @see HTTPClientModule + */ + public boolean addModule(Class module, int pos) + { + return addModule(ModuleList, module, pos); + } + + + /** + * Removes a module from the current list. If the module is not in the list + * it does nothing. + * + * @param module the module's Class object + * @return true if module was successfully removed; false otherwise + */ + public boolean removeModule(Class module) + { + return removeModule(ModuleList, module); + } + + + /** + * Gets the modules attribute of the HTTPConnection class + * + * @param list Description of the Parameter + * @return The modules value + */ + private final static Class[] getModules(Vector list) + { + synchronized (list) + { + Class[] modules = new Class[list.size()]; + list.copyInto(modules); + return modules; + } + } + + + /** + * Adds a feature to the Module attribute of the HTTPConnection class + * + * @param list The feature to be added to the Module attribute + * @param module The feature to be added to the Module attribute + * @param pos The feature to be added to the Module attribute + * @return Description of the Return Value + */ + private final static boolean addModule(Vector list, Class module, int pos) + { + if (module == null) + { + return false; + } + + // check if module implements HTTPClientModule + try + { + HTTPClientModule tmp = (HTTPClientModule) module.newInstance(); + } + catch (RuntimeException re) + { + throw re; + } + catch (Exception e) + { + throw new RuntimeException(e.toString()); + } + + synchronized (list) + { + // check if module already 
in list + if (list.contains(module)) + { + return false; + } + + // add module to list + if (pos < 0) + { + list.insertElementAt(module, DefaultModuleList.size() + pos + 1); + } + else + { + list.insertElementAt(module, pos); + } + } + + Log.write(Log.CONN, "Conn: Added module " + module.getName() + + " to " + + ((list == DefaultModuleList) ? "default " : "") + + "list"); + + return true; + } + + + /** + * Description of the Method + * + * @param list Description of the Parameter + * @param module Description of the Parameter + * @return Description of the Return Value + */ + private final static boolean removeModule(Vector list, Class module) + { + if (module == null) + { + return false; + } + + boolean removed = list.removeElement(module); + if (removed) + { + Log.write(Log.CONN, "Conn: Removed module " + module.getName() + + " from " + + ((list == DefaultModuleList) ? "default " : "") + + "list"); + } + + return removed; + } + + + /** + * Sets the current context. The context is used by modules such as the + * AuthorizationModule and the CookieModule which keep lists of info that is + * normally shared between all instances of HTTPConnection. This is usually + * the desired behaviour. However, in some cases one would like to simulate + * multiple independent clients within the same application and hence the + * sharing of such info should be restricted. This is where the context + * comes in. Modules will only share their info between requests using the + * same context (i.e. they keep multiple lists, one for each context).

+ * + * The context may be any object. Contexts are considered equal if equals() + * returns true. Examples of useful context objects are threads (e.g. if you + * are running multiple clients, one per thread) and sockets (e.g. if you + * are implementing a gateway).

+ * + * When a new HTTPConnection is created it is initialized with a default + * context which is the same for all instances. This method must be invoked + * immediately after a new HTTPConnection is created and before any request + * method is invoked. Furthermore, this method may only be called once (i.e. + * the context is "sticky"). + * + * @param context the new context; must be non-null + */ + public void setContext(Object context) + { + if (context == null) + { + throw new IllegalArgumentException("Context must be non-null"); + } + if (Context != null) + { + throw new IllegalStateException("Context already set"); + } + + Context = context; + } + + + /** + * Returns the current context. + * + * @return the current context, or the default context if setContext() + * hasn't been invoked + * @see #setContext(java.lang.Object) + */ + public Object getContext() + { + if (Context != null) + { + return Context; + } + else + { + return dflt_context; + } + } + + + /** + * Returns the default context. + * + * @return the default context + * @see #setContext(java.lang.Object) + */ + public static Object getDefaultContext() + { + return dflt_context; + } + + + /** + * Adds an authorization entry for the "digest" authorization scheme to the + * list. If an entry already exists for the "digest" scheme and the + * specified realm then it is overwritten.

+ * + * This is a convenience method and just invokes the corresponding method in + * AuthorizationInfo. + * + * @param realm the realm + * @param user the username + * @param passwd The feature to be added to the DigestAuthorization + * attribute + * @see AuthorizationInfo#addDigestAuthorization(java.lang.String, + * int, java.lang.String, java.lang.String, java.lang.String) + */ + public void addDigestAuthorization(String realm, String user, String passwd) + { + AuthorizationInfo.addDigestAuthorization(Host, Port, realm, user, + passwd, getContext()); + } + + + /** + * Adds an authorization entry for the "basic" authorization scheme to the + * list. If an entry already exists for the "basic" scheme and the specified + * realm then it is overwritten.

+ * + * This is a convenience method and just invokes the corresponding method in + * AuthorizationInfo. + * + * @param realm the realm + * @param user the username + * @param passwd The feature to be added to the BasicAuthorization + * attribute + * @see AuthorizationInfo#addBasicAuthorization(java.lang.String, + * int, java.lang.String, java.lang.String, java.lang.String) + */ + public void addBasicAuthorization(String realm, String user, String passwd) + { + AuthorizationInfo.addBasicAuthorization(Host, Port, realm, user, + passwd, getContext()); + } + + + /** + * Sets the default proxy server to use. The proxy will only be used for new + * HTTPConnection s created after this call and will not affect + * currrent instances of HTTPConnection . A null or empty string + * host parameter disables the proxy.

+ * + * In an application or using the Appletviewer an alternative to this method + * is to set the following properties (either in the properties file or on + * the command line): http.proxyHost and http.proxyPort + * . Whether http.proxyHost is set or not determines + * whether a proxy server is used.

+ * + * If the proxy server requires authorization and you wish to set this + * authorization information in the code, then you may use any of the + * AuthorizationInfo.addXXXAuthorization() methods to do so. Specify + * the same host and port as in this method. If you + * have not given any authorization info and the proxy server requires + * authorization then you will be prompted for the necessary info via a + * popup the first time you do a request. + * + * @param host the host on which the proxy server resides. + * @param port the port the proxy server is listening on. + * @see #setCurrentProxy(java.lang.String, int) + */ + public static void setProxyServer(String host, int port) + { + if (host == null || host.trim().length() == 0) + { + Default_Proxy_Host = null; + } + else + { + Default_Proxy_Host = host.trim().toLowerCase(); + Default_Proxy_Port = port; + } + } + + + /** + * Sets the proxy used by this instance. This can be used to override the + * proxy setting inherited from the default proxy setting. A null or empty + * string host parameter disables the proxy.

+ * + * Note that if you set a proxy for the connection using this method, and a + * request made over this connection is redirected to a different server, + * then the connection used for new server will not pick this proxy + * setting, but instead will use the default proxy settings. + * + * @param host the host the proxy runs on + * @param port the port the proxy is listening on + * @see #setProxyServer(java.lang.String, int) + */ + public synchronized void setCurrentProxy(String host, int port) + { + if (host == null || host.trim().length() == 0) + { + Proxy_Host = null; + } + else + { + Proxy_Host = host.trim().toLowerCase(); + if (port <= 0) + { + Proxy_Port = 80; + } + else + { + Proxy_Port = port; + } + } + + // the proxy might be talking a different version, so renegotiate + switch (Protocol) + { + case HTTP: + case HTTPS: + if (force_1_0) + { + ServerProtocolVersion = HTTP_1_0; + ServProtVersKnown = true; + RequestProtocolVersion = "HTTP/1.0"; + } + else + { + ServerProtocolVersion = HTTP_1_1; + ServProtVersKnown = false; + RequestProtocolVersion = "HTTP/1.1"; + } + break; + case HTTP_NG: + ServerProtocolVersion = -1; + /* + * Unknown + */ + ServProtVersKnown = false; + RequestProtocolVersion = ""; + break; + case SHTTP: + ServerProtocolVersion = -1; + /* + * Unknown + */ + ServProtVersKnown = false; + RequestProtocolVersion = "Secure-HTTP/1.3"; + break; + default: + throw new Error("HTTPClient Internal Error: invalid protocol " + + Protocol); + } + + keepAliveUnknown = true; + doesKeepAlive = false; + + input_demux = null; + early_stall = null; + late_stall = null; + prev_resp = null; + } + + + /** + * Add host to the list of hosts which should be accessed + * directly, not via any proxy set by setProxyServer().

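     *
     * The host may be any of:
     * <ul>
     *   <li>a complete host name (e.g. "www.disney.com")</li>
     *   <li>a domain name; domain names must begin with a dot (e.g.
     *       ".disney.com")</li>
     *   <li>an IP-address (e.g. "12.34.56.78")</li>
     *   <li>an IP-subnet, given as an IP-address and a netmask of the same
     *       length separated by a "/" (e.g. "12.34.56.0/255.255.255.192")</li>
     * </ul>
     *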
+ * + * The two properties HTTPClient.nonProxyHosts and + * http.nonProxyHosts are used when this class is loaded to initialize + * the list of non-proxy hosts. The second property is only read if the + * first one is not set; the second property is also used the JDK's + * URLConnection. These properties must contain a "|" separated list of + * entries which conform to the above rules for the host + * parameter (e.g. "11.22.33.44|.disney.com"). + * + * @param host a host name, domain name, IP-address or + * IP-subnet. + * @exception ParseException if the length of the netmask does not match + * the length of the IP-address + */ + public static void dontProxyFor(String host) + throws ParseException + { + host = host.trim().toLowerCase(); + + // check for domain name + + if (host.charAt(0) == '.') + { + if (!non_proxy_dom_list.contains(host)) + { + non_proxy_dom_list.addElement(host); + } + return; + } + + // check for host name + + for (int idx = 0; idx < host.length(); idx++) + { + if (!Character.isDigit(host.charAt(idx)) && + host.charAt(idx) != '.' 
&& host.charAt(idx) != '/') + { + non_proxy_host_list.put(host, ""); + return; + } + } + + // must be an IP-address + + byte[] ip_addr; + byte[] ip_mask; + int slash; + if ((slash = host.indexOf('/')) != -1) + { + // IP subnet + + ip_addr = string2arr(host.substring(0, slash)); + ip_mask = string2arr(host.substring(slash + 1)); + if (ip_addr.length != ip_mask.length) + { + throw new ParseException("length of IP-address (" + + ip_addr.length + ") != length of netmask (" + + ip_mask.length + ")"); + } + } + else + { + ip_addr = string2arr(host); + ip_mask = new byte[ip_addr.length]; + for (int idx = 0; idx < ip_mask.length; idx++) + { + ip_mask[idx] = (byte) 255; + } + } + + // check if addr or subnet already exists + + ip_loop : + for (int idx = 0; idx < non_proxy_addr_list.size(); idx++) + { + byte[] addr = (byte[]) non_proxy_addr_list.elementAt(idx); + byte[] mask = (byte[]) non_proxy_mask_list.elementAt(idx); + if (addr.length != ip_addr.length) + { + continue; + } + + for (int idx2 = 0; idx2 < addr.length; idx2++) + { + if ((ip_addr[idx2] & mask[idx2]) != (addr[idx2] & mask[idx2]) || + (mask[idx2] != ip_mask[idx2])) + { + continue ip_loop; + } + } + + return; + // already exists + } + non_proxy_addr_list.addElement(ip_addr); + non_proxy_mask_list.addElement(ip_mask); + } + + + /** + * Convenience method to add a number of hosts at once. If any one host is + * null or cannot be parsed it is ignored. + * + * @param hosts The list of hosts to set + * @see #dontProxyFor(java.lang.String) + * @since V0.3-2 + */ + public static void dontProxyFor(String[] hosts) + { + if (hosts == null || hosts.length == 0) + { + return; + } + + for (int idx = 0; idx < hosts.length; idx++) + { + try + { + if (hosts[idx] != null) + { + dontProxyFor(hosts[idx]); + } + } + catch (ParseException pe) + { + // ignore it + } + } + } + + + /** + * Remove host from the list of hosts for which the proxy should + * not be used. This modifies the same list that dontProxyFor() + * uses, i.e. 
this is used to undo a dontProxyFor() setting. + * The syntax for host is specified in dontProxyFor() + * . + * + * @param host a host name, domain name, IP-address or + * IP-subnet. + * @return true if the remove was sucessful, false + * otherwise + * @exception ParseException if the length of the netmask does not match + * the length of the IP-address + * @see #dontProxyFor(java.lang.String) + */ + public static boolean doProxyFor(String host) + throws ParseException + { + host = host.trim().toLowerCase(); + + // check for domain name + + if (host.charAt(0) == '.') + { + return non_proxy_dom_list.removeElement(host); + } + + // check for host name + + for (int idx = 0; idx < host.length(); idx++) + { + if (!Character.isDigit(host.charAt(idx)) && + host.charAt(idx) != '.' && host.charAt(idx) != '/') + { + return (non_proxy_host_list.remove(host) != null); + } + } + + // must be an IP-address + + byte[] ip_addr; + byte[] ip_mask; + int slash; + if ((slash = host.indexOf('/')) != -1) + { + // IP subnet + + ip_addr = string2arr(host.substring(0, slash)); + ip_mask = string2arr(host.substring(slash + 1)); + if (ip_addr.length != ip_mask.length) + { + throw new ParseException("length of IP-address (" + + ip_addr.length + ") != length of netmask (" + + ip_mask.length + ")"); + } + } + else + { + ip_addr = string2arr(host); + ip_mask = new byte[ip_addr.length]; + for (int idx = 0; idx < ip_mask.length; idx++) + { + ip_mask[idx] = (byte) 255; + } + } + + ip_loop : + for (int idx = 0; idx < non_proxy_addr_list.size(); idx++) + { + byte[] addr = (byte[]) non_proxy_addr_list.elementAt(idx); + byte[] mask = (byte[]) non_proxy_mask_list.elementAt(idx); + if (addr.length != ip_addr.length) + { + continue; + } + + for (int idx2 = 0; idx2 < addr.length; idx2++) + { + if ((ip_addr[idx2] & mask[idx2]) != (addr[idx2] & mask[idx2]) || + (mask[idx2] != ip_mask[idx2])) + { + continue ip_loop; + } + } + + non_proxy_addr_list.removeElementAt(idx); + 
non_proxy_mask_list.removeElementAt(idx); + return true; + } + return false; + } + + + /** + * Turn an IP-address string into an array (e.g. "12.34.56.78" into { 12, + * 34, 56, 78 }). + * + * @param ip IP-address + * @return IP-address in network byte order + */ + private static byte[] string2arr(String ip) + { + byte[] arr; + char[] ip_char = new char[ip.length()]; + ip.getChars(0, ip_char.length, ip_char, 0); + + int cnt = 0; + for (int idx = 0; idx < ip_char.length; idx++) + { + if (ip_char[idx] == '.') + { + cnt++; + } + } + arr = new byte[cnt + 1]; + + cnt = 0; + int pos = 0; + for (int idx = 0; idx < ip_char.length; idx++) + { + if (ip_char[idx] == '.') + { + arr[cnt] = (byte) Integer.parseInt(ip.substring(pos, idx)); + cnt++; + pos = idx + 1; + } + } + arr[cnt] = (byte) Integer.parseInt(ip.substring(pos)); + + return arr; + } + + + /** + * Sets the SOCKS server to use. The server will only be used for new + * HTTPConnections created after this call and will not affect currrent + * instances of HTTPConnection. A null or empty string host parameter + * disables SOCKS.

+ * + * The code will try to determine the SOCKS version to use at connection + * time. This might fail for a number of reasons, however, in which case you + * must specify the version explicitly. + * + * @param host the host on which the proxy server resides. The port used is + * the default port 1080. + * @see #setSocksServer(java.lang.String, int, int) + */ + public static void setSocksServer(String host) + { + setSocksServer(host, 1080); + } + + + /** + * Sets the SOCKS server to use. The server will only be used for new + * HTTPConnections created after this call and will not affect currrent + * instances of HTTPConnection. A null or empty string host parameter + * disables SOCKS.

+ * + * The code will try to determine the SOCKS version to use at connection + * time. This might fail for a number of reasons, however, in which case you + * must specify the version explicitly. + * + * @param host the host on which the proxy server resides. + * @param port the port the proxy server is listening on. + * @see #setSocksServer(java.lang.String, int, int) + */ + public static void setSocksServer(String host, int port) + { + if (port <= 0) + { + port = 1080; + } + + if (host == null || host.length() == 0) + { + Default_Socks_client = null; + } + else + { + Default_Socks_client = new SocksClient(host, port); + } + } + + + /** + * Sets the SOCKS server to use. The server will only be used for new + * HTTPConnections created after this call and will not affect currrent + * instances of HTTPConnection. A null or empty string host parameter + * disables SOCKS.

+ * + * In an application or using the Appletviewer an alternative to this method + * is to set the following properties (either in the properties file or on + * the command line): HTTPClient.socksHost , + * HTTPClient.socksPort and HTTPClient.socksVersion . + * Whether HTTPClient.socksHost is set or not determines whether + * a SOCKS server is used; if HTTPClient.socksPort is not set it + * defaults to 1080; if HTTPClient.socksVersion is not set an + * attempt will be made to automatically determine the version used by the + * server.

+ * + * Note: If you have also set a proxy server then a connection will be made + * to the SOCKS server, which in turn then makes a connection to the proxy + * server (possibly via other SOCKS servers), which in turn makes the final + * connection.

+ * + * If the proxy server is running SOCKS version 5 and requires + * username/password authorization, and you wish to set this authorization + * information in the code, then you may use the + * AuthorizationInfo.addAuthorization() method to do so. Specify the + * same host and port as in this method, give the + * scheme "SOCKS5" and the realm "USER/PASS", set the + * cookie to null and the params to an array + * containing a single NVPair in turn containing the username and + * password. Example:

+     *     NVPair[] up = { new NVPair(username, password) };
+     *     AuthorizationInfo.addAuthorization(host, port, "SOCKS5", "USER/PASS",
+     *                                        null, up);
+     * 
If you have not given any authorization info and the proxy server + * requires authorization then you will be prompted for the necessary info + * via a popup the first time you do a request. + * + * @param host the host on which the proxy server resides. + * @param port the port the proxy server is listening on. + * @param version the SOCKS version the server is running. + * Currently this must be '4' or '5'. + * @exception SocksException If version is not '4' or '5'. + */ + public static void setSocksServer(String host, int port, int version) + throws SocksException + { + if (port <= 0) + { + port = 1080; + } + + if (host == null || host.length() == 0) + { + Default_Socks_client = null; + } + else + { + Default_Socks_client = new SocksClient(host, port, version); + } + } + + + /** + * Removes the #... part. Returns the stripped name, or "" if either the + * file is null or is the empty string (after stripping). + * + * @param file the name to strip + * @return the stripped name + */ + private final String stripRef(String file) + { + if (file == null) + { + return ""; + } + + int hash = file.indexOf('#'); + if (hash != -1) + { + file = file.substring(0, hash); + } + + return file.trim(); + } + + + // private helper methods + + /** + * Sets up the request, creating the list of headers to send and creating + * instances of the modules. This may be invoked by subclasses which add + * further methods (such as those from DAV and IPP). + * + * @param method GET, POST, etc. + * @param resource the resource + * @param headers an array of headers to be used + * @param entity the entity (or null) + * @param stream the output stream (or null) - only one of + * stream and entity may be non-null + * @return the response. + * @exception ModuleException if an exception is encountered in any module. 
+ * @exception IOException Description of the Exception + */ + protected final HTTPResponse setupRequest(String method, String resource, + NVPair[] headers, byte[] entity, + HttpOutputStream stream) + throws IOException, ModuleException + { + Request req = new Request(this, method, resource, + mergedHeaders(headers), entity, stream, + allowUI); + RequestList.addToEnd(req); + + try + { + HTTPResponse resp = new HTTPResponse(gen_mod_insts(), Timeout, req, defaultIncrement); + handleRequest(req, resp, null, true); + return resp; + } + finally + { + RequestList.remove(req); + } + } + + + /** + * This merges built-in default headers, user-specified default headers, and + * method-specified headers. Method-specified take precedence over user + * defaults, which take precedence over built-in defaults. The following + * headers are removed if found: "Content-length". + * + * @param spec the headers specified in the call to the method + * @return an array consisting of merged headers. + */ + private NVPair[] mergedHeaders(NVPair[] spec) + { + int spec_len = (spec != null ? spec.length : 0); + int + defs_len; + NVPair[] merged; + + synchronized (DefaultHeaders) + { + defs_len = (DefaultHeaders != null ? 
DefaultHeaders.length : 0); + merged = new NVPair[spec_len + defs_len]; + + // copy default headers + System.arraycopy(DefaultHeaders, 0, merged, 0, defs_len); + } + + // merge in selected headers + int sidx; + + // merge in selected headers + int didx = defs_len; + for (sidx = 0; sidx < spec_len; sidx++) + { + if (spec[sidx] == null) + { + continue; + } + + String s_name = spec[sidx].getName().trim(); + if (s_name.equalsIgnoreCase("Content-length")) + { + continue; + } + + int search; + for (search = 0; search < didx; search++) + { + if (merged[search].getName().trim().equalsIgnoreCase(s_name)) + { + break; + } + } + + merged[search] = spec[sidx]; + if (search == didx) + { + didx++; + } + } + + if (didx < merged.length) + { + merged = Util.resizeArray(merged, didx); + } + + return merged; + } + + + /** + * Generate an array of instances of the current modules. + * + * @return Description of the Return Value + */ + private HTTPClientModule[] gen_mod_insts() + { + synchronized (ModuleList) + { + HTTPClientModule[] mod_insts = + new HTTPClientModule[ModuleList.size()]; + + for (int idx = 0; idx < ModuleList.size(); idx++) + { + Class mod = (Class) ModuleList.elementAt(idx); + try + { + mod_insts[idx] = (HTTPClientModule) mod.newInstance(); + } + catch (Exception e) + { + throw new Error("HTTPClient Internal Error: could not " + + "create instance of " + mod.getName() + + " -\n" + e); + } + } + + return mod_insts; + } + } + + + /** + * handles the Request. First the request handler for each module is is + * invoked, and then if no response was generated the request is sent. 
+ * + * @param req the Request + * @param http_resp the HTTPResponse + * @param resp the Response + * @param usemodules if false then skip module loop + * @exception IOException if any module or sendRequest throws it + * @exception ModuleException if any module throws it + */ + void handleRequest(Request req, HTTPResponse http_resp, Response resp, + boolean usemodules) + throws IOException, ModuleException + { + Response[] rsp_arr = {resp}; + HTTPClientModule[] modules = http_resp.getModules(); + + // invoke requestHandler for each module + + if (usemodules) + { + doModules : + for (int idx = 0; idx < modules.length; idx++) + { + int sts = modules[idx].requestHandler(req, rsp_arr); + switch (sts) + { + case REQ_CONTINUE: + // continue processing + break; + case REQ_RESTART: + // restart processing with first module + idx = -1; + continue doModules; + case REQ_SHORTCIRC: + // stop processing and send + break doModules; + case REQ_RESPONSE: + // go to phase 2 + case REQ_RETURN: + // return response immediately + if (rsp_arr[0] == null) + { + throw new Error("HTTPClient Internal Error: no " + + "response returned by module " + + modules[idx].getClass().getName()); + } + http_resp.set(req, rsp_arr[0]); + if (req.getStream() != null) + { + req.getStream().ignoreData(req); + } + if (req.internal_subrequest) + { + return; + } + if (sts == REQ_RESPONSE) + { + http_resp.handleResponse(); + } + else + { + http_resp.init(rsp_arr[0]); + } + return; + case REQ_NEWCON_RST: + // new connection + if (req.internal_subrequest) + { + return; + } + req.getConnection(). + handleRequest(req, http_resp, rsp_arr[0], true); + return; + case REQ_NEWCON_SND: + // new connection, send immediately + if (req.internal_subrequest) + { + return; + } + req.getConnection(). 
+ handleRequest(req, http_resp, rsp_arr[0], false); + return; + default: + // not valid + throw new Error("HTTPClient Internal Error: invalid status" + + " " + sts + " returned by module " + + modules[idx].getClass().getName()); + } + } + } + + if (req.internal_subrequest) + { + return; + } + + // Send the request across the wire + + if (req.getStream() != null && req.getStream().getLength() == -1) + { + if (!ServProtVersKnown || ServerProtocolVersion < HTTP_1_1 || + no_chunked) + { + req.getStream().goAhead(req, null, http_resp.getTimeout()); + http_resp.set(req, req.getStream()); + } + else + { + // add Transfer-Encoding header if necessary + int idx; + NVPair[] hdrs = req.getHeaders(); + for (idx = 0; idx < hdrs.length; idx++) + { + if (hdrs[idx].getName().equalsIgnoreCase("Transfer-Encoding")) + { + break; + } + } + + if (idx == hdrs.length) + { + hdrs = Util.resizeArray(hdrs, idx + 1); + hdrs[idx] = new NVPair("Transfer-Encoding", "chunked"); + req.setHeaders(hdrs); + } + else + { + String v = hdrs[idx].getValue(); + try + { + if (!Util.hasToken(v, "chunked")) + { + hdrs[idx] = new NVPair("Transfer-Encoding", + v + ", chunked"); + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + } + + http_resp.set(req, sendRequest(req, http_resp.getTimeout())); + } + } + else + { + http_resp.set(req, sendRequest(req, http_resp.getTimeout())); + } + + if (req.aborted) + { + throw new IOException("Request aborted by user"); + } + } + + + /** + * These mark the response to stall the next request on, if any + */ + private volatile Response early_stall = null; + private volatile Response late_stall = null; + private volatile Response prev_resp = null; + /** + * This marks the socket output stream as still being used + */ + private boolean output_finished = true; + + + /** + * sends the request over the line. 
+ * + * @param req the request + * @param con_timeout the timeout to use when establishing a socket + * connection; an InterruptedIOException is thrown if the procedure + * times out. + * @return Description of the Return Value + * @exception IOException if thrown by the socket + * @exception ModuleException if any module throws it during the SSL- + * tunneling handshake + */ + Response sendRequest(Request req, int con_timeout) + throws IOException, ModuleException + { + ByteArrayOutputStream hdr_buf = new ByteArrayOutputStream(600); + Response resp = null; + boolean keep_alive; + + // The very first request is special in that we need its response + // before any further requests may be made. This is to set things + // like the server version. + + if (early_stall != null) + { + try + { + Log.write(Log.CONN, "Conn: Early-stalling Request: " + + req.getMethod() + " " + + req.getRequestURI()); + + synchronized (early_stall) + { + // wait till the response is received + try + { + early_stall.getVersion(); + } + catch (IOException ioe) + { + } + early_stall = null; + } + } + catch (NullPointerException npe) + { + } + } + + String[] con_hdrs = assembleHeaders(req, hdr_buf); + + // determine if the connection should be kept alive after this + // request + + try + { + if (ServerProtocolVersion >= HTTP_1_1 && + !Util.hasToken(con_hdrs[0], "close") + || + ServerProtocolVersion == HTTP_1_0 && + Util.hasToken(con_hdrs[0], "keep-alive") + ) + { + keep_alive = true; + } + else + { + keep_alive = false; + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + + synchronized (this) + { + // Sometimes we must stall the pipeline until the previous request + // has been answered. However, if we are going to open up a new + // connection anyway we don't really need to stall. 
+ + if (late_stall != null) + { + if (input_demux != null || keepAliveUnknown) + { + Log.write(Log.CONN, "Conn: Stalling Request: " + + req.getMethod() + " " + req.getRequestURI()); + + try + { + // wait till the response is received + + late_stall.getVersion(); + if (keepAliveUnknown) + { + determineKeepAlive(late_stall); + } + } + catch (IOException ioe) + { + } + } + + late_stall = null; + } + + /* + * POSTs must not be pipelined because of problems if the connection + * is aborted. Since it is generally impossible to know what urls + * POST will influence it is impossible to determine if a sequence + * of requests containing a POST is idempotent. + * Also, for retried requests we don't want to pipeline either. + */ + if ((req.getMethod().equals("POST") || req.dont_pipeline) && + prev_resp != null && input_demux != null) + { + Log.write(Log.CONN, "Conn: Stalling Request: " + + req.getMethod() + " " + req.getRequestURI()); + + try + { + // wait till the response is received + prev_resp.getVersion(); + } + catch (IOException ioe) + { + } + } + + // If the previous request used an output stream, then wait till + // all the data has been written + + if (!output_finished) + { + try + { + wait(); + } + catch (InterruptedException ie) + { + throw new IOException(ie.toString()); + } + } + + if (req.aborted) + { + throw new IOException("Request aborted by user"); + } + + int try_count = 3; + /* + * what a hack! This is to handle the case where the server closes + * the connection but we don't realize it until we try to send + * something. The problem is that we only get IOException, but + * we need a finer specification (i.e. whether it's an EPIPE or + * something else); I don't trust relying on the message part + * of IOException (which on SunOS/Solaris gives 'Broken pipe', + * but what on Windoze/Mac?). 
+ */ + while (try_count-- > 0) + { + try + { + // get a client socket + + Socket sock; + if (input_demux == null || + (sock = input_demux.getSocket()) == null) + { + sock = getSocket(con_timeout); + + if (Protocol == HTTPS) + { + if (Proxy_Host != null) + { + Socket[] sarr = {sock}; + resp = enableSSLTunneling(sarr, req, con_timeout); + if (resp != null) + { + resp.final_resp = true; + return resp; + } + sock = sarr[0]; + } + + sock.setSoTimeout(con_timeout); + //sock = new SSLSocket(sock); + } + + input_demux = new StreamDemultiplexor(Protocol, sock, this); + DemuxList.addToEnd(input_demux); + keepAliveReqLeft = keepAliveReqMax; + } + + if (req.aborted) + { + throw new IOException("Request aborted by user"); + } + + Log.write(Log.CONN, "Conn: Sending Request: ", hdr_buf); + + // Send headers + + OutputStream sock_out = sock.getOutputStream(); + if (haveMSLargeWritesBug) + { + sock_out = new MSLargeWritesBugStream(sock_out); + } + + hdr_buf.writeTo(sock_out); + + // Wait for "100 Continue" status if necessary + + try + { + if (ServProtVersKnown && + ServerProtocolVersion >= HTTP_1_1 && + Util.hasToken(con_hdrs[1], "100-continue")) + { + resp = new Response(req, (Proxy_Host != null && Protocol != HTTPS), input_demux); + resp.timeout = 60; + if (resp.getContinue() != 100) + { + break; + } + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + catch (InterruptedIOException iioe) + { + } + finally + { + if (resp != null) + { + resp.timeout = 0; + } + } + + // POST/PUT data + + if (req.getData() != null && req.getData().length > 0) + { + if (req.delay_entity > 0) + { + // wait for something on the network; check available() + // roughly every 100 ms + + long num_units = req.delay_entity / 100; + long one_unit = req.delay_entity / num_units; + + for (int idx = 0; idx < num_units; idx++) + { + if (input_demux.available(null) != 0) + { + break; + } + try + { + Thread.sleep(one_unit); + } + catch (InterruptedException ie) + { + } + } + + if 
(input_demux.available(null) == 0) + { + sock_out.write(req.getData()); + } + // he's still waiting + else + { + keep_alive = false; + } + // Uh oh! + } + else + { + sock_out.write(req.getData()); + } + } + + if (req.getStream() != null) + { + req.getStream().goAhead(req, sock_out, 0); + } + else + { + sock_out.flush(); + } + + // get a new response. + // Note: this does not do a read on the socket. + + if (resp == null) + { + resp = new Response(req, (Proxy_Host != null && + Protocol != HTTPS), + input_demux); + } + } + catch (IOException ioe) + { + Log.write(Log.CONN, "Conn: ", ioe); + + closeDemux(ioe, true); + + if (try_count == 0 || ioe instanceof UnknownHostException || + ioe instanceof ConnectException || + ioe instanceof NoRouteToHostException || + ioe instanceof InterruptedIOException || req.aborted) + { + throw ioe; + } + + Log.write(Log.CONN, "Conn: Retrying request"); + continue; + } + + break; + } + + prev_resp = resp; + + // close the stream after this response if necessary + + if ((!keepAliveUnknown && !doesKeepAlive) || !keep_alive || + (keepAliveReqMax != -1 && keepAliveReqLeft-- == 0)) + { + input_demux.markForClose(resp); + input_demux = null; + } + else + { + input_demux.restartTimer(); + } + + if (keepAliveReqMax != -1) + { + Log.write(Log.CONN, "Conn: Number of requests left: " + + keepAliveReqLeft); + } + + /* + * We don't pipeline the first request, as we need some info + * about the server (such as which http version it complies with) + */ + if (!ServProtVersKnown) + { + early_stall = resp; + resp.markAsFirstResponse(req); + } + + /* + * Also don't pipeline until we know if the server supports + * keep-alive's or not. + * Note: strictly speaking, HTTP/1.0 keep-alives don't mean we can + * pipeline requests. I seem to remember some (beta?) version + * of Netscape's Enterprise server which barfed if you tried + * push requests down it's throat w/o waiting for the previous + * response first. 
However, I've not been able to find such a + * server lately, and so I'm taking the risk and assuming we + * can in fact pipeline requests to HTTP/1.0 servers. + */ + if (keepAliveUnknown || + // We don't pipeline POST's ... + !IdempotentSequence.methodIsIdempotent(req.getMethod()) || + req.dont_pipeline || + // Retries disable pipelining too + neverPipeline) + { + // Emergency measure: prevent all pipelining + late_stall = resp; + } + + /* + * If there is an output stream then just tell the other threads to + * wait; the stream will notify() when it's done. If there isn't any + * stream then wake up a waiting thread (if any). + */ + if (req.getStream() != null) + { + output_finished = false; + } + else + { + output_finished = true; + notify(); + } + + // Looks like were finally done + + Log.write(Log.CONN, "Conn: Request sent"); + } + + return resp; + } + + + /** + * Gets a socket. Creates a socket to the proxy if set, or else to the + * actual destination. + * + * @param con_timeout if not 0 then start a new thread to establish the + * the connection and join(con_timeout) it. If the join() times out an + * InteruptedIOException is thrown. 
+ * @return The socket value + * @exception IOException Description of the Exception + */ + private Socket getSocket(int con_timeout) + throws IOException + { + Socket sock = null; + + String actual_host; + int actual_port; + + if (Proxy_Host != null) + { + actual_host = Proxy_Host; + actual_port = Proxy_Port; + } + else + { + actual_host = Host; + actual_port = Port; + } + + Log.write(Log.CONN, "Conn: Creating Socket: " + actual_host + ":" + + actual_port); + + if (con_timeout == 0) + { + // normal connection establishment + + if (Socks_client != null) + { + sock = Socks_client.getSocket(actual_host, actual_port); + } + else + { + // try all A records + InetAddress[] addr_list = InetAddress.getAllByName(actual_host); + for (int idx = 0; idx < addr_list.length; idx++) + { + try + { + if (LocalAddr == null) + { + sock = new Socket(addr_list[idx], actual_port); + } + else + { + sock = new Socket(addr_list[idx], actual_port, + LocalAddr, LocalPort); + } + break; + // success + } + catch (SocketException se) + { + if (idx == addr_list.length - 1) + { + throw se; + } + // we tried them all + } + } + } + } + else + { + EstablishConnection con = + new EstablishConnection(actual_host, actual_port, Socks_client); + con.start(); + try + { + con.join((long) con_timeout); + } + catch (InterruptedException ie) + { + } + + if (con.getException() != null) + { + throw con.getException(); + } + if ((sock = con.getSocket()) == null) + { + con.forget(); + if ((sock = con.getSocket()) == null) + { + throw new InterruptedIOException("Connection establishment timed out"); + } + } + } + + return sock; + } + + + /** + * Enable SSL Tunneling if we're talking to a proxy. See ietf draft + * draft-luotonen-ssl-tunneling-03 for more info. 
+ * + * @param sock the socket + * @param req the request initiating this connection + * @param timeout the timeout + * @return the proxy's last response if unsuccessful, or + * null if tunnel successfuly established + * @exception IOException + * @exception ModuleException + */ + private Response enableSSLTunneling(Socket[] sock, Request req, int timeout) + throws IOException, ModuleException + { + // copy User-Agent and Proxy-Auth headers from request + + Vector hdrs = new Vector(); + for (int idx = 0; idx < req.getHeaders().length; idx++) + { + String name = req.getHeaders()[idx].getName(); + if (name.equalsIgnoreCase("User-Agent") || + name.equalsIgnoreCase("Proxy-Authorization")) + { + hdrs.addElement(req.getHeaders()[idx]); + } + } + + // create initial CONNECT subrequest + + NVPair[] h = new NVPair[hdrs.size()]; + hdrs.copyInto(h); + Request connect = new Request(this, "CONNECT", Host + ":" + Port, h, + null, null, req.allowUI()); + connect.internal_subrequest = true; + + ByteArrayOutputStream hdr_buf = new ByteArrayOutputStream(600); + HTTPResponse r = new HTTPResponse(gen_mod_insts(), timeout, connect, defaultIncrement); + + // send and handle CONNECT request until successful or tired + + Response resp = null; + + while (true) + { + handleRequest(connect, r, resp, true); + + hdr_buf.reset(); + assembleHeaders(connect, hdr_buf); + + Log.write(Log.CONN, "Conn: Sending SSL-Tunneling Subrequest: ", + hdr_buf); + + // send CONNECT + + hdr_buf.writeTo(sock[0].getOutputStream()); + + // return if successful + + resp = new Response(connect, sock[0].getInputStream()); + if (resp.getStatusCode() == 200) + { + return null; + } + + // failed! 
+ + // make life easy: read data and close socket + + try + { + resp.getData(); + } + catch (IOException ioe) + { + } + try + { + sock[0].close(); + } + catch (IOException ioe) + { + } + + // handle response + + r.set(connect, resp); + if (!r.handleResponse()) + { + return resp; + } + + sock[0] = getSocket(timeout); + } + } + + + /** + * This writes out the headers on the hdr_buf . It takes special + * precautions for the following headers:
Content-type: This is + * only written if the request has an entity. If the request has an entity + * and no content-type header was given for the request it defaults to + * "application/octet-stream".
Content-length: This header is generated + * if the request has an entity and the entity isn't being sent with the + * Transfer-Encoding "chunked".
User-Agent: If not present it will be + * generated with the current HTTPClient version strings. Otherwise the + * given User-Agent string is written out unchanged.
Connection: + * This header is only written if no proxy is used. If no connection + * header is specified and the server is not known to understand HTTP/1.1 or + * later then a "Connection: keep-alive" header is generated.
+ * Proxy-Connection: This header is only written if a proxy is used. If no + * connection header is specified and the proxy is not known to understand + * HTTP/1.1 or later then a "Proxy-Connection: keep-alive" header is + * generated.
Keep-Alive: This header is only written if the + * Connection or Proxy-Connection header contains the Keep-Alive token.
+ * Expect: If there is no entity and this header contains the + * "100-continue" token then this token is removed before writing the + * header.
TE: If this header does not exist, it is created; else if + * the "trailers" token is not specified this token is added; else the + * header is not touched.
Furthermore, it escapes various characters + * in request-URI. + * + * @param req the Request + * @param hdr_buf the buffer onto which to write the headers + * @return an array of headers; the first element contains + * the the value of the Connection or Proxy-Connectin header, the + * second element the value of the Expect header. + * @exception IOException if writing on hdr_buf generates an an + * IOException, or if an error occurs during parsing of a header + */ + private String[] assembleHeaders(Request req, + ByteArrayOutputStream hdr_buf) + throws IOException + { + DataOutputStream dataout = new DataOutputStream(hdr_buf); + String[] con_hdrs = {"", ""}; + NVPair[] hdrs = req.getHeaders(); + + // remember various headers + + int ho_idx = -1; + + // remember various headers + + int + ct_idx = -1; + + // remember various headers + + int + ua_idx = -1; + + // remember various headers + + int + co_idx = -1; + + // remember various headers + + int + pc_idx = -1; + + // remember various headers + + int + ka_idx = -1; + + // remember various headers + + int + ex_idx = -1; + + // remember various headers + + int + te_idx = -1; + + // remember various headers + + int + tc_idx = -1; + + // remember various headers + + int + ug_idx = -1; + for (int idx = 0; idx < hdrs.length; idx++) + { + String name = hdrs[idx].getName().trim().toLowerCase(); + if (name.equals("host")) + { + ho_idx = idx; + } + else if (name.equals("content-type")) + { + ct_idx = idx; + } + else if (name.equals("user-agent")) + { + ua_idx = idx; + } + else if (name.equals("connection")) + { + co_idx = idx; + } + else if (name.equals("proxy-connection")) + { + pc_idx = idx; + } + else if (name.equals("keep-alive")) + { + ka_idx = idx; + } + else if (name.equals("expect")) + { + ex_idx = idx; + } + else if (name.equals("te")) + { + te_idx = idx; + } + else if (name.equals("transfer-encoding")) + { + tc_idx = idx; + } + else if (name.equals("upgrade")) + { + ug_idx = idx; + } + } + + // Generate request 
line and Host header + + String file = Util.escapeUnsafeChars(req.getRequestURI()); + if (Proxy_Host != null && Protocol != HTTPS && !file.equals("*")) + { + dataout.writeBytes(req.getMethod() + " http://" + Host + ":" + Port + + file + " " + RequestProtocolVersion + "\r\n"); + } + else + { + dataout.writeBytes(req.getMethod() + " " + file + " " + + RequestProtocolVersion + "\r\n"); + } + + String h_hdr = (ho_idx >= 0) ? hdrs[ho_idx].getValue().trim() : Host; + if (Port != URI.defaultPort(getProtocol())) + { + dataout.writeBytes("Host: " + h_hdr + ":" + Port + "\r\n"); + } + else + { + // Netscape-Enterprise has some bugs... + dataout.writeBytes("Host: " + h_hdr + "\r\n"); + } + + /* + * What follows is the setup for persistent connections. We default + * to doing persistent connections for both HTTP/1.0 and HTTP/1.1, + * unless we're using a proxy server and HTTP/1.0 in which case we + * must make sure we don't do persistence (because of the problem of + * 1.0 proxies blindly passing the Connection header on). + * + * Note: there is a "Proxy-Connection" header for use with proxies. + * This however is only understood by Netscape and Netapp caches. + * Furthermore, it suffers from the same problem as the Connection + * header in HTTP/1.0 except that at least two proxies must be + * involved. But I've taken the risk now and decided to send the + * Proxy-Connection header. If I get complaints I'll remove it again. + * + * In any case, with this header we can now modify the above to send + * the Proxy-Connection header whenever we wouldn't send the normal + * Connection header. 
+ */ + String co_hdr = null; + if (!(ServProtVersKnown && ServerProtocolVersion >= HTTP_1_1 && + co_idx == -1)) + { + if (co_idx == -1) + { + // no connection header given by user + co_hdr = "Keep-Alive"; + con_hdrs[0] = "Keep-Alive"; + } + else + { + con_hdrs[0] = hdrs[co_idx].getValue().trim(); + co_hdr = con_hdrs[0]; + } + + try + { + if (ka_idx != -1 && + Util.hasToken(con_hdrs[0], "keep-alive")) + { + dataout.writeBytes("Keep-Alive: " + + hdrs[ka_idx].getValue().trim() + "\r\n"); + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + } + + if ((Proxy_Host != null && Protocol != HTTPS) && + !(ServProtVersKnown && ServerProtocolVersion >= HTTP_1_1)) + { + if (co_hdr != null) + { + dataout.writeBytes("Proxy-Connection: "); + dataout.writeBytes(co_hdr); + dataout.writeBytes("\r\n"); + co_hdr = null; + } + } + + if (co_hdr != null) + { + try + { + if (!Util.hasToken(co_hdr, "TE")) + { + co_hdr += ", TE"; + } + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + } + else + { + co_hdr = "TE"; + } + + if (ug_idx != -1) + { + co_hdr += ", Upgrade"; + } + + if (co_hdr != null) + { + dataout.writeBytes("Connection: "); + dataout.writeBytes(co_hdr); + dataout.writeBytes("\r\n"); + } + + // handle TE header + + if (te_idx != -1) + { + dataout.writeBytes("TE: "); + Vector pte; + try + { + pte = Util.parseHeader(hdrs[te_idx].getValue()); + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + + if (!pte.contains(new HttpHeaderElement("trailers"))) + { + dataout.writeBytes("trailers, "); + } + + dataout.writeBytes(hdrs[te_idx].getValue().trim() + "\r\n"); + } + else + { + dataout.writeBytes("TE: trailers\r\n"); + } + + // User-Agent + + if (ua_idx != -1) + { + dataout.writeBytes("User-Agent: " + hdrs[ua_idx].getValue().trim() + "\r\n"); + } + else + { + dataout.writeBytes("User-Agent: " + version + "\r\n"); + } + + // Write out any headers left + + for (int idx = 0; idx < hdrs.length; idx++) 
+ { + if (idx != ct_idx && idx != ua_idx && idx != co_idx && + idx != pc_idx && idx != ka_idx && idx != ex_idx && + idx != te_idx && idx != ho_idx) + { + dataout.writeBytes(hdrs[idx].getName().trim() + ": " + + hdrs[idx].getValue().trim() + "\r\n"); + } + } + + // Handle Content-type, Content-length and Expect headers + + if (req.getData() != null || req.getStream() != null) + { + dataout.writeBytes("Content-type: "); + if (ct_idx != -1) + { + dataout.writeBytes(hdrs[ct_idx].getValue().trim()); + } + else + { + dataout.writeBytes("application/octet-stream"); + } + dataout.writeBytes("\r\n"); + + if (req.getData() != null) + { + dataout.writeBytes("Content-length: " + req.getData().length + + "\r\n"); + } + else if (req.getStream().getLength() != -1 && tc_idx == -1) + { + dataout.writeBytes("Content-length: " + + req.getStream().getLength() + "\r\n"); + } + + if (ex_idx != -1) + { + con_hdrs[1] = hdrs[ex_idx].getValue().trim(); + dataout.writeBytes("Expect: " + con_hdrs[1] + "\r\n"); + } + } + else if (ex_idx != -1) + { + Vector expect_tokens; + try + { + expect_tokens = Util.parseHeader(hdrs[ex_idx].getValue()); + } + catch (ParseException pe) + { + throw new IOException(pe.toString()); + } + + // remove any 100-continue tokens + + HttpHeaderElement cont = new HttpHeaderElement("100-continue"); + while (expect_tokens.removeElement(cont)) + { + ; + } + + // write out header if any tokens left + + if (!expect_tokens.isEmpty()) + { + con_hdrs[1] = Util.assembleHeader(expect_tokens); + dataout.writeBytes("Expect: " + con_hdrs[1] + "\r\n"); + } + } + + dataout.writeBytes("\r\n"); + // end of header + + return con_hdrs; + } + + + /** + * The very first request is special in that we use it to figure out the + * protocol version the server (or proxy) is compliant with. 
+ * + * @param req Description of the Parameter + * @param resp Description of the Parameter + * @return true if all went fine, false if the request needs + * to be resent + * @exception IOException if any exception is thrown by the response + */ + boolean handleFirstRequest(Request req, Response resp) + throws IOException + { + // read response headers to get protocol version used by + // the server. + + ServerProtocolVersion = String2ProtVers(resp.getVersion()); + ServProtVersKnown = true; + + /* + * We need to treat connections through proxies specially, because + * many HTTP/1.0 proxies do not downgrade an HTTP/1.1 response + * version to HTTP/1.0 (i.e. when we are talking to an HTTP/1.1 + * server through an HTTP/1.0 proxy we are misled into thinking we're + * talking to an HTTP/1.1 proxy). We use the absence of the Via + * header to detect whether we're talking to an HTTP/1.0 proxy, + * unless the status code indicates an error from the proxy + * itself. However, this only works when the chain contains + * only HTTP/1.0 proxies; if you have a chain such as + * client - 1.0 proxy - 1.1 proxy - server then this will fail too. Unfortunately there + * seems to be no way to reliably detect broken HTTP/1.0 + * proxies... 
+ */ + int sts = resp.getStatusCode(); + if ((Proxy_Host != null && Protocol != HTTPS) && + resp.getHeader("Via") == null && + sts != 407 && sts != 502 && sts != 504) + { + ServerProtocolVersion = HTTP_1_0; + } + + Log.write(Log.CONN, "Conn: Protocol Version established: " + + ProtVers2String(ServerProtocolVersion)); + + // some (buggy) servers return an error status if they get a + // version they don't comprehend + + if (ServerProtocolVersion == HTTP_1_0 && + (resp.getStatusCode() == 400 || resp.getStatusCode() == 500)) + { + if (input_demux != null) + { + input_demux.markForClose(resp); + } + input_demux = null; + RequestProtocolVersion = "HTTP/1.0"; + return false; + } + + return true; + } + + + /** + * Works out from a response whether the server (or proxy) keeps the + * connection alive and, if so, how many requests it accepts on it. + * Keep-alive is switched on if the server protocol version is HTTP/1.1 or + * later, or if the Connection header (the Proxy-Connection header when a + * proxy is used and the protocol is not HTTPS) contains the keep-alive + * token. If neither holds and the status code is below 400 the question is + * marked as settled without switching keep-alive on; for other status codes + * it stays open. For an HTTP/1.0 server doing keep-alives the "max" element + * of the Keep-Alive header, if present, becomes the request limit. + * Header values that cannot be parsed are ignored. + * + * @param resp the response to inspect + * @exception IOException passed on from the Response accessors called here + */ + private void determineKeepAlive(Response resp) + throws IOException + { + // try and determine if this server does keep-alives + + String con; + + try + { + if (ServerProtocolVersion >= HTTP_1_1 || + ( + ( + ((Proxy_Host == null || Protocol == HTTPS) && + (con = resp.getHeader("Connection")) != null) + || + ((Proxy_Host != null && Protocol != HTTPS) && + (con = resp.getHeader("Proxy-Connection")) != null) + ) && + Util.hasToken(con, "keep-alive") + ) + ) + { + doesKeepAlive = true; + keepAliveUnknown = false; + + Log.write(Log.CONN, "Conn: Keep-Alive enabled"); + } + else if (resp.getStatusCode() < 400) + { + keepAliveUnknown = false; + } + + // get maximum number of requests + + if (doesKeepAlive && ServerProtocolVersion == HTTP_1_0 && + (con = resp.getHeader("Keep-Alive")) != null) + { + HttpHeaderElement max = + Util.getElement(Util.parseHeader(con), "max"); + if (max != null && max.getValue() != null) + { + keepAliveReqMax = Integer.parseInt(max.getValue()); + keepAliveReqLeft = keepAliveReqMax; + + Log.write(Log.CONN, "Conn: Max Keep-Alive requests: " + + keepAliveReqMax); + } + } + } + catch (ParseException pe) + { + /* unparsable header: keep whatever has been determined so far */ + } + 
catch (NumberFormatException nfe) + { + /* the max value of Keep-Alive is not a number: keep the current limit */ + } + catch (ClassCastException cce) + { + /* NOTE(review): presumably raised by the Util header helpers - confirm */ + } + } + + + /** + * Records that the output for the pending request has been written + * completely and wakes up one thread waiting on this object. + */ + synchronized void outputFinished() + { + output_finished = true; + notify(); + } + + + /** + * Closes the current stream demultiplexor, if there is one, and resets + * early_stall, late_stall and prev_resp to null. The input_demux reference + * itself is left as it is. + * + * @param ioe the exception handed on to the demultiplexor's close() + * @param was_reset the flag handed on to the demultiplexor's close() + */ + synchronized void closeDemux(IOException ioe, boolean was_reset) + { + if (input_demux != null) + { + input_demux.close(ioe, was_reset); + } + + early_stall = null; + late_stall = null; + prev_resp = null; + } + + + /** + * Formats a packed protocol version as "HTTP/major.minor". The major + * number is taken from the upper 16 bits and the minor number from the + * lower 16 bits. + * + * @param prot_vers the packed protocol version + * @return the version string, e.g. "HTTP/1.1" for 0x00010001 + */ + final static String ProtVers2String(int prot_vers) + { + return "HTTP/" + (prot_vers >>> 16) + "." + (prot_vers & 0xFFFF); + } + + + /** + * Parses a version string of the form "HTTP/major.minor" into the packed + * form understood by ProtVers2String (major number in the upper 16 bits, + * minor number in the lower 16 bits). The first five characters are skipped + * without being checked; a string which is too short, has no '.' after them + * or has non-numeric parts makes substring() or parseInt() throw an + * unchecked exception. + * + * @param prot_vers the version string, e.g. "HTTP/1.1" + * @return the packed protocol version + */ + final static int String2ProtVers(String prot_vers) + { + String vers = prot_vers.substring(5); + int dot = vers.indexOf('.'); + return Integer.parseInt(vers.substring(0, dot)) << 16 | + Integer.parseInt(vers.substring(dot + 1)); + } + + + /** + * Generates a string of the form protocol://host.domain:port . The port is + * left out if it is the default port of the protocol. + * + * @return the string + */ + public String toString() + { + return getProtocol() + "://" + getHost() + + (getPort() != URI.defaultPort(getProtocol()) ? ":" + getPort() : ""); + } + + + /** + * Description of the Class + * + * @author Administrator + * @created 29. 
Dezember 2001 + */ + private class EstablishConnection extends Thread + { + /* + * Runs the connection attempt in a thread of its own so that the creator + * can join() it with a timeout. The outcome is read with getSocket() and + * getException(); forget() abandons the attempt. + */ + String actual_host; + int actual_port; + IOException exception; + Socket sock; + SocksClient Socks_client; + boolean close; + + + /** + * Sets up the thread and tries to make it a daemon thread; the connection + * attempt itself only happens in run(). + * + * @param host the host to connect to + * @param port the port to connect to + * @param socks the SOCKS client to get the socket from, or null to + * connect directly + */ + EstablishConnection(String host, int port, SocksClient socks) + { + super("EstablishConnection (" + host + ":" + port + ")"); + try + { + setDaemon(true); + } + catch (SecurityException se) + { + } + // Oh well... + + actual_host = host; + actual_port = port; + Socks_client = socks; + + exception = null; + sock = null; + close = false; + } + + + /** + * Tries to get the socket: from the SOCKS client if one was given, + * otherwise by trying each address the host name resolves to in turn. + * An IOException is stored for getException(). If forget() has been + * called in the meantime, a socket that was obtained is closed again and + * dropped. + */ + public void run() + { + try + { + if (Socks_client != null) + { + sock = Socks_client.getSocket(actual_host, actual_port); + } + else + { + // try all A records + InetAddress[] addr_list = InetAddress.getAllByName(actual_host); + for (int idx = 0; idx < addr_list.length; idx++) + { + try + { + if (LocalAddr == null) + { + sock = new Socket(addr_list[idx], actual_port); + } + else + { + sock = new Socket(addr_list[idx], actual_port, + LocalAddr, LocalPort); + } + break; + // success + } + catch (SocketException se) + { + if (idx == addr_list.length - 1 || close) + { + throw se; + } + // we tried them all + } + } + } + } + catch (IOException ioe) + { + exception = ioe; + } + + if (close && sock != null) + { + try + { + sock.close(); + } + catch (IOException ioe) + { + } + sock = null; + } + } + + + /** + * Gets the exception which ended the connection attempt. + * + * @return the stored IOException, or null if none has occurred (so far) + */ + IOException getException() + { + return exception; + } + + + /** + * Gets the socket obtained by run(). + * + * @return the socket, or null if there is none (yet) + */ + Socket getSocket() + { + return sock; + } + + + /** + * Description of the 
Method + */ + void forget() + { + /* tells run() to close and drop the socket once it has one */ + close = true; + } + } + + + /** + * M$ has yet another bug in their WinSock: if you try to write too much + * data at once it'll hang itself. This filter therefore splits big writes + * up into multiple writes of at most 20K. + * + * @author Administrator + * @created 29. Dezember 2001 + */ + private class MSLargeWritesBugStream extends FilterOutputStream + { + private final int CHUNK_SIZE = 20000; + + + /** + * Wraps the given stream. + * + * @param os the stream the (split up) writes are passed on to + */ + MSLargeWritesBugStream(OutputStream os) + { + super(os); + } + + + /** + * Writes len bytes of b, starting at off, to the wrapped stream in pieces + * of at most CHUNK_SIZE bytes each. + * + * @param b the buffer holding the data + * @param off the offset in b of the first byte to write + * @param len the number of bytes to write + * @exception IOException if a write on the wrapped stream fails + */ + public void write(byte[] b, int off, int len) + throws IOException + { + while (len > CHUNK_SIZE) + { + out.write(b, off, CHUNK_SIZE); + off += CHUNK_SIZE; + len -= CHUNK_SIZE; + } + out.write(b, off, len); + } + } + + /** + * Sets the read increment which this connection passes to the HTTPResponse + * constructor (as done in enableSSLTunneling). The misspelt method name is + * kept as it is part of the public interface. + * + * @param increment the new default read increment + */ + public void setDefaultReadIncement(int increment) + { + this.defaultIncrement = increment; + } + + /** the read increment passed to the HTTPResponse constructor */ + int defaultIncrement = 1000; + + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPResponse.java b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPResponse.java new file mode 100644 index 00000000000..c7db0d84846 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/HTTPClient/HTTPResponse.java @@ -0,0 +1,1419 @@ +/* + * @(#)HTTPResponse.java 0.3-3 06/05/2001 + * + * This file is part of the HTTPClient package + * Copyright (C) 1996-2001 Ronald Tschalär + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307, USA + * + * For questions, suggestions, bug-reports, enhancement-requests etc. + * I may be contacted at: + * + * ronald@innovation.ch + * + * The HTTPClient's home page is located at: + * + * http://www.innovation.ch/java/HTTPClient/ + * + */ +package HTTPClient; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.io.InputStream; +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.Date; +import java.util.LinkedList; +import java.util.Enumeration; +import java.util.Iterator; + +/** + * This defines the http-response class returned by the requests. It's basically + * a wrapper around the Response class which first lets all the modules handle + * the response before finally giving the info to the user. + * + * @author Ronald Tschalär + * @created 29. 
Dezember 2001 + * @version 0.3-3 06/05/2001 + * @since 0.3 + */ + +class ByteBlock +{ + byte[] block; + int length; + ByteBlock(int size) + { + block = new byte[size]; + } +} + +public class HTTPResponse implements HTTPClientModuleConstants +{ + /** + * the list of modules + */ + private HTTPClientModule[] modules; + + /** + * the timeout for reads + */ + private int timeout; + + /** + * the request + */ + private Request request = null; + + /** + * the current response + */ + Response response = null; + + /** + * the HttpOutputStream to synchronize on + */ + private HttpOutputStream out_stream = null; + + /** + * our input stream from the stream demux + */ + private InputStream inp_stream; + + /** + * the status code returned. + */ + private int StatusCode; + + /** + * the reason line associated with the status code. + */ + private String ReasonLine; + + /** + * the HTTP version of the response. + */ + private String Version; + + /** + * the original URI used. + */ + private URI OriginalURI = null; + + /** + * the final URI of the document. + */ + private URI EffectiveURI = null; + + /** + * any headers which were received and do not fit in the above list. + */ + private CIHashtable Headers = null; + + /** + * any trailers which were received and do not fit in the above list. + */ + private CIHashtable Trailers = null; + + /** + * the ContentLength of the data. + */ + private int ContentLength = -1; + + /** + * the data (body) returned. + */ + private byte[] Data = null; + + /** + * signals if we have got and parsed the headers yet? + */ + private boolean initialized = false; + + /** + * signals if we have got the trailers yet? + */ + private boolean got_trailers = false; + + /** + * marks this response as aborted (stop() in HTTPConnection) + */ + private boolean aborted = false; + + /** + * should the request be retried by the application? 
+ */ + private boolean retry = false; + + /** + * the method used in the request + */ + private String method = null; + + + // Constructors + + /** + * Creates a new HTTPResponse. + * + * @param modules the list of modules handling this response + * @param timeout the timeout to be used on stream read()'s + * @param orig Description of the Parameter + * @param readIncrement Description of the Parameter + */ + HTTPResponse(HTTPClientModule[] modules, int timeout, Request orig, int readIncrement) + { + this.modules = modules; + this.timeout = timeout; + try + { + int qp = orig.getRequestURI().indexOf('?'); + this.OriginalURI = new URI(orig.getConnection().getProtocol(), + null, + orig.getConnection().getHost(), + orig.getConnection().getPort(), + qp < 0 ? orig.getRequestURI() : + orig.getRequestURI().substring(0, qp), + qp < 0 ? null : + orig.getRequestURI().substring(qp + 1), + null); + } + catch (ParseException pe) + { + } + this.method = orig.getMethod(); + this.readIncrement = readIncrement; + } + + + int readIncrement = 1000; + + + /** + * Sets the readIncrement attribute of the HTTPResponse object + * + * @param readIncrement The new readIncrement value + */ + public void setReadIncrement(int readIncrement) + { + this.readIncrement = readIncrement; + } + + + /** + * Gets the readIncrement attribute of the HTTPResponse object + * + * @return The readIncrement value + */ + public int getReadIncrement() + { + return this.readIncrement; + } + + + /** + * @param req the request + * @param resp the response + */ + void set(Request req, Response resp) + { + this.request = req; + this.response = resp; + resp.http_resp = this; + resp.timeout = timeout; + this.aborted = resp.final_resp; + } + + + /** + * @param req the request + * @param out_stream Description of the Parameter + */ + void set(Request req, HttpOutputStream out_stream) + { + this.request = req; + this.out_stream = out_stream; + } + + + // Methods + + /** + * Give the status code for this request. 
These are grouped as follows: + * + * + * + * @return The statusCode value + * @exception IOException if any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public final int getStatusCode() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return StatusCode; + } + + + /** + * Give the reason line associated with the status code. + * + * @return The reasonLine value + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public final String getReasonLine() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return ReasonLine; + } + + + /** + * Get the HTTP version used for the response. + * + * @return The version value + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public final String getVersion() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return Version; + } + + + /** + * Get the name and type of server. + * + * @return The server value + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + * @deprecated This method is a remnant of V0.1; use getHeader("Server") + * instead. + * @see #getHeader(java.lang.String) + */ + public final String getServer() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return getHeader("Server"); + } + + + /** + * Get the original URI used in the request. + * + * @return the URI used in primary request + */ + public final URI getOriginalURI() + { + return OriginalURI; + } + + + /** + * Get the final URL of the document. This is set if the original request + * was deferred via the "moved" (301, 302, or 303) return status. 
+ * + * @return the effective URL, or null if no redirection + * occured + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + * @deprecated use getEffectiveURI() instead + * @see #getEffectiveURI + */ + public final URL getEffectiveURL() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + if (EffectiveURI != null) + { + return EffectiveURI.toURL(); + } + return null; + } + + + /** + * Get the final URI of the document. If the request was redirected via the + * "moved" (301, 302, 303, or 307) return status this returns the URI used + * in the last redirection; otherwise it returns the original URI. + * + * @return the effective URI + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public final URI getEffectiveURI() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + if (EffectiveURI != null) + { + return EffectiveURI; + } + return OriginalURI; + } + + + /** + * Retrieves the value for a given header. + * + * @param hdr the header name. + * @return the value for the header, or null if + * non-existent. + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public String getHeader(String hdr) + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return (String) Headers.get(hdr.trim()); + } + + + /** + * Retrieves the value for a given header. The value is parsed as an int. + * + * @param hdr the header name. + * @return the value for the header if the header + * exists + * @exception NumberFormatException if the header's value is not a number + * or if the header does not exist. + * @exception IOException if any exception occurs on the socket. 
+ * @exception ModuleException if any module encounters an exception. + */ + public int getHeaderAsInt(String hdr) + throws IOException, ModuleException, NumberFormatException + { + String val = getHeader(hdr); + if (val == null) + { + throw new NumberFormatException("null"); + } + return Integer.parseInt(val); + } + + + /** + * Retrieves the value for a given header. The value is parsed as a date; if + * this fails it is parsed as a long representing the number of seconds + * since 12:00 AM, Jan 1st, 1970. If this also fails an exception is thrown. + *
+ * Note: When sending dates use Util.httpDate(). + * + * @param hdr the header name. + * @return the value for the header, or null if + * non-existent. + * @exception IllegalArgumentException if the header's value is neither a + * legal date nor a number. + * @exception IOException if any exception occurs on the + * socket. + * @exception ModuleException if any module encounters an + * exception. + */ + public Date getHeaderAsDate(String hdr) + throws IOException, IllegalArgumentException, ModuleException + { + String raw_date = getHeader(hdr); + if (raw_date == null) + { + return null; + } + + // asctime() format is missing an explicit GMT specifier + if (raw_date.toUpperCase().indexOf("GMT") == -1 && + raw_date.indexOf(' ') > 0) + { + raw_date += " GMT"; + } + + Date date; + + try + { + date = Util.parseHttpDate(raw_date); + } + catch (IllegalArgumentException iae) + { + // some servers erroneously send a number, so let's try that + long time; + try + { + time = Long.parseLong(raw_date); + } + catch (NumberFormatException nfe) + { + throw iae; + } + // give up + if (time < 0) + { + time = 0; + } + date = new Date(time * 1000L); + } + + return date; + } + + + /** + * Returns an enumeration of all the headers available via getHeader(). + * + * @return Description of the Return Value + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public Enumeration listHeaders() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + return Headers.keys(); + } + + + /** + * Retrieves the value for a given trailer. This should not be invoked until + * all response data has been read. If invoked before it will call getData() + * to force the data to be read. + * + * @param trailer the trailer name. + * @return the value for the trailer, or null if + * non-existent. + * @exception IOException If any exception occurs on the socket. 
+ * @exception ModuleException if any module encounters an exception. + * @see #getData() + */ + public String getTrailer(String trailer) + throws IOException, ModuleException + { + if (!got_trailers) + { + getTrailers(); + } + return (String) Trailers.get(trailer.trim()); + } + + + /** + * Retrieves the value for a given tailer. The value is parsed as an int. + * + * @param trailer the tailer name. + * @return the value for the trailer if the + * trailer exists + * @exception NumberFormatException if the trailer's value is not a number + * or if the trailer does not exist. + * @exception IOException if any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public int getTrailerAsInt(String trailer) + throws IOException, ModuleException, NumberFormatException + { + String val = getTrailer(trailer); + if (val == null) + { + throw new NumberFormatException("null"); + } + return Integer.parseInt(val); + } + + + /** + * Retrieves the value for a given trailer. The value is parsed as a date; + * if this fails it is parsed as a long representing the number of seconds + * since 12:00 AM, Jan 1st, 1970. If this also fails an + * IllegalArgumentException is thrown.
+ * Note: When sending dates use Util.httpDate(). + * + * @param trailer the trailer name. + * @return the value for the trailer, or null + * if non-existent. + * @exception IllegalArgumentException if the trailer's value is neither a + * legal date nor a number. + * @exception IOException if any exception occurs on the + * socket. + * @exception ModuleException if any module encounters an + * exception. + */ + public Date getTrailerAsDate(String trailer) + throws IOException, IllegalArgumentException, ModuleException + { + String raw_date = getTrailer(trailer); + if (raw_date == null) + { + return null; + } + + // asctime() format is missing an explicit GMT specifier + if (raw_date.toUpperCase().indexOf("GMT") == -1 && + raw_date.indexOf(' ') > 0) + { + raw_date += " GMT"; + } + + Date date; + + try + { + date = Util.parseHttpDate(raw_date); + } + catch (IllegalArgumentException iae) + { + // some servers erroneously send a number, so let's try that + long time; + try + { + time = Long.parseLong(raw_date); + } + catch (NumberFormatException nfe) + { + throw iae; + } + // give up + if (time < 0) + { + time = 0; + } + date = new Date(time * 1000L); + } + + return date; + } + + + /** + * Returns an enumeration of all the trailers available via getTrailer(). + * + * @return Description of the Return Value + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public Enumeration listTrailers() + throws IOException, ModuleException + { + if (!got_trailers) + { + getTrailers(); + } + return Trailers.keys(); + } + + + /** + * Reads all the response data into a byte array. Note that this method + * won't return until all the data has been received (so for + * instance don't invoke this method if the server is doing a server push). + * If getInputStream() had been previously invoked then this + * method only returns any unread data remaining on the stream and then + * closes it.

+ * + * Note to the unwary: code like

+     *     System.out.println("The data: " + resp.getData())
+     *
will probably not do what you want - use
+     *     System.out.println("The data: " + resp.getText())
+     *
instead. + * + * @return an array containing the data (body) returned. + * If no data was returned then it's set to a zero-length array. + * @exception IOException If any io exception occured while reading the + * data + * @exception ModuleException if any module encounters an exception. + * @see #getInputStream() + */ + + public byte[] getData() throws IOException, ModuleException + { + return getData(-1); + } + + public byte[] getData(int max) + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + + if (Data == null) + { + try + { + readResponseData(inp_stream, max); + } + catch (InterruptedIOException ie) + { + // don't intercept + throw ie; + } + catch (IOException ioe) + { + Log.write(Log.RESP, "HResp: (\"" + method + " " + + OriginalURI.getPathAndQuery() + "\")"); + Log.write(Log.RESP, " ", ioe); + + try + { + inp_stream.close(); + } + catch (Exception e) + { + } + throw ioe; + } + + inp_stream.close(); + } + + return Data; + } + + + /** + * Reads all the response data into a buffer and turns it into a string + * using the appropriate character converter. Since this uses {@link + * #getData() getData()}, the caveats of that method apply here as well. + * + * @return the body as a String. If no data was returned + * then an empty string is returned. + * @exception IOException If any io exception occured while reading the + * data, or if the content is not text + * @exception ModuleException if any module encounters an exception. 
+ * @exception ParseException if an error occured trying to parse the + * content-type header field + * @see #getData() + */ + public synchronized String getText() + throws IOException, ModuleException, ParseException + { + String ct = getHeader("Content-Type"); + if (ct == null || !ct.toLowerCase().startsWith("text/")) + { + throw new IOException("Content-Type `" + ct + "' is not a text type"); + } + + String charset = Util.getParameter("charset", ct); + if (charset == null) + { + charset = "ISO-8859-1"; + } + + return new String(getData(), charset); + } + + + /** + * Gets an input stream from which the returned data can be read. Note that + * if getData() had been previously invoked it will actually + * return a ByteArrayInputStream created from that data. + * + * @return the InputStream. + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + * @see #getData() + */ + public synchronized InputStream getInputStream() + throws IOException, ModuleException + { + if (!initialized) + { + handleResponse(); + } + + if (Data == null) + { + return inp_stream; + } + else + { + getData(); + // ensure complete data is read + return new ByteArrayInputStream(Data); + } + } + + + /** + * Should the request be retried by the application? If the application used + * an HttpOutputStream in the request then various modules (such + * as the redirection and authorization modules) are not able to resend the + * request themselves. Instead, it becomes the application's responsibility. + * The application can check this flag, and if it's set, resend the exact + * same request. The modules such as the RedirectionModule or + * AuthorizationModule will then recognize the resend and fix up or redirect + * the request as required (i.e. they defer their normal action until the + * resend).

+ * + * If the application resends the request then it must use + * the same HttpOutputStream instance. This is because the + * modules use this to recognize the retried request and to perform the + * necessary work on the request before it's sent.

+ * + * Here is a skeleton example of usage:

+     *     OutputStream out = new HttpOutputStream(1234);
+     *     do
+     *     {
+     *         rsp = con.Post("/cgi-bin/my_cgi", out);
+     *         out.write(...);
+     *         out.close();
+     *     } while (rsp.retryRequest());
+     *
+     *     if (rsp.getStatusCode() >= 300)
+     *         ...
+     * 

+ * + * Note that for this to ever return true, the java system property + * HTTPClient.deferStreamed must be set to true at the beginning of + * the application (before the HTTPConnection class is loaded). This + * prevents unwary applications from causing inadvertent memory leaks. If an + * application does set this, then it must resend any request whose + * response returns true here in order to prevent memory leaks (a switch to + * JDK 1.2 will allow us to use weak references and eliminate this problem). + * + * @return true if the request should be retried. + * @exception IOException If any exception occurs on the socket. + * @exception ModuleException if any module encounters an exception. + */ + public boolean retryRequest() + throws IOException, ModuleException + { + if (!initialized) + { + try + { + handleResponse(); + } + catch (RetryException re) + { + this.retry = response.retry; + } + } + return retry; + } + + + /** + * produces a full list of headers and their values, one per line. 
+ * + * @return a string containing the headers + */ + public String toString() + { + if (!initialized) + { + try + { + handleResponse(); + } + catch (Exception e) + { + if (!(e instanceof InterruptedIOException)) + { + Log.write(Log.RESP, "HResp: (\"" + method + " " + + OriginalURI.getPathAndQuery() + "\")"); + Log.write(Log.RESP, " ", e); + } + return "Failed to read headers: " + e; + } + } + + String nl = System.getProperty("line.separator", "\n"); + + StringBuffer str = new StringBuffer(Version); + str.append(' '); + str.append(StatusCode); + str.append(' '); + str.append(ReasonLine); + str.append(nl); + + if (EffectiveURI != null) + { + str.append("Effective-URI: "); + str.append(EffectiveURI); + str.append(nl); + } + + Enumeration hdr_list = Headers.keys(); + while (hdr_list.hasMoreElements()) + { + String hdr = (String) hdr_list.nextElement(); + str.append(hdr); + str.append(": "); + str.append(Headers.get(hdr)); + str.append(nl); + } + + return str.toString(); + } + + + // Helper Methods + + + /** + * Gets the modules attribute of the HTTPResponse object + * + * @return The modules value + */ + HTTPClientModule[] getModules() + { + return modules; + } + + + /** + * Processes a Response. This is done by calling the response handler in + * each module. When all is done, the various fields of this instance are + * intialized from the last Response. + * + * @return true if a new request was generated. This is + * used for internal subrequests only + * @exception IOException if any handler throws an IOException. + * @exception ModuleException if any module encounters an exception. 
+ */ + synchronized boolean handleResponse() + throws IOException, ModuleException + { + if (initialized) + { + return false; + } + + /* + * first get the response if necessary + */ + if (out_stream != null) + { + response = out_stream.getResponse(); + response.http_resp = this; + out_stream = null; + } + + /* + * go through modules and handle them + */ + doModules : + while (true) + { + + Phase1 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + try + { + modules[idx].responsePhase1Handler(response, request); + } + catch (RetryException re) + { + if (re.restart) + { + continue doModules; + } + else + { + throw re; + } + } + } + + Phase2 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + int sts = modules[idx].responsePhase2Handler(response, request); + switch (sts) + { + case RSP_CONTINUE: + // continue processing + break; + case RSP_RESTART: + // restart response processing + idx = -1; + continue doModules; + case RSP_SHORTCIRC: + // stop processing and return + break doModules; + case RSP_REQUEST: + // go to phase 1 + case RSP_NEWCON_REQ: + // process the request using a new con + response.getInputStream().close(); + if (handle_trailers) + { + invokeTrailerHandlers(true); + } + if (request.internal_subrequest) + { + return true; + } + request.getConnection(). + handleRequest(request, this, response, true); + if (initialized) + { + break doModules; + } + + idx = -1; + continue doModules; + case RSP_SEND: + // send the request immediately + case RSP_NEWCON_SND: + // send the request using a new con + response.getInputStream().close(); + if (handle_trailers) + { + invokeTrailerHandlers(true); + } + if (request.internal_subrequest) + { + return true; + } + request.getConnection(). 
+ handleRequest(request, this, response, false); + idx = -1; + continue doModules; + default: + // not valid + throw new Error("HTTPClient Internal Error: invalid status" + + " " + sts + " returned by module " + + modules[idx].getClass().getName()); + } + } + + Phase3 : + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + modules[idx].responsePhase3Handler(response, request); + } + + break doModules; + } + + /* + * force a read on the response in case none of the modules did + */ + response.getStatusCode(); + + /* + * all done, so copy data + */ + if (!request.internal_subrequest) + { + init(response); + } + + if (handle_trailers) + { + invokeTrailerHandlers(false); + } + + return false; + } + + + /** + * Copies the relevant fields from Response and marks this as initialized. + * + * @param resp the Response class to copy from + */ + void init(Response resp) + { + if (initialized) + { + return; + } + + this.StatusCode = resp.StatusCode; + this.ReasonLine = resp.ReasonLine; + this.Version = resp.Version; + this.EffectiveURI = resp.EffectiveURI; + this.ContentLength = resp.ContentLength; + this.Headers = resp.Headers; + this.inp_stream = resp.inp_stream; + this.Data = resp.Data; + this.retry = resp.retry; + initialized = true; + } + + + private boolean handle_trailers = false; + private boolean trailers_handled = false; + + + /** + * This is invoked by the RespInputStream when it is close()'d. It just + * invokes the trailer handler in each module. + * + * @param force invoke the handlers even if not initialized + * yet? 
+ * @exception IOException if thrown by any module + * @exception ModuleException if thrown by any module + */ + void invokeTrailerHandlers(boolean force) + throws IOException, ModuleException + { + if (trailers_handled) + { + return; + } + + if (!force && !initialized) + { + handle_trailers = true; + return; + } + + for (int idx = 0; idx < modules.length && !aborted; idx++) + { + modules[idx].trailerHandler(response, request); + } + + trailers_handled = true; + } + + + /** + * Mark this request as having been aborted. It's invoked by + * HTTPConnection.stop(). + */ + void markAborted() + { + aborted = true; + } + + + /** + * Gets any trailers from the response if we haven't already done so. + * + * @exception IOException Description of the Exception + * @exception ModuleException Description of the Exception + */ + private synchronized void getTrailers() + throws IOException, ModuleException + { + if (got_trailers) + { + return; + } + if (!initialized) + { + handleResponse(); + } + + response.getTrailer("Any"); + Trailers = response.Trailers; + got_trailers = true; + + invokeTrailerHandlers(false); + } + + + /** + * Reads the response data received. Does not return until either + * Content-Length bytes have been read or EOF is reached. + * + * @param inp Description of the Parameter + * @exception IOException if any read on the input stream fails + * @inp the input stream from which to read the data + */ + private void readResponseData(InputStream inp, int max) + throws IOException, ModuleException + { + boolean readUnlimited = (max == -1); + + if (ContentLength == 0) + { + return; + } + + if (Data == null) + { + Data = new byte[0]; + } + + // read response data + + int off = Data.length; + + try + { + // check Content-length header in case CE-Module removed it + if (getHeader("Content-Length") != null) + { + int rcvd = 0; + int total = max > 1 ? 
Math.min(ContentLength, max) : ContentLength; + //System.out.println("Reading with max file size: " + total); + Data = new byte[total]; + do + { + off += rcvd; + rcvd = inp.read(Data, off, total - off); + } while (rcvd != -1 && off + rcvd < total); + // if max < ContentLength (&& max > -1): lose the rest + /*if(total < ContentLength) + { + inp.skip(ContentLength - total); + }*/ + /* + * Don't do this! + * If we do, then getData() won't work after a getInputStream() + * because we'll never get all the expected data. Instead, let + * the underlying RespInputStream throw the EOF. + * if (rcvd == -1) // premature EOF + * { + * throw new EOFException("Encountered premature EOF while " + + * "reading headers: received " + off + + * " bytes instead of the expected " + + * ContentLength + " bytes"); + * } + */ + } + else + { + //System.out.println("Reading with unknown file size"); + java.util.LinkedList blocks = new java.util.LinkedList(); + //System.out.println("new LinkedList()"); + int total = 0; + int secondBlockSize = 10*2000; + byte[] secondBlock = new byte[secondBlockSize]; + //System.out.println("new byte[" + secondBlockSize + "]"); + int offInSecondBlock = 0; + int rcvd = 0; + do + { + int bytesToRead = secondBlockSize - offInSecondBlock; + if(bytesToRead < 1) + { + // System.out.println("adding block to list..."); + blocks.addLast(secondBlock); + secondBlock = new byte[secondBlockSize]; + //System.out.println("new byte[" + secondBlockSize + "]"); + offInSecondBlock = 0; + bytesToRead = secondBlockSize; + } + rcvd = inp.read(secondBlock, offInSecondBlock, bytesToRead); + //System.out.println("read " + rcvd); + // rcvd is usually << secondBlockSize + if(rcvd != -1) + { + offInSecondBlock += rcvd; + total += rcvd; + max -= rcvd; + } + } while(rcvd != -1 && (readUnlimited || max > 0)); + + // now we have: 1 x the last block as "secondBlock" + 0...n x blocks in the list + Data = new byte[total]; // I can't see how to do it without this second buffer + 
//System.out.println("new byte[" + total + "]"); + + int offset = 0; + while(blocks.size() > 0) + { + byte[] block = (byte[]) blocks.removeFirst(); + System.arraycopy(block, 0, Data, offset, block.length); + //System.out.println("System.arraycopy(" + block.length + ")"); + offset += block.length; + } + if(offInSecondBlock > 0) + { + //System.out.println("System.arraycopy(" + offInSecondBlock + ")"); + System.arraycopy(secondBlock, 0, Data, offset, offInSecondBlock); + } + + + } + } + catch (IOException ioe) + { + Data = Util.resizeArray(Data, off); + throw ioe; + } + finally + { + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + + + + /* + * Reads the response data received. Does not return until either + * Content-Length bytes have been read or EOF is reached. + * + * @param inp Description of the Parameter + * @exception IOException if any read on the input stream fails + * @exception ModuleException Description of the Exception + * @inp the input stream from which to read the data + * + private void readResponseData(InputStream inp) + throws IOException, ModuleException + { + if (ContentLength == 0) + { + return; + } + + if (Data == null) + { + Data = new byte[0]; + } + + // read response data + + int off = Data.length; + + LinkedList blocks = new java.util.LinkedList(); + + // check Content-length header in case CE-Module removed it + if (getHeader("Content-Length") != null) + { + try + { + int rcvd = 0; + Data = new byte[ContentLength]; + + do + { + off += rcvd; + rcvd = inp.read(Data, off, ContentLength - off); + } while (rcvd != -1 && off + rcvd < ContentLength); + /* + * Don't do this! + * If we do, then getData() won't work after a getInputStream() + * because we'll never get all the expected data. Instead, let + * the underlying RespInputStream throw the EOF. 
+ * if (rcvd == -1) // premature EOF + * { + * throw new EOFException("Encountered premature EOF while " + + * "reading headers: received " + off + + * " bytes instead of the expected " + + * ContentLength + " bytes"); + * } + * + } + catch (IOException ioe) + { + Data = Util.resizeArray(Data, off); + throw ioe; + } + finally + { + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + else + { + int total = 0; + int rcvd = 0; + try + { + ByteBlock actBlock = new ByteBlock(this.readIncrement); + // TODO: Blocks are very small (500-2000 Bytes) -> combine them + while ((actBlock.length = inp.read(actBlock.block, 0, this.readIncrement)) != -1) + { + total += actBlock.length; + // System.out.println(this.getOriginalURI().toExternalForm() + ": adding block with length " + actBlock.length + " complete: " + total); + blocks.add(actBlock); + actBlock = new ByteBlock(this.readIncrement); + //off += rcvd; + // Data = Util.resizeArray(Data, off + this.readIncrement); + } + } + catch (IOException ioe) + { + throw ioe; + } + finally + { + Iterator it = blocks.iterator(); + Data = Util.resizeArray(Data, total); + off = 0; + while (it.hasNext()) + { + ByteBlock act = (ByteBlock) it.next(); + //System.out.println(this.getOriginalURI().toExternalForm() + ": copied " + act.length + " -> off: " + off + ", left: " + total); + System.arraycopy(act.block, 0, Data, off, act.length); + off += act.length; + total -= act.length; + } + try + { + inp.close(); + } + catch (IOException ioe) + { + } + } + } + } +*/ + + /** + * Gets the timeout attribute of the HTTPResponse object + * + * @return The timeout value + */ + int getTimeout() + { + return timeout; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java new file mode 100644 index 00000000000..b8b4d7e3a36 --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Constants.java @@ -0,0 +1,38 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

/**
 * Holds the global constants used by the classes of the fetcher package.
 */
public class Constants
{

    /**
     * user agent string a fetcher task sends to the server it talks to
     */
    public static final String USER_AGENT = "Mozilla/4.06 [en] (WinNT; I)";

    /**
     * identification of the crawler itself
     */
    public static final String CRAWLER_AGENT = "Fetcher/0.95";

    /**
     * size in bytes of the temporary buffer web documents are read into
     */
    public static final int FETCHERTASK_READSIZE = 4 * 1024;

    /**
     * upper bound in bytes for the amount of data read per document
     */
    public static final int FETCHERTASK_MAXFILESIZE = 2 * 1000 * 1000;

}

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.fetcher; + +import java.util.*; +import java.net.*; + +/** + * filter class; gets IP Adresses from host names and forwards them to + * the other parts of the application + * since URLs cache their IP addresses themselves, and HTTP 1.1 needs the + * host names to be sent to the server, this class is not used anymore + */ +public class DNSResolver implements MessageListener +{ + + HashMap ipCache = new HashMap(); + + + public DNSResolver() + { + } + + public void notifyAddedToMessageHandler(MessageHandler m) + { + this.messageHandler = m; + } + + MessageHandler messageHandler; + + public Message handleRequest(Message message) + { + if(message instanceof URLMessage) + { + URL url = ((URLMessage)message).getUrl(); + String host = url.getHost(); + InetAddress ip; + /*InetAddress ip = (InetAddress)ipCache.get(host); + + if(ip == null) + { + */ + + try + { + ip = InetAddress.getByName(host); + /* + ipCache.put(host, ip); + //System.out.println("DNSResolver: new Cache Entry \"" + host + "\" = \"" + ip.getHostAddress() + "\"");*/ + } + catch(UnknownHostException e) + { + ip = null; + return null; + //System.out.println("DNSResolver: unknown host \"" + host + "\""); + } + /*} + else + { + //System.out.println("DNSResolver: Cache hit: " + ip.getHostAddress()); + }*/ + //((URLMessage)message).setIpAddress(ip); + } + return message; + } +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java new file mode 100644 index 00000000000..e1ca56c2355 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java @@ -0,0 +1,224 @@ +/* + * LARM - LANLab Retrieval Machine + * + * $history: $ + * + */ + +package de.lanlab.larm.fetcher; + +import de.lanlab.larm.threads.ThreadPool; +import de.lanlab.larm.threads.ThreadPoolObserver; +import 
de.lanlab.larm.threads.InterruptableTask; +import de.lanlab.larm.storage.*; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.LinkedList; + +import de.lanlab.larm.fetcher.FetcherTask; + +/** + * filter class; the Fetcher is the main class which keeps the ThreadPool that + * gets the documents. It should be placed at the very end of the MessageQueue, + * so that all filtering can be made beforehand. + * + * @author Clemens Marschner + * + */ + +public class Fetcher implements MessageListener +{ + /** + * holds the threads + */ + ThreadPool fetcherPool; + + /** + * total number of docs read + */ + int docsRead = 0; + + /** + * the storage where the docs are saved to + */ + DocumentStorage storage; + + /** + * the host manager keeps track of host information + */ + HostManager hostManager; + + + /** + * initializes the fetcher with the given number of threads in the thread + * pool and a document storage. + * + * @param maxThreads the number of threads in the ThreadPool + * @param storage the storage where all documents are stored + * @param hostManager the host manager + */ + public Fetcher(int maxThreads, DocumentStorage storage, HostManager hostManager) + { + this.storage = storage; + FetcherTask.setStorage(storage); + fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager)); + fetcherPool.setQueue(new FetcherTaskQueue()); + docsRead = 0; + this.hostManager = hostManager; + } + + + /** + * initializes the pool with default values (5 threads, NullStorage) + */ + public void init() + { + fetcherPool.init(); + } + + + /** + * initializes the pool with a NullStorage and the given number of threads + * + * @param maxThreads the number of threads in the thread pool + */ + public void init(int maxThreads) + { + fetcherPool.init(); + docsRead = 0; + } + + + /** + * this function will be called by the message handler each time a URL + * passes all filters and gets to the fetcher. 
From here, it will be + * distributed to the FetcherPool, a thread pool which carries out the task, + * that is to fetch the document from the web. + * + * @param message the message, which should actually be a URLMessage + * @return Description of the Return Value + */ + public Message handleRequest(Message message) + { + URLMessage urlMessage = (URLMessage) message; + + fetcherPool.doTask(new FetcherTask(urlMessage), ""); + docsRead++; + + // eat the message + return null; + } + + + /** + * called by the message handler when this object is added to it + * + * @param handler the message handler + */ + public void notifyAddedToMessageHandler(MessageHandler handler) + { + this.messageHandler = handler; + FetcherTask.setMessageHandler(handler); + } + + + MessageHandler messageHandler; + + + /** + * the thread pool observer will be called each time a thread changes its + * state, i.e. from IDLE to RUNNING, and each time the number of thread + * queue entries change. + * this just wraps the thread pool method + * + * @param t the class that implements the ThreadPoolObserver interface + */ + public void addThreadPoolObserver(ThreadPoolObserver t) + { + fetcherPool.addThreadPoolObserver(t); + } + + + /** + * returns the number of tasks queued. Should return 0 if there are any idle + * threads. this method just wraps the ThreadPool method + * + * @return The queueSize value + */ + public int getQueueSize() + { + return fetcherPool.getQueueSize(); + } + + + /** + * get the total number of threads. + * this method just wraps the ThreadPool method + * + * @return The workingThreadsCount value + */ + public int getWorkingThreadsCount() + { + return fetcherPool.getIdleThreadsCount() + fetcherPool.getBusyThreadsCount(); + } + + + /** + * get the number of threads that are currently idle. 
+ * this method just wraps the ThreadPool method + * + * @return The idleThreadsCount value + */ + public int getIdleThreadsCount() + { + return fetcherPool.getIdleThreadsCount(); + } + + + /** + * get the number of threads that are currently busy. + * this method just wraps the ThreadPool method + * + * @return The busyThreadsCount value + */ + public int getBusyThreadsCount() + { + return fetcherPool.getBusyThreadsCount(); + } + + + /** + * Gets the threadPool attribute of the Fetcher object + * beware: the original object is returned + * + * @TODO remove this / make it private if possible + * @return The threadPool value + */ + public ThreadPool getThreadPool() + { + return fetcherPool; + } + + + /** + * Gets the total number of docs read + * + * @return number of docs read + */ + public int getDocsRead() + { + return docsRead; + } + + + /** + * returns the (original) task queue + * @TODO remove this if possible + * @return The taskQueue value + */ + public FetcherTaskQueue getTaskQueue() + { + return (FetcherTaskQueue) this.fetcherPool.getTaskQueue(); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java new file mode 100644 index 00000000000..43b19768245 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherGUIController.java @@ -0,0 +1,150 @@ +package de.lanlab.larm.fetcher; + +import java.awt.event.ActionListener; +import java.awt.event.ActionEvent; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.*; +import java.awt.event.*; +import de.lanlab.larm.gui.*; +import de.lanlab.larm.threads.*; + +/** + * this was used to connect the GUI to the fetcher + * @TODO put this into the GUI package, probably? 
+ */
+public class FetcherGUIController implements ActionListener
+{
+    /** the crawler whose filters, fetcher and monitor are observed */
+    FetcherMain fetcherMain;
+
+    /** the window that displays the counters */
+    FetcherSummaryFrame fetcherFrame;
+
+
+    /**
+     * Connects the summary frame with the crawler: copies the current scope
+     * expression and the default start URL into the frame, registers
+     * observers that push counter updates into the frame, terminates the JVM
+     * when the frame is closed, and registers this controller as listener of
+     * the start button.
+     *
+     * @param fetcherMainPrg   the crawler to observe and control
+     * @param fetcherFrameWin  the frame that displays the statistics
+     * @param defaultStartURL  the start URL initially shown in the frame
+     */
+    public FetcherGUIController(FetcherMain fetcherMainPrg, FetcherSummaryFrame fetcherFrameWin, String defaultStartURL)
+    {
+        this.fetcherMain = fetcherMainPrg;
+        this.fetcherFrame = fetcherFrameWin;
+
+        fetcherFrame.setRestrictTo(fetcherMain.urlScopeFilter.getRexString());
+        fetcherFrame.setStartURL(defaultStartURL);
+
+        fetcherMain.fetcher.addThreadPoolObserver(
+            new ThreadPoolObserver()
+            {
+                public void threadUpdate(int threadNr, String action, String info)
+                {
+                    // only the counters are displayed; no per-thread status
+                    // string is assembled here because nothing consumes it
+                    fetcherFrame.setIdleThreadsCount(fetcherMain.fetcher.getIdleThreadsCount());
+                    fetcherFrame.setBusyThreadsCount(fetcherMain.fetcher.getBusyThreadsCount());
+                    fetcherFrame.setWorkingThreadsCount(fetcherMain.fetcher.getWorkingThreadsCount());
+                }
+
+                public void queueUpdate(String info, String action)
+                {
+                    fetcherFrame.setRequestQueueCount(fetcherMain.fetcher.getQueueSize());
+                }
+            }
+        );
+
+        fetcherMain.monitor.addObserver(new Observer()
+            {
+                public void update(Observable o, Object arg)
+                {
+                    // the ThreadMonitor has been updated
+                    //fetcherFrame.setStalledThreads(fetcherMain.monitor.getStalledThreadCount(10, 500.0));
+                    //fetcherFrame.setBytesPerSecond(fetcherMain.monitor.getAverageReadCount(5));
+                    // fetcherFrame.setDocsPerSecond(fetcherMain.monitor.getDocsPerSecond(5));
+                    // we take the opportunity to display the current memory usage
+                    fetcherFrame.setFreeMem(Runtime.getRuntime().freeMemory());
+                    fetcherFrame.setTotalMem(Runtime.getRuntime().totalMemory());
+
+                }
+
+            });
+
+        /* fetcherMain.reFilter.addObserver(
+            new Observer()
+            {
+                public void update(Observable o, Object arg)
+                {
+                    fetcherFrame.setRobotsTxtCount(fetcherMain.reFilter.getExcludingHostsCount());
+                }
+            }
+        );*/
+
+        fetcherMain.messageHandler.addMessageQueueObserver(new Observer()
+            {
+                public void update(Observable o, Object arg)
+                {
+                    // a message has been added or deleted
+
+                    fetcherFrame.setURLsQueued(fetcherMain.messageHandler.getQueued());
+                }
+
+            }
+        );
+
+        // this observer will be called if a filter has decided to throw a
+        // message away.
+        fetcherMain.messageHandler.addMessageProcessorObserver(new Observer()
+            {
+                public void update(Observable o, Object arg)
+                {
+                    if(arg == fetcherMain.urlScopeFilter)
+                    {
+                        fetcherFrame.setScopeFiltered(fetcherMain.urlScopeFilter.getFiltered());
+                    }
+                    else if(arg == fetcherMain.urlVisitedFilter)
+                    {
+                        fetcherFrame.setVisitedFiltered(fetcherMain.urlVisitedFilter.getFiltered());
+                    }
+                    else if(arg == fetcherMain.reFilter)
+                    {
+                        fetcherFrame.setURLsCaughtCount(fetcherMain.reFilter.getFiltered());
+                    }
+                    else // it's the fetcher
+                    {
+                        fetcherFrame.setDocsRead(fetcherMain.fetcher.getDocsRead());
+                    }
+                }
+            }
+        );
+
+        fetcherFrame.addWindowListener(
+            new WindowAdapter()
+            {
+                /**
+                 * delivered when the user asks to close the window;
+                 * windowClosed alone is only delivered after dispose() has
+                 * been called on the window, so without this handler the
+                 * program would keep running unless the frame disposes itself
+                 */
+                public void windowClosing(WindowEvent e)
+                {
+                    shutdown();
+                }
+
+                public void windowClosed(WindowEvent e)
+                {
+                    shutdown();
+                }
+
+                /** announces the shutdown and terminates the JVM */
+                private void shutdown()
+                {
+                    System.out.println("window Closed");
+                    System.exit(0);
+                }
+            }
+        );
+
+        fetcherFrame.addStartButtonListener((ActionListener)this);
+    }
+
+    /**
+     * will be called when the start button is pressed: applies the scope
+     * expression entered in the frame, starts the monitor and puts the start
+     * URL into the message pipeline. Any failure is reported on stdout and
+     * otherwise ignored so that the GUI stays usable.
+     *
+     * @param e  the button event (not evaluated)
+     */
+    public void actionPerformed(ActionEvent e)
+    {
+        System.out.println("Füge Start-URL ein");
+        try
+        {
+            // urlVisitedFilter.printAllURLs();
+            // urlVisitedFilter.clearHashtable();
+            fetcherMain.setRexString(fetcherFrame.getRestrictTo());
+            fetcherMain.startMonitor();
+            fetcherMain.putURL(new URL(fetcherFrame.getStartURL()), false);
+        }
+        catch(Exception ex)
+        {
+            System.out.println("actionPerformed: Exception: " + ex.getMessage());
+        }
+    }
+
+}
+
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
new file mode 100644
index 00000000000..2da43b08f68
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
@@ -0,0 +1,362 @@
+/*
+ * LARM - LANLab Retrieval Machine
+ *
+ * 
$history: $
+ *
+ */
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.threads.ThreadPoolObserver;
+import de.lanlab.larm.threads.ThreadPool;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+import de.lanlab.larm.gui.*;
+import de.lanlab.larm.util.*;
+import de.lanlab.larm.storage.*;
+import javax.swing.UIManager;
+import HTTPClient.*;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+
+/**
+ * ENTRY POINT: this class contains the main()-method of the application, does
+ * all the initializing and optionally connects the fetcher with the GUI.
+ *
+ * @author    Clemens Marschner
+ * @created   December 16, 2000
+ */
+public class FetcherMain
+{
+
+    /**
+     * the main message pipeline
+     */
+    protected MessageHandler messageHandler;
+
+    /**
+     * this filter records all incoming URLs and filters everything it already
+     * knows
+     */
+    protected URLVisitedFilter urlVisitedFilter;
+
+    /**
+     * the scope filter filters URLs that fall out of the scope given by the
+     * regular expression
+     */
+    protected URLScopeFilter urlScopeFilter;
+
+    /*
+     * The DNS resolver was supposed to hold the host addresses for all hosts
+     * this is done by URL itself today
+     *
+     * protected DNSResolver dnsResolver;
+     */
+
+    /**
+     * the robot exclusion filter looks if a robots.txt is present on a host
+     * before it is first accessed
+     */
+    protected RobotExclusionFilter reFilter;
+
+    /**
+     * the host manager keeps track of all hosts and is used by the filters.
+     */
+    protected HostManager hostManager;
+
+    /**
+     * this rather flaky filter just filters out some URLs, i.e. different views
+     * generated by the Apache DirIndex module. Has to be made
+     * configurable in near future
+     */
+    protected KnownPathsFilter knownPathsFilter;
+
+    /**
+     * this is the main document fetcher. It contains a thread pool that fetches the
+     * documents and stores them
+     */
+    protected Fetcher fetcher;
+
+
+    /**
+     * the thread monitor once was only a monitoring tool, but now has become a
+     * vital part of the system that computes statistics and
+     * flushes the log file buffers
+     */
+
+    protected ThreadMonitor monitor;
+
+    /**
+     * the storage is a central class that puts all fetched documents somewhere.
+     * Several different implementations exist.
+     */
+    protected DocumentStorage storage;
+
+    /**
+     * the URL length filter filters URLs that are too long, i.e. because of errors
+     * in the implementation of dynamic web sites
+     */
+    protected URLLengthFilter urlLengthFilter;
+
+    /**
+     * initializes all classes and registers anonymous adapter classes as
+     * listeners for fetcher events.
+     *
+     * @param nrThreads  number of fetcher threads to be created
+     */
+    public FetcherMain(int nrThreads)
+    {
+        // to make things clear, this method is commented a bit better than
+        // the rest of the program...
+
+        // this is the main message queue. handlers are registered with
+        // the queue, and whenever a message is put in it, they are passed to the
+        // filters in a "chain of responsibility" manner. Every listener can decide
+        // to throw the message away
+        messageHandler = new MessageHandler();
+
+        // the storage is the class which saves a WebDocument somewhere, no
+        // matter how it does it, whether it's in a file, in a database or
+        // whatever
+
+
+        // example for the (very slow) SQL Server storage:
+        // this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
+
+        // the LogStorage used here does extensive logging. It logs all links and
+        // document information.
+        // it also saves all documents to page files. Probably this single storage
+        // could also be replaced by a pipeline; or even incorporated into the
+        // existing message pipeline
+        SimpleLogger log = new SimpleLogger("store", false);
+        this.storage = new LogStorage(log, true, "logs/pagefile");
+
+        // a third example would be the NullStorage, which converts the documents into
+        // heat, which evaporates above the processor
+        // NullStorage();
+
+        // create the filters and add them to the message queue
+        urlScopeFilter = new URLScopeFilter();
+
+        urlVisitedFilter = new URLVisitedFilter(100000, log);
+
+        // dnsResolver = new DNSResolver();
+        hostManager = new HostManager(1000);
+
+        reFilter = new RobotExclusionFilter(hostManager);
+
+        fetcher = new Fetcher(nrThreads, storage, hostManager);
+
+        knownPathsFilter = new KnownPathsFilter();
+
+        urlLengthFilter = new URLLengthFilter(255);
+
+        // prevent message box popups
+        HTTPConnection.setDefaultAllowUserInteraction(false);
+
+        // prevent GZipped files from being decoded
+        HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
+
+        // initialize the threads
+        fetcher.init();
+
+        // the thread monitor watches the thread pool.
+
+        monitor = new ThreadMonitor(urlLengthFilter,
+                urlVisitedFilter,
+                urlScopeFilter,
+                /*dnsResolver,*/
+                reFilter,
+                messageHandler,
+                fetcher.getThreadPool(),
+                hostManager,
+                5000        // wake up every 5 seconds
+                );
+
+
+        // add all filters to the handler.
+        messageHandler.addListener(urlLengthFilter);
+        messageHandler.addListener(urlScopeFilter);
+        messageHandler.addListener(reFilter);
+        messageHandler.addListener(urlVisitedFilter);
+        messageHandler.addListener(knownPathsFilter);
+        messageHandler.addListener(fetcher);
+
+        /* uncomment this to enable HTTPClient logging
+        try
+        {
+            HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
+            HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
+        }
+        catch (Exception e)
+        {
+            e.printStackTrace();
+        }
+        */
+    }
+
+
+    /**
+     * Sets the regular expression of the URL scope filter
+     *
+     * @param restrictTo  the expression URLs have to match to be followed
+     * @exception MalformedPatternException  passed on from the scope filter
+     *      if the expression cannot be compiled
+     */
+    public void setRexString(String restrictTo) throws MalformedPatternException
+    {
+        urlScopeFilter.setRexString(restrictTo);
+    }
+
+
+    /**
+     * Wraps the URL into a URLMessage without referer and puts it into the
+     * message pipeline. Exceptions raised by the pipeline are printed to
+     * stdout and not passed on.
+     *
+     * @param url      the URL to be crawled
+     * @param isFrame  passed through to the URLMessage
+     * @exception java.net.MalformedURLException  declared for callers; not
+     *      thrown by the code visible here
+     */
+    public void putURL(URL url, boolean isFrame)
+        throws java.net.MalformedURLException
+    {
+        try
+        {
+            messageHandler.putMessage(new URLMessage(url, null, isFrame));
+        }
+        catch (Exception e)
+        {
+            System.out.println("Exception: " + e.getMessage());
+            e.printStackTrace();
+        }
+        //System.out.println("URLs geschrieben");
+    }
+
+
+    /**
+     * Starts the thread monitor
+     */
+    public void startMonitor()
+    {
+        monitor.start();
+    }
+
+
+
+    /*
+     * the GUI is not working at this time. It was used in the very beginning, but
+     * synchronous updates turned out to slow down the program a lot, even if the
+     * GUI would be turned off. Thus, a lot
+     * of Observer messages where removed later. Nontheless, it's quite cool to see
+     * it working...
+     *
+     * @param f         Description of Parameter
+     * @param startURL  Description of Parameter
+     */
+
+    /*
+    public void initGui(FetcherMain f, String startURL)
+    {
+        // if we're on a windows platform, make it look a bit more convenient
+        try
+        {
+            UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
+        }
+        catch (Exception e)
+        {
+            // never mind then...
+        }
+        System.out.println("Init FetcherFrame");
+
+        FetcherSummaryFrame fetcherFrame;
+        fetcherFrame = new FetcherSummaryFrame();
+        fetcherFrame.setSize(640, 450);
+        fetcherFrame.setVisible(true);
+        FetcherGUIController guiController = new FetcherGUIController(f, fetcherFrame, startURL);
+    }
+    */
+
+
+    /**
+     * Returns the value that belongs to an option, or terminates the program
+     * with an error message if the option was the last argument.
+     *
+     * @param args        the command line arguments
+     * @param valueIndex  index at which the value is expected; the option
+     *      itself is at valueIndex - 1
+     * @return            the option value
+     */
+    private static String optionValue(String[] args, int valueIndex)
+    {
+        if (valueIndex >= args.length)
+        {
+            System.out.println("Missing value for option " + args[valueIndex - 1] + "; use -? to get syntax");
+            // this case used to end in an uncaught ArrayIndexOutOfBoundsException,
+            // so a non-zero status is kept for it
+            System.exit(1);
+        }
+        return args[valueIndex];
+    }
+
+
+    /**
+     * The main program. Parses the command line, and, unless only the usage
+     * text is requested, creates the crawler, sets the scope expression and
+     * puts the start URL into the pipeline.
+     *
+     * @param args  The command line arguments
+     */
+    public static void main(String[] args)
+    {
+        int nrThreads = 10;
+
+        String startURL = "";
+        String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
+        boolean gui = false;
+        boolean showInfo = false;
+        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
+        for (int i = 0; i < args.length; i++)
+        {
+            if (args[i].equals("-start"))
+            {
+                startURL = optionValue(args, ++i);
+                System.out.println("Start-URL set to: " + startURL);
+            }
+            else if (args[i].equals("-restrictto"))
+            {
+                restrictTo = optionValue(args, ++i);
+                System.out.println("Restricting URLs to " + restrictTo);
+            }
+            else if (args[i].equals("-threads"))
+            {
+                String value = optionValue(args, ++i);
+                try
+                {
+                    nrThreads = Integer.parseInt(value);
+                }
+                catch (NumberFormatException e)
+                {
+                    System.out.println("Invalid number of threads: " + value + "; use -? to get syntax");
+                    System.exit(1);
+                }
+                System.out.println("Threads set to " + nrThreads);
+            }
+            else if (args[i].equals("-gui"))
+            {
+                gui = true;
+            }
+            else if (args[i].equals("-?"))
+            {
+                showInfo = true;
+            }
+            else
+            {
+                System.out.println("Unknown option: " + args[i] + "; use -? to get syntax");
+                System.exit(0);
+            }
+        }
+
+        //URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
+        // replaced by HTTPClient
+
+        // check the arguments before the crawler is constructed: the
+        // constructor opens the log storage and initializes the fetcher
+        // threads, which is pointless if only the usage text is printed
+        if (showInfo || (startURL.equals("") && gui == false))
+        {
+            System.out.println("Usage: FetcherMain -start URL -restrictto REGEX [-threads N]"); // [-gui]
+            System.exit(0);
+        }
+
+        FetcherMain f = new FetcherMain(nrThreads);
+        try
+        {
+            f.setRexString(restrictTo);
+
+            if (gui)
+            {
+                // f.initGui(f, startURL);
+            }
+            else
+            {
+                try
+                {
+                    f.startMonitor();
+                    f.putURL(new URL(startURL), false);
+                }
+                catch (MalformedURLException e)
+                {
+                    System.out.println("Malformed URL");
+
+                }
+            }
+        }
+        catch (MalformedPatternException e)
+        {
+            System.out.println("Wrong RegEx syntax. Must be a valid PERL RE");
+        }
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
new file mode 100644
index 00000000000..9f6edf904e4
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java
@@ -0,0 +1,617 @@
+/*
+ * LARM - LANLab Retrieval Machine
+ *
+ * $history: $
+ *
+ */
+package de.lanlab.larm.fetcher;
+
+import java.net.URL;
+import de.lanlab.larm.threads.*;
+import de.lanlab.larm.util.InputStreamObserver;
+import de.lanlab.larm.util.ObservableInputStream;
+import de.lanlab.larm.util.WebDocument;
+import de.lanlab.larm.util.SimpleCharArrayReader;
+import de.lanlab.larm.storage.DocumentStorage;
+import de.lanlab.larm.util.State;
+import de.lanlab.larm.util.SimpleLogger;
+import de.lanlab.larm.net.HttpTimeoutFactory;
+import HTTPClient.*;
+import java.net.*;
+import java.io.*;
+import java.util.*;
+import java.text.*;
+import de.lanlab.larm.parser.Tokenizer;
+import de.lanlab.larm.parser.LinkHandler;
+
+/**
+ * this class gets the documents from the web. It connects to the server given
+ * by the IP address in the URLMessage, gets the document, and forwards it to
+ * the storage. 
If it's an HTML document, it will be parsed and all links will
+ * be put into the message handler again.
+ *
+ * @author    Clemens Marschner
+ *
+ */
+public class FetcherTask
+         implements InterruptableTask, LinkHandler, Serializable
+{
+
+    protected volatile boolean isInterrupted = false;
+
+    /**
+     * each task has its own number. the class variable counts up if an instance
+     * of a fetcher task is created
+     */
+    static volatile int taskIdentity = 0;
+
+    /**
+     * the number of this object
+     */
+    int taskNr;
+
+    /**
+     * the BASE Href (defaults to contextUrl, may be changed with a BASE tag)
+     * only valid within a doTask call
+     */
+    private volatile URL base;
+
+    /**
+     * the URL of the document
+     * only valid within a doTask call
+     */
+    private volatile URL contextUrl;
+
+    /**
+     * the message handler the URL message comes from; same for all tasks
+     */
+    protected static volatile MessageHandler messageHandler;
+
+    /**
+     * actual number of bytes read
+     * only valid within a doTask call
+     */
+    private volatile long bytesRead = 0;
+
+    /**
+     * the storage this task will put the document to
+     */
+    private static volatile DocumentStorage storage;
+
+    /**
+     * task state IDs. comparisons will be done by their references, so always
+     * use the IDs
+     */
+    public final static String FT_IDLE = "idle";
+    public final static String FT_STARTED = "started";
+    public final static String FT_OPENCONNECTION = "opening connection";
+    public final static String FT_CONNECTING = "connecting";
+    public final static String FT_GETTING = "getting";
+    public final static String FT_READING = "reading";
+    public final static String FT_SCANNING = "scanning";
+    public final static String FT_STORING = "storing";
+    public final static String FT_READY = "ready";
+    public final static String FT_CLOSING = "closing";
+    public final static String FT_EXCEPTION = "exception";
+    public final static String FT_INTERRUPTED = "interrupted";
+
+    private volatile State taskState = new State(FT_IDLE);
+
+    /**
+     * the URLs found will be stored and only added to the message handler in the very
+     * end, to avoid too many synchronizations
+     */
+    private volatile LinkedList foundUrls;
+
+    /**
+     * the URL to be get
+     */
+    protected volatile URLMessage actURLMessage;
+
+    /**
+     * the document title, if present
+     */
+    private volatile String title;
+
+    /**
+     * headers for HTTPClient
+     */
+    private static volatile NVPair headers[] = new NVPair[1];
+
+    static
+    {
+        headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
+
+    }
+
+
+    /**
+     * Gets a copy of the current taskState
+     *
+     * @return   The taskState value
+     */
+    public State getTaskState()
+    {
+        return taskState.cloneState();
+    }
+
+
+    /**
+     * Hands out the next task number. Synchronized because incrementing a
+     * volatile int is not atomic.
+     *
+     * @return   a number not handed out before by this class
+     */
+    private static synchronized int nextTaskNr()
+    {
+        return ++taskIdentity;
+    }
+
+
+    /**
+     * Constructor for the FetcherTask object. Assigns the task number.
+     *
+     * @param urlMessage  the message that contains the URL to be fetched
+     */
+    public FetcherTask(URLMessage urlMessage)
+    {
+        actURLMessage = urlMessage;
+        // the number was documented but never assigned, so every task
+        // reported itself as task 0
+        taskNr = nextTaskNr();
+    }
+
+
+    /**
+     * Gets the URL message this task was created for
+     *
+     * @return   the URL message
+     */
+    public URLMessage getActURLMessage()
+    {
+        return this.actURLMessage;
+    }
+
+
+    /**
+     * Sets the document storage
+     *
+     * @param storage  The new storage
+     */
+    public static void setStorage(DocumentStorage storage)
+    {
+        FetcherTask.storage = storage;
+    }
+
+
+    /**
+     * Sets the messageHandler
+     *
+     * @param messageHandler  The new messageHandler
+     */
+    public static void setMessageHandler(MessageHandler messageHandler)
+    {
+        FetcherTask.messageHandler = messageHandler;
+    }
+
+
+    /**
+     * @return   the URL as a string
+     */
+    public String getInfo()
+    {
+        return actURLMessage.getURLString();
+    }
+
+
+    /**
+     * Gets the URL this task fetches
+     *
+     * @return   the URL of the URL message
+     */
+    public URL getURL()
+    {
+        return actURLMessage.getUrl();
+    }
+
+    SimpleLogger log;
+    SimpleLogger errorLog;
+    //private long startTime;
+
+
+    /**
+     * Decodes the bytes with the platform default charset, as the original
+     * single read() call did, but keeps reading until the input is used up
+     * (one read() call is not guaranteed to fill the buffer) and returns an
+     * array that is exactly as long as the number of decoded characters, so
+     * the parser does not see trailing NUL characters.
+     *
+     * @param bytes            the raw document
+     * @return                 the decoded characters
+     * @exception IOException  passed on from the reader
+     */
+    private static char[] toCharArray(byte[] bytes) throws IOException
+    {
+        char[] chars = new char[bytes.length];
+        Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes));
+        int filled = 0;
+        int n;
+        while (filled < chars.length
+                && (n = reader.read(chars, filled, chars.length - filled)) > 0)
+        {
+            filled += n;
+        }
+        if (filled < chars.length)
+        {
+            char[] trimmed = new char[filled];
+            System.arraycopy(chars, 0, trimmed, 0, filled);
+            chars = trimmed;
+        }
+        return chars;
+    }
+
+
+    /**
+     * this will be called by the fetcher thread and will do all the work:
+     * the document is requested, read, scanned for links if it is HTML, the
+     * links found are put into the message handler and the document is
+     * handed to the storage. Exceptions are logged and never passed on.
+     *
+     * @TODO probably split this up into different processing steps
+     * @param thread  the thread that runs this task; has to be a FetcherThread
+     */
+    public void run(ServerThread thread)
+    {
+
+        taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
+
+        log = thread.getLog();
+        HostManager hm = ((FetcherThread)thread).getHostManager();
+
+        errorLog = thread.getErrorLog();
+
+        // startTime = System.currentTimeMillis();
+        int threadNr = ((FetcherThread) thread).getThreadNumber();
+
+        log.log("start");
+        base = contextUrl = actURLMessage.getUrl();
+        String urlString = actURLMessage.getURLString();
+        String host = contextUrl.getHost();
+
+        HostInfo hi = hm.getHostInfo(host); // get and create
+
+        if(!hi.isHealthy())
+        {
+            // we make this check as late as possible to get the most current information
+            log.log("Bad Host: " + contextUrl + "; returning");
+            System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
+
+            taskState.setState(FT_READY, null);
+            return;
+        }
+
+        foundUrls = new java.util.LinkedList();
+
+        HTTPConnection conn = null;
+
+        title = "*untitled*";
+
+        bytesRead = 0;
+
+
+        try
+        {
+
+            URL ipURL = contextUrl;
+
+            taskState.setState(FT_OPENCONNECTION, urlString);
+
+            log.log("connecting to " + ipURL.getHost());
+            taskState.setState(FT_CONNECTING, ipURL);
+            conn = new HTTPConnection(host);
+
+            conn.setDefaultTimeout(75000);
+            // 75 s
+            conn.setDefaultAllowUserInteraction(false);
+
+            taskState.setState(FT_GETTING, ipURL);
+            log.log("getting");
+
+            HTTPResponse response = conn.Get(ipURL.getFile(), "", headers);
+            response.setReadIncrement(2720);
+            int statusCode = response.getStatusCode();
+            byte[] fullBuffer = null;
+            String contentType = "";
+            int contentLength = 0;
+
+            if (statusCode != 404 && statusCode != 403)
+            {
+                // read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
+                taskState.setState(FT_READING, ipURL);
+                contentType = response.getHeader("Content-Type");
+                if (contentType == null)
+                {
+                    // header not sent; the startsWith() check below would
+                    // otherwise throw and the document would not be stored
+                    contentType = "";
+                }
+                String length = response.getHeader("Content-Length");
+                if (length != null)
+                {
+                    try
+                    {
+                        contentLength = Integer.parseInt(length.trim());
+                    }
+                    catch (NumberFormatException e)
+                    {
+                        // a broken header must not abort the download; the
+                        // length is replaced by the real size below whenever
+                        // data has been read
+                        log.log("warning: invalid Content-Length '" + length + "'");
+                    }
+                }
+                log.log("reading");
+
+                fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
+                if (fullBuffer != null)
+                {
+                    contentLength = fullBuffer.length;
+                    this.bytesRead += contentLength;
+                }
+            }
+            //conn.stop();    // close connection. todo: Do some caching...
+
+
+            /*
+             *  conn.disconnect();
+             */
+            if (isInterrupted)
+            {
+                System.out.println("FetcherTask: interrupted while reading. File truncated");
+                log.log("interrupted while reading. File truncated");
+            }
+            else
+            {
+                if (fullBuffer != null)
+                {
+                    taskState.setState(FT_SCANNING, ipURL);
+
+                    log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
+
+                    if (contentType.startsWith("text/html"))
+                    {
+
+                        // ouch. I haven't found a better solution yet. just slower ones.
+                        char[] fullCharBuffer = toCharArray(fullBuffer);
+                        Tokenizer tok = new Tokenizer();
+                        tok.setLinkHandler(this);
+                        tok.parse(new SimpleCharArrayReader(fullCharBuffer));
+                    }
+                    else
+                    {
+                        // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
+                        errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
+                    }
+                    log.log("scanned");
+                }
+                taskState.setState(FT_STORING, ipURL);
+                messageHandler.putMessages(foundUrls);
+                storage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title));
+                log.log("stored");
+            }
+        }
+        catch (InterruptedIOException e)
+        {
+            // timeout while reading this file
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
+            errorLog.log("error: Timeout: " + this.actURLMessage.getUrl());
+            hi.badRequest();
+        }
+        catch (FileNotFoundException e)
+        {
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
+            errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
+        }
+        catch(NoRouteToHostException e)
+        {
+            // router is down or firewall prevents to connect
+            hi.setReachable(false);
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+            // e.printStackTrace();
+            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+        }
+        catch(ConnectException e)
+        {
+            // no server is listening at this port
+            hi.setReachable(false);
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+            // e.printStackTrace();
+            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+        }
+        catch (SocketException e)
+        {
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
+            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+
+        }
+        catch(UnknownHostException e)
+        {
+            // IP Address not to be determined
+            hi.setReachable(false);
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+            // e.printStackTrace();
+            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
+
+        }
+        catch (IOException e)
+        {
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
+            // e.printStackTrace();
+            errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());
+
+        }
+        catch (OutOfMemoryError ome)
+        {
+            taskState.setState(FT_EXCEPTION);
+            // report the bytes counted so far; the former counter was a
+            // local that always had the value 1
+            System.out.println("[" + threadNr + "] Task " + this.taskNr + " OutOfMemory after " + bytesRead + " bytes");
+            errorLog.log("error: OutOfMemory after " + bytesRead + " bytes");
+        }
+        catch (Throwable e)
+        {
+            taskState.setState(FT_EXCEPTION);
+            System.out.println("[" + threadNr + "] " + e.getMessage() + " type: " + e.getClass().getName());
+            e.printStackTrace();
+            System.out.println("[" + threadNr + "]: stopping");
+            errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
+
+        }
+        finally
+        {
+
+            if (isInterrupted)
+            {
+                System.out.println("Task was interrupted");
+                log.log("interrupted");
+                taskState.setState(FT_INTERRUPTED);
+            }
+        }
+        if (isInterrupted)
+        {
+            System.out.println("Task: closed everything");
+        }
+        /*
+         *  }
+         */
+        taskState.setState(FT_CLOSING);
+        if (conn != null)
+        {
+            // conn is still null if something failed before the connection
+            // object was created
+            conn.stop();
+        }
+
+        taskState.setState(FT_READY);
+        foundUrls = null;
+    }
+
+
+    /**
+     * the interrupt method. not in use since the change to HTTPClient
+     * @TODO decide if we need this anymore
+     */
+    public void interrupt()
+    {
+        System.out.println("FetcherTask: interrupted!");
+        this.isInterrupted = true;
+        /*
+         *  try
+         *  {
+         *  if (conn != null)
+         *  {
+         *  ((HttpURLConnection) conn).disconnect();
+         *  System.out.println("FetcherTask: disconnected URL Connection");
+         *  conn = null;
+         *  }
+         *  if (in != null)
+         *  {
+         *  in.close();
+         *  / possibly hangs at close() .> KeepAliveStream.close() -> MeteredStream.skip()
+         *  System.out.println("FetcherTask: Closed Input Stream");
+         *  in = null;
+         *  }
+         *  }
+         *  catch (IOException e)
+         *  {
+         *  System.out.println("IOException while interrupting: ");
+         *  e.printStackTrace();
+         *  }
+         *  System.out.println("FetcherTask: Set all IOs to null");
+         */
+    }
+
+
+    /**
+     * this is called whenever a links was found in the current document,
+     * Don't create too many objects here, this will be called
+     * millions of times
+     *
+     * @param link     the link as found in the document; the part after a
+     *      '#' is cut off, links that consist of a fragment only are ignored
+     * @param isFrame  passed through to the URLMessage
+     */
+    public void handleLink(String link, boolean isFrame)
+    {
+        try
+        {
+            // cut out Ref part
+
+
+            int refPart = link.indexOf("#");
+            //System.out.println(link);
+            if (refPart == 0)
+            {
+                return;
+            }
+            else if (refPart > 0)
+            {
+                link = link.substring(0, refPart);
+            }
+
+            URL url = null;
+            if (link.startsWith("http:"))
+            {
+                // distinguish between absolute and relative URLs
+
+                url = new URL(link);
+            }
+            else
+            {
+                // relative url
+                url = new URL(base, link);
+            }
+
+            // collected here and put into the message handler at the very end of run()
+            foundUrls.add(new URLMessage(url, contextUrl, isFrame));
+            //messageHandler.putMessage(new actURLMessage(url)); // put them in the very end
+        }
+        catch (MalformedURLException e)
+        {
+            //log.log("malformed url: base:" + base + " -+- link:" + link);
+            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
+        }
+        catch (Exception e)
+        {
+            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
+            // e.printStackTrace();
+        }
+
+    }
+
+
+    /**
+     * called when a BASE tag was found
+     *
+     * @param base  the HREF attribute
+     */
+    public void handleBase(String base)
+    {
+        try
+        {
+            this.base = new URL(base);
+        }
+        catch (MalformedURLException e)
+        {
+            log.log("warning: " + e.getClass().getName() + ": " + e.getMessage() + " while converting '" + base + "' to URL in document " + contextUrl);
+        }
+    }
+
+
+    /**
+     * called when a TITLE tag was found
+     *
+     * @param title  the string between the opening and the closing TITLE tag
+     */
+    public void handleTitle(String title)
+    {
+        this.title = title;
+    }
+
+
+
+    /*
+     *  public void notifyOpened(ObservableInputStream in, long timeElapsed)
+     *  {
+     *  }
+     *  public void notifyClosed(ObservableInputStream in, long timeElapsed)
+     *  {
+     *  }
+     *  public void notifyRead(ObservableInputStream in, long timeElapsed, int nrRead, int totalRead)
+     *  {
+     *  if(totalRead / ((double)timeElapsed) < 0.3) // less than 300 bytes/s
+     *  {
+     *  System.out.println("Task " + this.taskNr + " stalled at pos " + totalRead + " with " + totalRead / (timeElapsed / 1000.0) + " bytes/s");
+     *  }
+     *  }
+     *  public void notifyFinished(ObservableInputStream in, long timeElapsed, int totalRead)
+     *  {
+     *  /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
+     *  }
+     */
+    /**
+     * @return   the number of bytes read by the current or last run() call
+     */
+    public long getBytesRead()
+    {
+        return bytesRead;
+    }
+
+
+    /**
+     * do nothing if a warning occurs within the html parser
+     *
+     * @param message                  the warning text
+     * @param systemID                 not evaluated
+     * @param line                     not evaluated
+     * @param column                   not evaluated
+     * @exception java.lang.Exception  never thrown here
+     */
+    public void warning(String message, String systemID, int line, int column)
+        throws java.lang.Exception { }
+
+
+    /**
+     * do nothing if a fatal error occurs...
+     *
+     * @param message        the error text; printed and logged
+     * @param systemID       not evaluated
+     * @param line           not evaluated
+     * @param column         not evaluated
+     * @exception Exception  never thrown here
+     */
+    public void fatal(String message, String systemID, int line, int column)
+        throws Exception
+    {
+        System.out.println("fatal error: " + message);
+        log.log("fatal error: " + message);
+    }
+
+}
+
diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java
new file mode 100644
index 00000000000..f2c9083708b
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTaskQueue.java
@@ -0,0 +1,198 @@
+package de.lanlab.larm.fetcher;
+
+import de.lanlab.larm.threads.*;
+import de.lanlab.larm.util.*;
+import java.util.*;
+import java.net.URL;
+
+/**
+ * this special kind of task queue reorders the incoming tasks so that every subsequent
+ * task is for a different host.
+ * This is done by a "HashedCircularLinkedList" which allows random adding while
+ * a different thread iterates through the collection circularly.
+ *
+ * @author    Clemens Marschner
+ * @created   23. November 2001
+ */
+public class FetcherTaskQueue extends TaskQueue
+{
+    /**
+     * this is a hash that contains an entry for each server, which by itself is a
+     * CachingQueue that stores all tasks for this server
+     * @TODO probably link this to the host info structure
+     */
+    HashedCircularLinkedList servers = new HashedCircularLinkedList(100, 0.75f);
+    int size = 0;
+
+
+    /**
+     * Constructor for the FetcherTaskQueue object. Does nothing
+     */
+    public FetcherTaskQueue() { }
+
+
+    /**
+     * true if no task is queued
+     *
+     * @return   The empty value
+     */
+    public boolean isEmpty()
+    {
+        return (size == 0);
+    }
+
+
+    /**
+     * clear the queue. not synchronized. 
+     */
+    public void clear()
+    {
+        servers.clear();
+        // the counter has to follow the contents, otherwise isEmpty() and
+        // size() still report the discarded tasks
+        size = 0;
+    }
+
+
+    /**
+     * puts task into Queue. The task is appended to the queue of the host
+     * its URL points to; that queue is created on first use.
+     * Warning: not synchronized
+     *
+     * @param t  the task to be added. must be a FetcherTask
+     */
+    public void insert(Object t)
+    {
+        // assert (t != null && t.getURL() != null)
+
+        URLMessage um = ((FetcherTask)t).getActURLMessage();
+        URL act = um.getUrl();
+        String host = act.getHost();
+        Queue q;
+        q = ((Queue) servers.get(host));
+        if (q == null)
+        {
+            // add a new host to the queue
+            //String host2 = host.replace(':', '_').replace('/', '_').replace('\\', '_');
+            // make it file system ready
+            q = new CachingQueue(host, 100);
+            servers.put(host, q);
+        }
+        // assert((q != null) && (q instanceof FetcherTaskQueue));
+        q.insert(t);
+        size++;
+    }
+
+
+    /**
+     * the size of the queue. make sure that insert() and size() calls are synchronized
+     * if the exact number matters.
+     *
+     * @return   the number of tasks queued for all hosts
+     */
+    public int size()
+    {
+        return size;
+    }
+
+    /**
+     * the number of different hosts queued at the moment
+     */
+    public int getNumHosts()
+    {
+        return servers.size();
+    }
+
+    /**
+     * get the next task. The host queues are visited in turn; a host queue
+     * that has become empty is removed. warning: not synchronized
+     *
+     * @return   the next task, or null if no host is queued
+     */
+    public Object remove()
+    {
+        FetcherTask t = null;
+        if (servers.size() > 0)
+        {
+            Queue q = (Queue) servers.next();
+            // assert(q != null && q.size() > 0)
+            t = (FetcherTask)q.remove();
+            if (q.size() == 0)
+            {
+                servers.removeCurrent();
+                q = null;
+            }
+            size--;
+        }
+        return t;
+    }
+
+
+    /**
+     * tests
+     *
+     * @param args  not evaluated
+     */
+    public static void main(String args[])
+    {
+        FetcherTaskQueue q = new FetcherTaskQueue();
+        System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
+        try
+        {
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false)));
+        }
+        catch (Throwable t)
+        {
+            t.printStackTrace();
+        }
+
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+        System.out.println(((FetcherTask) q.remove()).getInfo());
+
+        System.out.println("Test 2. new Queue");
+        q = new FetcherTaskQueue();
+        System.out.println("size [0]:");
+        System.out.println(q.size());
+        try
+        {
+            System.out.println("put 3 lmus.");
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false)));
+            System.out.print("pull out 1st element [lmu/1]: ");
+            System.out.println(((FetcherTask) q.remove()).getInfo());
+            System.out.println("size now [2]: " + q.size());
+            System.out.print("pull out 2nd element [lmu/2]: ");
+            System.out.println(((FetcherTask) q.remove()).getInfo());
+            System.out.println("size now [1]: " + q.size());
+            System.out.println("put in 3 yahoos");
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false)));
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false)));
+            System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
+            System.out.println("Size now [3]: " + q.size());
+            System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
+            System.out.println("Size now [2]: " + q.size());
+            System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
+            System.out.println("Size now [1]: " + q.size());
+            System.out.println("put in another Yahoo");
+            q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false)));
+            System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
+            System.out.println("Size now [1]: " + q.size());
+            System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
+            System.out.println("Size now [0]: " + q.size());
+        }
+        catch (Throwable t)
+        {
+            t.printStackTrace();
+        }
+
+    }
+
+}
+
diff --git 
a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java new file mode 100644 index 00000000000..54930fa9fc3 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThread.java @@ -0,0 +1,91 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.fetcher; + +import de.lanlab.larm.threads.ServerThread; +import de.lanlab.larm.util.State; + +/** + * a server thread for the thread pool that records the number + * of bytes read and the number of tasks run + * mainly for statistical purposes and to keep most of the information a task needs + * static + */ +public class FetcherThread extends ServerThread +{ + + long totalBytesRead = 0; + long totalTasksRun = 0; + + HostManager hostManager; + + byte[] documentBuffer = new byte[Constants.FETCHERTASK_READSIZE]; + + public HostManager getHostManager() + { + return hostManager; + } + + public FetcherThread(int threadNumber, ThreadGroup threadGroup, HostManager hostManager) + { + super(threadNumber,"FetcherThread " + threadNumber, threadGroup); + this.hostManager = hostManager; + } + + public static String STATE_IDLE = "Idle"; + + State idleState = new State(STATE_IDLE); // only set if task is finished + + protected void taskReady() + { + totalBytesRead += ((FetcherTask)task).getBytesRead(); + totalTasksRun++; + super.taskReady(); + idleState.setState(STATE_IDLE); + + } + + + public long getTotalBytesRead() + { + if(task != null) + { + return totalBytesRead + ((FetcherTask)task).getBytesRead(); + } + else + { + return totalBytesRead; + } + } + + public long getTotalTasksRun() + { + return totalTasksRun; + } + + public byte[] getDocumentBuffer() + { + return documentBuffer; + } + + public State getTaskState() + { + if(task != null) + { + // task could be null here + return ((FetcherTask)task).getTaskState(); + } + else + { + return idleState.cloneState(); + } + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java new file mode 100644 index 00000000000..99035c24ee0 --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherThreadFactory.java @@ -0,0 +1,38 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.fetcher; +import de.lanlab.larm.threads.*; + +/** + * this factory simply creates fetcher threads. It's passed + * to the ThreadPool because the pool is creating the threads on its own + */ +public class FetcherThreadFactory extends ThreadFactory +{ + + //static int count = 0; + + ThreadGroup threadGroup = new ThreadGroup("FetcherThreads"); + + HostManager hostManager; + + public FetcherThreadFactory(HostManager hostManager) + { + this.hostManager = hostManager; + } + + + public ServerThread createServerThread(int count) + { + ServerThread newThread = new FetcherThread(count, threadGroup, hostManager); + newThread.setPriority(4); + return newThread; + } +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java new file mode 100644 index 00000000000..0a3be1c0e7e --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Filter.java @@ -0,0 +1,29 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

/**
 * Common base class of the filter classes. Its only job is to carry the
 * counter of filtered items.
 */
public abstract class Filter
{
    /**
     * Number of items filtered so far. Subclasses increment this field
     * directly; the base class never modifies it.
     */
    protected int filtered = 0;

    /**
     * @return the current value of the filtered-items counter
     */
    public int getFiltered()
    {
        return this.filtered;
    }
}
/**
 * Small stand-alone experiment that prints how many bytes a typical long
 * URL occupies after GZIP compression.
 */
public class GZipTest
{

    /**
     * Constructor for the GZipTest object
     */
    public GZipTest() { }


    /**
     * Compresses a fixed sample URL and prints the URL, its length in
     * characters and the length of the gzipped byte array.
     *
     * @param args  the command line arguments (ignored)
     */
    public static void main(String[] args)
    {
        try
        {
            String url = "http://speechdat.phonetik.uni-muenchen.de/speechdt//speechDB/FIXED1SL/BLOCK00/SES0006/A10006O5.aif";

            ByteArrayOutputStream sink = new ByteArrayOutputStream(url.length());
            Writer writer = new OutputStreamWriter(new GZIPOutputStream(sink), "ISO-8859-1");
            try
            {
                writer.write(url);
            }
            finally
            {
                // Closing the writer flushes the encoder, finishes the gzip
                // stream (trailer included) and closes it. No separate
                // finish() on an already closed stream is needed.
                writer.close();
            }

            byte[] array = sink.toByteArray();
            System.out.println("URL: " + url + " \n Length: " + url.length() + "\n zipped: " + array.length
                );
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
Februar 2002 + */ +public class HostInfo +{ + static final String[] emptyKeepOutDirectories = new String[0]; + + int id; + int healthyCount = 5; // five strikes, and you're out + boolean isReachable = true; + boolean robotTxtChecked = false; + String[] disallows; // robot exclusion + boolean isLoadingRobotsTxt = false; + Queue queuedRequests = null; // robot exclusion + String hostName; + + public HostInfo(String hostName, int id) + { + this.id = id; + this.disallows = HostInfo.emptyKeepOutDirectories; + this.hostName = hostName; + } + + /** + * is this host reachable and responding? + */ + public boolean isHealthy() + { + return (healthyCount > 0) && isReachable; + } + + /** + * signals that the host returned with a bad request of whatever type + */ + public void badRequest() + { + healthyCount--; + } + + public void setReachable(boolean reachable) + { + isReachable = reachable; + } + + public boolean isReachable() + { + return isReachable; + } + + public boolean isRobotTxtChecked() + { + return robotTxtChecked; + } + + /** + * must be synchronized externally + */ + public boolean isLoadingRobotsTxt() + { + return this.isLoadingRobotsTxt; + } + + public void setLoadingRobotsTxt(boolean isLoading) + { + this.isLoadingRobotsTxt = isLoading; + if(isLoading) + { + this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100); + } + + } + + public void setRobotsChecked(boolean isChecked, String[] disallows) + { + this.robotTxtChecked = isChecked; + if(disallows != null) + { + this.disallows = disallows; + } + else + { + this.disallows = emptyKeepOutDirectories; + } + + } + + public synchronized boolean isAllowed(String path) + { + // assume keepOutDirectories is pretty short + // assert disallows != null + int length = disallows.length; + for(int i=0; iputMessage or putMessages + * (use the latter whenever possible).
+ * The messages are passed to the filters in the order in which the filters where + * added to the handler.
+ * They can consume the message by returning null. Otherwise, they return a Message + * object, usually the one they got.
+ * The filters will run synchronously within the message handler thread
+ * This implements a chain of responsibility-style message handling + */ +public class MessageHandler implements Runnable +{ + + /** + * the queue where messages are put in. + * Holds max. 2 x 5000 = 10.000 messages in RAM + */ + private CachingQueue messageQueue = new CachingQueue("fetcherURLMessageQueue", 5000); + + /** + * list of Observers + */ + private LinkedList listeners = new LinkedList(); + + /** + * true as long as the thread is running + */ + private boolean running = true; + + /** + * the message handler thread + */ + private Thread t; + + /** + * flag for thread communication + */ + boolean messagesWaiting = false; + + /** + * true when a message is processed by the filters + */ + boolean workingOnMessage = false; + + Object queueMonitor = new Object(); + + SimpleObservable messageQueueObservable = new SimpleObservable(); + SimpleObservable messageProcessorObservable = new SimpleObservable(); + + public boolean isWorkingOnMessage() + { + return workingOnMessage; + } + + /** + * messageHandler-Thread erzeugen und starten + */ + MessageHandler() + { + t = new Thread(this,"MessageHandler Thread"); + t.setPriority(5); // higher priority to prevent starving when a lot of fetcher threads are used + t.start(); + } + + /** + * join messageHandler-Thread + */ + public void finalize() + { + if(t != null) + { + try + { + t.join(); + t = null; + } + catch(InterruptedException e) {} + } + } + + /** + * registers a filter to the message handler + * @param MessageListener - the Listener + */ + public void addListener(MessageListener m) + { + m.notifyAddedToMessageHandler(this); + listeners.addLast(m); + } + + /** + * registers a MessageQueueObserver + * It will be notified whenever a message is put into the Queue (Parameter is Int(1)) oder + * removed (Parameter is Int(-1)) + * @param o the Observer + */ + public void addMessageQueueObserver(Observer o) + { + messageQueueObservable.addObserver(o); + } + + /** + * adds a message processorObeserver + * It will be 
notified when a message is consumed. In this case the parameter + * is the filter that consumed the message + * @param o the Observer + */ + public void addMessageProcessorObserver(Observer o) + { + messageProcessorObservable.addObserver(o); + } + + + /** + * einen Event in die Schlange schreiben + */ + public void putMessage(Message msg) + { + messageQueue.insert(msg); + messageQueueObservable.setChanged(); + messageQueueObservable.notifyObservers(new Integer(1)); + synchronized(queueMonitor) + { + messagesWaiting = true; + queueMonitor.notify(); + } + } + + /** + * add a collection of events to the message queue + */ + public void putMessages(Collection msgs) + { + for(Iterator i = msgs.iterator(); i.hasNext();) + { + Message msg = (Message)i.next(); + messageQueue.insert(msg); + } + messageQueueObservable.setChanged(); + messageQueueObservable.notifyObservers(new Integer(1)); + synchronized(queueMonitor) + { + messagesWaiting = true; + queueMonitor.notify(); + } + } + + /** + * the main messageHandler-Thread. + */ + public void run() + { + while(running) + { + //System.out.println("MessageHandler-Thread started"); + + synchronized(queueMonitor) + { + // wait for new messages + workingOnMessage=false; + try + { + queueMonitor.wait(); + } + catch(InterruptedException e) + { + System.out.println("MessageHandler: Caught InterruptedException"); + } + workingOnMessage=true; + } + //messagesWaiting = false; + Message m; + try + { + while(messagesWaiting) + { + synchronized(this.queueMonitor) + { + m = (Message)messageQueue.remove(); + if(messageQueue.size() == 0) + { + messagesWaiting = false; + } + + } + //System.out.println("MessageHandler:run: Entferne erstes Element"); + + messageQueueObservable.setChanged(); + messageQueueObservable.notifyObservers(new Integer(-1)); // Message processed + + // und verteilen. 
Die Listener erhalten die Message in ihrer + // Eintragungsreihenfolge und können die Message auch verändern + + Iterator i = listeners.iterator(); + while(i.hasNext()) + { + //System.out.println("Verteile..."); + try + { + MessageListener listener = (MessageListener)i.next(); + m = (Message)listener.handleRequest(m); + if (m == null) + { + messageProcessorObservable.setChanged(); + messageProcessorObservable.notifyObservers(listener); + break; // Handler hat die Message konsumiert + } + } + catch(ClassCastException e) + { + System.out.println("MessageHandler:run: ClassCastException(2): " + e.getMessage()); + } + } + } + } + catch (ClassCastException e) + { + System.out.println("MessageHandler:run: ClassCastException: " + e.getMessage()); + } + catch (UnderflowException e) + { + messagesWaiting = false; + // System.out.println("MessageHandler: messagesWaiting = true although nothing queued!"); + // @FIXME: here is still a multi threading issue. I don't get it why this happens. + // does someone want to draw a petri net of this? + } + catch (Exception e) + { + System.out.println("MessageHandler: " + e.getClass() + " " + e.getMessage()); + e.printStackTrace(); + } + + } + } + + public int getQueued() + { + return messageQueue.size(); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java new file mode 100644 index 00000000000..f39681cbdbf --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/MessageListener.java @@ -0,0 +1,36 @@ +/* + * LARM - LANLab Retrieval Machine + * + * $history: $ + * + * + */ +package de.lanlab.larm.fetcher; + +/** + * A Message Listener works on messages in a message queue Usually it returns + * the message back into the queue. But it can also change the message or create + * a new object. 
If it returns null, the message handler stops + * + * @author Administrator + * @created 24. November 2001 + */ +public interface MessageListener +{ + /** + * the handler + * + * @param message the message to be handled + * @return Message usually the original message + * null: the message was consumed + */ + public Message handleRequest(Message message); + + + /** + * will be called as soon as the Listener is added to the Message Queue + * + * @param handler the Message Handler + */ + public void notifyAddedToMessageHandler(MessageHandler handler); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java new file mode 100644 index 00000000000..35158d4f53d --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/RobotExclusionFilter.java @@ -0,0 +1,429 @@ +/** + * Title: LARM Lanlab Retrieval Machine

+ * + * Description:

+ * + * Copyright: Copyright (c)

+ * + * Company:

+ * + * + * + * @author Clemens Marschner + * @version 1.0 + */ +package de.lanlab.larm.fetcher; + +import de.lanlab.larm.util.SimpleObservable; +import de.lanlab.larm.util.State; +import java.util.*; +import java.net.*; +import java.io.*; +import org.apache.oro.text.perl.Perl5Util; +import de.lanlab.larm.util.*; +import de.lanlab.larm.threads.*; +import HTTPClient.*; + +/** + * this factory simply creates fetcher threads. It's gonna be passed to the + * ThreadPool because the pool is creating the threads on its own + * + * @author Administrator + * @created 17. Februar 2002 + */ +class REFThreadFactory extends ThreadFactory +{ + + ThreadGroup threadGroup = new ThreadGroup("RobotExclusionFilter"); + + + /** + * Description of the Method + * + * @param count Description of the Parameter + * @return Description of the Return Value + */ + public ServerThread createServerThread(int count) + { + ServerThread newThread = new ServerThread(count, "REF-" + count, threadGroup); + newThread.setPriority(4); + return newThread; + } +} + +/** + * the RE filter obeys the robot exclusion standard. If a new host name is supposed + * to be accessed, it first loads a "/robots.txt" on the given server and records the + * disallows stated in that file. + * The REFilter has a thread pool on its own to prevent the message handler from being + * clogged up if the server doesn't respond. Incoming messages are queued while the + * robots.txt is loaded. + * The information is stored in HostInfo records of the host manager class + * + * @author Clemens Marschner + * @created 17. 
Februar 2002 + */ +public class RobotExclusionFilter extends Filter implements MessageListener +{ + + + protected HostManager hostManager; + + protected SimpleLogger log; + + + /** + * Constructor for the RobotExclusionFilter object + * + * @param hm Description of the Parameter + */ + public RobotExclusionFilter(HostManager hm) + { + log = new SimpleLogger("RobotExclusionFilter"); + hostManager = hm; + rePool = new ThreadPool(2, new REFThreadFactory()); + rePool.init(); + log.setFlushAtOnce(true); + log.log("refilter: initialized"); + } + + + /** + * called by the message handler + */ + public void notifyAddedToMessageHandler(MessageHandler handler) + { + this.messageHandler = handler; + } + + + MessageHandler messageHandler = null; + ThreadPool rePool; + + + /** + * method that handles each URL request

+ * + * This method will get the robots.txt file the first time a server is + * requested. See the description above. + * + * @param message + * the (URL)Message + * @return + * the original message or NULL if this host had a disallow on that URL + * @link{http://info.webcrawler.com/mak/projects/robots/norobots.html}) + */ + + public Message handleRequest(Message message) + { + //log.logThreadSafe("handleRequest: got message: " + message); + try + { + // assert message instanceof URLMessage; + URLMessage urlMsg = ((URLMessage) message); + URL url = urlMsg.getUrl(); + //assert url != null; + HostInfo h = hostManager.getHostInfo(url.getHost()); + if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt()) + { + log.logThreadSafe("handleRequest: starting to get robots.txt"); + // probably this results in Race Conditions here + + rePool.doTask(new RobotExclusionTask(h), new Integer(h.id)); + h.setLoadingRobotsTxt(true); + } + + synchronized (h) + { + // isLoading...() and queuedRequest.insert() must be atomic + if (h.isLoadingRobotsTxt()) + { + + //log.logThreadSafe("handleRequest: other thread is loading"); + // assert h.queuedRequests != null + h.queuedRequests.insert(message); + // not thread safe + log.logThreadSafe("handleRequest: queued file " + url); + return null; + } + } + + //log.logThreadSafe("handleRequest: no thread is loading; robots.txt loaded"); + //log.logThreadSafe("handleRequest: checking if allowed"); + String path = url.getPath(); + if (path == null || path.equals("")) + { + path = "/"; + } + + if (h.isAllowed(path)) + { + // log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " ok"); + return message; + } + log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " filtered"); + this.filtered++; + } + catch (Exception e) + { + e.printStackTrace(); + } + return null; + } + + + private static volatile NVPair headers[] = new NVPair[1]; + + static + { + headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT); + + 
} + + + /** + * the task that actually loads and parses the robots.txt files + * + * @author Clemens Marschner + * @created 17. Februar 2002 + */ + class RobotExclusionTask implements InterruptableTask + { + HostInfo hostInfo; + + + + /** + * Constructor for the RobotExclusionTask object + * + * @param hostInfo Description of the Parameter + */ + public RobotExclusionTask(HostInfo hostInfo) + { + this.hostInfo = hostInfo; + } + + + /** + * dummy + * + * @return The info value + */ + public String getInfo() + { + return ""; + } + + + /** + * not used + */ + public void interrupt() { } + + + /** + * gets a robots.txt file and adds the information to the hostInfo + * structure + * + * @param thread the server thread (passed by the thread pool) + */ + public void run(ServerThread thread) + { + // assert hostInfo != null; + String threadName = Thread.currentThread().getName(); + + log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName); + //hostInfo.setLoadingRobotsTxt(true); + String[] disallows = null; + boolean errorOccured = false; + try + { + log.logThreadSafe("task " + threadName + ": getting connection"); + HTTPConnection conn = new HTTPConnection(hostInfo.hostName); + conn.setTimeout(30000); + // wait at most 20 secs + + HTTPResponse res = conn.Get("/robots.txt", (String) null, headers); + log.logThreadSafe("task " + threadName + ": got connection."); + if (res.getStatusCode() != 200) + { + errorOccured = true; + } + else + { + + log.logThreadSafe("task " + threadName + ": reading"); + byte[] file = res.getData(40000); + // max. 40 kb + log.logThreadSafe("task " + threadName + ": reading done. parsing"); + disallows = parse(new BufferedReader(new InputStreamReader(new ByteArrayInputStream(file)))); + log.logThreadSafe("task " + threadName + ": parsing done. 
found " + disallows.length + " disallows"); + // assert disallows != null + // HostInfo hostInfo = hostManager.getHostInfo(this.hostName); + // assert hostInfo != null + log.logThreadSafe("task " + threadName + ": setting disallows"); + } + } + catch (java.net.UnknownHostException e) + { + hostInfo.setReachable(false); + log.logThreadSafe("task " + threadName + ": unknown host. setting to unreachable"); + errorOccured = true; + } + catch (java.net.NoRouteToHostException e) + { + hostInfo.setReachable(false); + log.logThreadSafe("task " + threadName + ": no route to. setting to unreachable"); + errorOccured = true; + } + catch (java.net.ConnectException e) + { + hostInfo.setReachable(false); + log.logThreadSafe("task " + threadName + ": connect exception. setting to unreachable"); + errorOccured = true; + } + catch (java.io.InterruptedIOException e) + { + // time out. fatal in this case + hostInfo.setReachable(false); + log.logThreadSafe("task " + threadName + ": time out. setting to unreachable"); + errorOccured = true; + } + + catch (Throwable e) + { + errorOccured = true; + log.log("task " + threadName + ": unknown exception: " + e.getClass().getName() + ": " + e.getMessage() + ". 
continuing"); + log.log(e); + + } + finally + { + if (errorOccured) + { + synchronized (hostInfo) + { + hostInfo.setRobotsChecked(true, null); + // crawl everything + hostInfo.setLoadingRobotsTxt(false); + log.logThreadSafe("task " + threadName + ": error occured"); + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); + hostInfo.isLoadingRobotsTxt = false; + putBackURLs(); + } + } + else + { + synchronized (hostInfo) + { + hostInfo.setRobotsChecked(true, disallows); + log.logThreadSafe("task " + threadName + ": done"); + log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back"); + hostInfo.isLoadingRobotsTxt = false; + putBackURLs(); + } + } + } + } + + + /** + * put back queued URLs + */ + private void putBackURLs() + { + while (hostInfo.queuedRequests.size() > 0) + { + messageHandler.putMessage((Message) hostInfo.queuedRequests.remove()); + } + log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished"); + hostInfo.queuedRequests = null; + } + + + /** + * this parses the robots.txt file. 
It was taken from the PERL implementation + * Since this is only rarely called, it's not optimized for speed + * + * @param r the robots.txt file + * @return the disallows + * @exception IOException any IOException + */ + public String[] parse(BufferedReader r) + throws IOException + { + // taken from Perl + Perl5Util p = new Perl5Util(); + String line; + boolean isMe = false; + boolean isAnon = false; + ArrayList disallowed = new ArrayList(); + String ua = null; + + while ((line = r.readLine()) != null) + { + if (p.match("/^#.*/", line)) + { + // a comment + continue; + } + line = p.substitute("s/\\s*\\#.* //", line); + if (p.match("/^\\s*$/", line)) + { + if (isMe) + { + break; + } + } + else if (p.match("/^User-Agent:\\s*(.*)/i", line)) + { + ua = p.group(1); + ua = p.substitute("s/\\s+$//", ua); + if (isMe) + { + break; + } + else if (ua.equals("*")) + { + isAnon = true; + } + else if (Constants.CRAWLER_AGENT.startsWith(ua)) + { + isMe = true; + } + } + else if (p.match("/^Disallow:\\s*(.*)/i", line)) + { + if (ua == null) + { + isAnon = true; + // warn... + } + String disallow = p.group(1); + if (disallow != null && disallow.length() > 0) + { + // assume we have a relative path + ; + } + else + { + disallow = "/"; + } + if (isMe || isAnon) + { + disallowed.add(disallow); + } + } + else + { + // warn: unexpected line + } + } + String[] disalloweds = new String[disallowed.size()]; + disallowed.toArray(disalloweds); + return disalloweds; + } + + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java new file mode 100644 index 00000000000..140924ab81a --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/ThreadMonitor.java @@ -0,0 +1,545 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.fetcher; + + +import de.lanlab.larm.threads.*; +import java.util.*; +import java.text.*; +import java.io.*; +import de.lanlab.larm.util.State; +import de.lanlab.larm.util.SimpleLoggerManager; + +/** + * this monitor takes a sample of every thread every x milliseconds, + * and logs a lot of information. In the near past it has evolved into the multi + * purpose monitoring and maintenance facility. + * At the moment it prints status information + * to log files and to the console + * @TODO this can be done better. Probably with an agent where different services + * can be registered to be called every X seconds + */ +public class ThreadMonitor extends Observable implements Runnable +{ + /** + * a reference to the thread pool that's gonna be observed + */ + private ThreadPool threadPool; + + + class Sample + { + long bytesRead; + long docsRead; + long time; + public Sample(long bytesRead, long docsRead, long time) + { + this.bytesRead = bytesRead; + this.docsRead = docsRead; + this.time = time; + } + } + + ArrayList bytesReadPerPeriod; + + /** + * Zeit zwischen den Messungen + */ + int sampleDelta; + + /** + * the thread where this monitor runs in. 
Will run with high priority + */ + Thread thread; + + + URLVisitedFilter urlVisitedFilter; + URLScopeFilter urlScopeFilter; +// DNSResolver dnsResolver; + RobotExclusionFilter reFilter; + MessageHandler messageHandler; + URLLengthFilter urlLengthFilter; + HostManager hostManager; + + public final static double KBYTE = 1024; + public final static double MBYTE = 1024 * KBYTE; + public final static double ONEGBYTE = 1024 * MBYTE; + + + String formatBytes(long lbytes) + { + double bytes = (double)lbytes; + if(bytes >= ONEGBYTE) + { + return fractionFormat.format((bytes/ONEGBYTE)) + " GB"; + } + else if(bytes >= MBYTE) + { + return fractionFormat.format(bytes/MBYTE) + " MB"; + } + else if(bytes >= KBYTE) + { + return fractionFormat.format(bytes/KBYTE) + " KB"; + } + else + { + return fractionFormat.format(bytes) + " Bytes"; + } + + } + + /** + * a logfile where status information is posted + * FIXME: put that in a seperate class (double code in FetcherTask) + */ + PrintWriter logWriter; + private SimpleDateFormat formatter + = new SimpleDateFormat ("hh:mm:ss:SSSS"); + private DecimalFormat fractionFormat = new DecimalFormat("0.00"); + + long startTime = System.currentTimeMillis(); + + private void log(String text) + { + try + { + logWriter.println(formatter.format(new Date()) + ";" + (System.currentTimeMillis()-startTime) + ";" + text); + logWriter.flush(); + } + catch(Exception e) + { + System.out.println("Couldn't write to logfile"); + } + } + + /** + * construct the monitor gets a reference to all monitored filters + * @param threadPool the pool to be observed + * @param sampleDelta time in ms between samples + */ + public ThreadMonitor(URLLengthFilter urlLengthFilter, + URLVisitedFilter urlVisitedFilter, + URLScopeFilter urlScopeFilter, + /*DNSResolver dnsResolver,*/ + RobotExclusionFilter reFilter, + MessageHandler messageHandler, + ThreadPool threadPool, + HostManager hostManager, + int sampleDelta) + { + this.urlLengthFilter = urlLengthFilter; + 
this.urlVisitedFilter = urlVisitedFilter; + this.urlScopeFilter = urlScopeFilter; + /* this.dnsResolver = dnsResolver;*/ + this.hostManager = hostManager; + this.reFilter = reFilter; + this.messageHandler = messageHandler; + + this.threadPool = threadPool; + bytesReadPerPeriod = new ArrayList(); + this.sampleDelta = sampleDelta; + this.thread = new Thread(this, "ThreadMonitor"); + this.thread.setPriority(7); + + try + { + File logDir = new File("logs"); + logDir.mkdir(); + logWriter = new PrintWriter(new BufferedWriter(new FileWriter("logs/ThreadMonitor.log"))); + } + catch(IOException e) + { + System.out.println("Couldn't create logfile (ThreadMonitor)"); + } + + } + + /** + * java.lang.Threads run method. To be invoked via start() + * the monitor's main thread takes the samples every sampleDelta ms + * Since Java is not real time, it remembers + */ + public void run() + { + int nothingReadCount = 0; + long lastPeriodBytesRead = -1; + long monitorRunCount = 0; + long startTime = System.currentTimeMillis(); + log("time;overallBytesRead;overallTasksRun;urlsQueued;urlsWaiting;isWorkingOnMessage;urlsScopeFiltered;urlsVisitedFiltered;urlsREFiltered;memUsed;memFree;totalMem;nrHosts;visitedSize;visitedStringSize;urlLengthFiltered"); + while(true) + { + try + { + try + { + thread.sleep(sampleDelta); + } + catch(InterruptedException e) + { + return; + } + + Iterator threadIterator = threadPool.getThreadIterator(); + int i=0; + StringBuffer bytesReadString = new StringBuffer(200); + StringBuffer rawBytesReadString = new StringBuffer(200); + StringBuffer tasksRunString = new StringBuffer(200); + long overallBytesRead = 0; + long overallTasksRun = 0; + long now = System.currentTimeMillis(); + boolean finished = false; + //System.out.print("\f"); + /*while(!finished) + { + boolean restart = false;*/ + boolean allThreadsIdle = true; + StringBuffer sb = new StringBuffer(500); + + while(threadIterator.hasNext()) + { + FetcherThread thread = (FetcherThread)threadIterator.next(); + 
long totalBytesRead = thread.getTotalBytesRead(); + overallBytesRead += totalBytesRead; + bytesReadString.append(formatBytes(totalBytesRead)).append( "; "); + rawBytesReadString.append(totalBytesRead).append("; "); + long tasksRun = thread.getTotalTasksRun(); + overallTasksRun += tasksRun; + tasksRunString.append(tasksRun).append("; "); + + // check task status + State state = thread.getTaskState(); + //StringBuffer sb = new StringBuffer(200); + sb.setLength(0); + System.out.println(sb + "[" + thread.getThreadNumber() + "] " + state.getState() + " for " + + (now - state.getStateSince() ) + " ms " + + (state.getInfo() != null ? "(" + state.getInfo() +")" : "") + ); + if(!(state.getState().equals(FetcherThread.STATE_IDLE))) + { + //if(allThreadsIdle) System.out.println("(not all threads are idle, '"+state.getState()+"' != '"+FetcherThread.STATE_IDLE+"')"); + allThreadsIdle = false; + } + if (((state.equals(FetcherTask.FT_CONNECTING)) || (state.equals(FetcherTask.FT_GETTING)) || (state.equals(FetcherTask.FT_READING)) || (state.equals(FetcherTask.FT_CLOSING))) + && ((now - state.getStateSince()) > 160000)) + { + System.out.println("****Restarting Thread " + thread.getThreadNumber()); + threadPool.restartThread(thread.getThreadNumber()); + break; // Iterator is invalid + } + + } + /*if(restart) + { + continue; + } + finished = true; + }*/ + /* + if(overallBytesRead == lastPeriodBytesRead) + { + * + disabled kickout feature - cm + + nothingReadCount ++; + System.out.println("Anomaly: nothing read during the last period(s). 
" + (20-nothingReadCount+1) + " periods to exit"); + if(nothingReadCount > 20) // nothing happens anymore + { + log("Ending"); + System.out.println("End at " + new Date().toString()); + // print some information + System.exit(0); + } + + + } + else + { + nothingReadCount = 0; + }*/ + + lastPeriodBytesRead = overallBytesRead; + + //State reState = new State("hhh"); //reFilter.getState(); + sb.setLength(0); + //System.out.println(sb + "Robot-Excl.Filter State: " + reState.getState() + " since " + (now-reState.getStateSince()) + " ms " + (reState.getInfo() != null ? " at " + reState.getInfo() : "")); + + addSample(new Sample(overallBytesRead, overallTasksRun, System.currentTimeMillis())); + int nrHosts = ((FetcherTaskQueue)threadPool.getTaskQueue()).getNumHosts(); + int visitedSize = urlVisitedFilter.size(); + int visitedStringSize = urlVisitedFilter.getStringSize(); + + double bytesPerSecond = getAverageBytesRead(); + double docsPerSecond = getAverageDocsRead(); + sb.setLength(0); + System.out.println(sb + "\nBytes total: " + formatBytes(overallBytesRead) + " (" + formatBytes((long)(((double)overallBytesRead)*1000/(System.currentTimeMillis()-startTime))) + " per second since start)" + + "\nBytes per Second: " + formatBytes((int)bytesPerSecond) + " (50 secs)" + + "\nDocs per Second: " + docsPerSecond + + "\nBytes per Thread: " + bytesReadString); + double docsPerSecondTotal = ((double)overallTasksRun)*1000/(System.currentTimeMillis()-startTime); + sb.setLength(0); + System.out.println(sb + "Docs read total: " + overallTasksRun + " Docs/s: " + fractionFormat.format(docsPerSecondTotal) + + "\nDocs p.thread: " + tasksRunString); + + long memUsed = Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory(); + long memFree = Runtime.getRuntime().freeMemory(); + long totalMem = Runtime.getRuntime().totalMemory(); + sb.setLength(0); + System.out.println(sb + "Mem used: " + formatBytes(memUsed) + ", free: " + formatBytes(memFree) + " total VM: " + totalMem); + int 
urlsQueued = messageHandler.getQueued(); + int urlsWaiting = threadPool.getQueueSize(); + boolean isWorkingOnMessage = messageHandler.isWorkingOnMessage(); + int urlsScopeFiltered = urlScopeFilter.getFiltered(); + int urlsVisitedFiltered = urlVisitedFilter.getFiltered(); + int urlsREFiltered = reFilter.getFiltered(); + int urlLengthFiltered = urlLengthFilter.getFiltered(); + sb.setLength(0); + System.out.println(sb + "URLs queued: " + urlsQueued + " waiting: " + urlsWaiting); + sb.setLength(0); + System.out.println(sb + "Message is being processed: " + isWorkingOnMessage); + sb.setLength(0); + System.out.println(sb + "URLs Filtered: length: " + urlLengthFiltered + " scope: " + urlsScopeFiltered + " visited: " + urlsVisitedFiltered + " robot.txt: " + urlsREFiltered); + sb.setLength(0); + System.out.println(sb + "Visited size: " + visitedSize + "; String Size in VisitedFilter: " + visitedStringSize + "; Number of Hosts: " + nrHosts + "; hosts in Host Manager: " + hostManager.getSize() + "\n"); + sb.setLength(0); + log(sb + "" + now + ";" + overallBytesRead + ";" + overallTasksRun + ";" + urlsQueued + ";" + urlsWaiting + ";" + isWorkingOnMessage + ";" + urlsScopeFiltered + ";" + urlsVisitedFiltered + ";" + urlsREFiltered + ";" + memUsed + ";" + memFree + ";" + totalMem + ";" + nrHosts + ";" + visitedSize + ";" + visitedStringSize + ";" + rawBytesReadString + ";" + urlLengthFiltered); + + + if(!isWorkingOnMessage && (urlsQueued == 0) && (urlsWaiting == 0) && allThreadsIdle) + { + nothingReadCount++; + if(nothingReadCount > 3) + { + SimpleLoggerManager.getInstance().flush(); + System.exit(0); + } + + } + else + { + nothingReadCount = 0; + } + + this.setChanged(); + this.notifyObservers(); + + // Request Garbage Collection + monitorRunCount++; + + if(monitorRunCount % 6 == 0) + { + System.runFinalization(); + } + + if(monitorRunCount % 2 == 0) + { + System.gc(); + SimpleLoggerManager.getInstance().flush(); + } + + } + catch(Exception e) + { + System.out.println("Monitor: 
Exception: " + e.getClass().getName()); + e.printStackTrace(); + } + } + } + + /** + * start the thread + */ + public void start() + { + this.clear(); + thread.start(); + } + + /** + * interrupt the monitor thread + */ + public void interrupt() + { + thread.interrupt(); + } + + + public synchronized void clear() + { + //sampleTimeStamps.clear(); + /*for(int i=0; i < timeSamples.length; i++) + { + timeSamples[i].clear(); + } + */ + } + +/* public synchronized double getAverageReadCount(int maxPeriods) + { + int lastPeriod = bytesReadPerPeriod.size()-1; + int periods = Math.min(lastPeriod, maxPeriods); + if(periods < 2) + { + return 0.0; + } + + + long bytesLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).bytesRead; + long bytesBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).bytesRead; + long bytesRead = bytesLastPeriod - bytesBeforePeriod; + + long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue(); + long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1 - periods)).longValue(); + long duration = endTime - startTime; + System.out.println("bytes read: " + bytesRead + " duration in s: " + duration/1000.0 + " = " + ((double)bytesRead) / (duration/1000.0) + " per second"); + + return ((double)bytesRead) / (duration/1000.0); + } +*/ + + /*public synchronized double getDocsPerSecond(int maxPeriods) + { + int lastPeriod = bytesReadPerPeriod.size()-1; + int periods = Math.min(lastPeriod, maxPeriods); + if(periods < 2) + { + return 0.0; + } + + + long docsLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).docsRead; + long docsBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).docsRead; + long docsRead = docsLastPeriod - docsBeforePeriod; + + long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue(); + long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size() - periods)).longValue(); + long duration = endTime - startTime; + 
System.out.println("docs read: " + docsRead + " duration in s: " + duration/1000.0 + " = " + ((double)docsRead) / (duration/1000.0) + " per second"); + + return ((double)docsRead) / (duration/1000.0); + }*/ + + /** + * retrieves the number of threads whose byteCount is below the threshold + * @param maxPeriods the number of periods to look back + * @param threshold the number of bytes per second that acts as the threshold for a stalled thread + */ + /*public synchronized int getStalledThreadCount(int maxPeriods, double threshold) + { + int periods = Math.min(sampleTimeStamps.size(), maxPeriods); + int stalledThreads = 0; + int j=0, i=0; + if(periods > 1) + { + for(j=0; j newest.time) + { + newest = s; + } + } + } + return ((newest.bytesRead - oldest.bytesRead)/((newest.time - oldest.time)/1000.0)); + } + public double getAverageDocsRead() + { + Iterator i = bytesReadPerPeriod.iterator(); + Sample oldest = null; + Sample newest = null; + while(i.hasNext()) + { + + Sample s = (Sample)i.next(); + if(oldest == null) + { + oldest = newest = s; + } + else + { + if(s.time < oldest.time) + { + oldest = s; + } + else if(s.time > newest.time) + { + newest = s; + } + } + } + return ((newest.docsRead - oldest.docsRead)/((newest.time - oldest.time)/1000.0)); + } +} + + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLLengthFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLLengthFilter.java new file mode 100644 index 00000000000..61f49c448f4 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLLengthFilter.java @@ -0,0 +1,69 @@ +package de.lanlab.larm.fetcher; + +/** + * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c) + * Company: + * + * @author + * @created 28. Januar 2002 + * @version 1.0 + */ + +/** + * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c) + * Company: + * + * kills URLs longer than X characters. 
Used to prevent endless loops where + * the page contains the current URL + some extension + * + * @author Clemens Marschner + * @created 28. Januar 2002 + */ + +public class URLLengthFilter extends Filter implements MessageListener +{ + /** + * called by the message handler + * + * @param handler the handler + */ + public void notifyAddedToMessageHandler(MessageHandler handler) + { + this.messageHandler = handler; + } + + + MessageHandler messageHandler; + + int maxLength; + + + /** + * Constructor for the URLLengthFilter object + * + * @param maxLength max length of the _total_ URL (protocol+host+port+path) + */ + public URLLengthFilter(int maxLength) + { + this.maxLength = maxLength; + } + + + /** + * handles the message + * + * @param message Description of the Parameter + * @return the original message or NULL if the URL was too long + */ + public Message handleRequest(Message message) + { + URLMessage m = (URLMessage) message; + String file = m.getUrl().getFile(); + if (file != null && file.length() > maxLength) // path + query + { + filtered++; + return null; + } + return message; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java new file mode 100644 index 00000000000..24973f93929 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java @@ -0,0 +1,87 @@ +package de.lanlab.larm.fetcher; + +import java.net.*; +import java.io.*; +import de.lanlab.larm.util.URLUtils; + +/** + * represents a URL which is passed around in the messageHandler + */ +public class URLMessage implements Message, Serializable +{ + /** + * the URL + */ + protected URL url; + protected String urlString; + + protected URL referer; + protected String refererString; + boolean isFrame; + + public URLMessage(URL url, URL referer, boolean isFrame) + { + //super(); + this.url = url; + this.urlString = url != null ? 
URLUtils.toExternalFormNoRef(url) : null; + + this.referer = referer; + this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null; + this.isFrame = isFrame; + //System.out.println("" + refererString + " -> " + urlString); + } + + public URL getUrl() + { + return this.url; + } + + public URL getReferer() + { + return this.referer; + } + + + public String toString() + { + return urlString; + } + + public String getURLString() + { + return urlString; + } + + public String getRefererString() + { + return refererString; + } + + + public int hashCode() + { + return url.hashCode(); + } + + private void writeObject(java.io.ObjectOutputStream out) throws IOException + { + out.writeObject(url); + out.writeObject(referer); + out.writeBoolean(isFrame); + } + + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException + { + url = (URL)in.readObject(); + referer = (URL)in.readObject(); + urlString = url.toExternalForm(); + refererString = referer.toExternalForm(); + isFrame = in.readBoolean(); + } + + public String getInfo() + { + return (referer != null ? refererString : "") + "\t" + urlString + "\t" + (isFrame ? "1" : "0"); + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java new file mode 100644 index 00000000000..66d66fd5a94 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java @@ -0,0 +1,75 @@ +package de.lanlab.larm.fetcher; + +import org.apache.oro.text.regex.Perl5Matcher; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Pattern; + +/** + * Filter-Klasse; prüft eine eingegangene Message auf Einhaltung eines + * regulären Ausdrucks. 
Wenn die URL diesem Ausdruck + * nicht entspricht, wird sie verworfen + * @author Clemens Marschner + */ +class URLScopeFilter extends Filter implements MessageListener +{ + public void notifyAddedToMessageHandler(MessageHandler handler) + { + this.messageHandler = handler; + } + MessageHandler messageHandler; + + /** + * the regular expression which describes a valid URL + */ + private Pattern pattern; + private Perl5Matcher matcher; + private Perl5Compiler compiler; + + public URLScopeFilter() + { + matcher = new Perl5Matcher(); + compiler = new Perl5Compiler(); + } + + public String getRexString() + { + return pattern.toString(); + } + + /** + * set the regular expression + * @param rexString the expression + */ + public void setRexString(String rexString) throws org.apache.oro.text.regex.MalformedPatternException + { + this.pattern = compiler.compile(rexString, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.SINGLELINE_MASK); + //System.out.println("pattern set to: " + pattern); + } + + + /** + * this method will be called by the message handler. 
Tests the URL + * and throws it out if it's not in the scope + */ + public Message handleRequest(Message message) + { + if(message instanceof URLMessage) + { + String urlString = ((URLMessage)message).toString(); + int length = urlString.length(); + char buffer[] = new char[length]; + urlString.getChars(0,length,buffer,0); + + //System.out.println("using pattern: " + pattern); + boolean match = matcher.matches(buffer, pattern); + if(!match) + { + //System.out.println("not in Scope: " + urlString); + filtered++; + return null; + } + } + return message; + } + +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java new file mode 100644 index 00000000000..0c9ba7cb75b --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLVisitedFilter.java @@ -0,0 +1,114 @@ +package de.lanlab.larm.fetcher; + +import java.net.URL; +import java.util.*; + +import de.lanlab.larm.util.SimpleLogger; + +/** + * contains a HashMap of all URLs already passed. Adds each URL to that list, or + * consumes it if it is already present + * + * @todo find ways to reduce memory consumption here. the approach is somewhat naive + * + * @author Clemens Marschner + * @created 3. 
Januar 2002 + */ +class URLVisitedFilter extends Filter implements MessageListener +{ + + /** + * Description of the Method + * + * @param handler Description of the Parameter + */ + public void notifyAddedToMessageHandler(MessageHandler handler) + { + this.messageHandler = handler; + } + + + MessageHandler messageHandler; + + SimpleLogger log; + + HashSet urlHash; + + static Boolean dummy = new Boolean(true); + + + + /** + * Constructor for the URLVisitedFilter object + * + * @param initialHashCapacity Description of the Parameter + */ + public URLVisitedFilter(int initialHashCapacity, SimpleLogger log) + { + urlHash = new HashSet(initialHashCapacity); + this.log = log; + //urlVector = new Vector(initialHashCapacity); + } + + + /** + * clears everything + */ + public void clearHashtable() + { + urlHash.clear(); + // urlVector.clear(); + } + + + + /** + * @param message Description of the Parameter + * @return Description of the Return Value + */ + public Message handleRequest(Message message) + { + if (message instanceof URLMessage) + { + URLMessage urlMessage = ((URLMessage) message); + URL url = urlMessage.getUrl(); + String urlString = urlMessage.getURLString(); + if (urlHash.contains(urlString)) + { + //System.out.println("URLVisitedFilter: " + urlString + " already present."); + filtered++; + if(log != null) + { + log.logThreadSafe(urlMessage.getInfo()); + } + return null; + } + else + { + // System.out.println("URLVisitedFilter: " + urlString + " not present yet."); + urlHash.add(urlString); + stringSize += urlString.length(); // see below + //urlVector.add(urlString); + } + } + return message; + } + + + private int stringSize = 0; + + /** + * just a method to get a rough number of characters contained in the array + * with that you see that the total memory is mostly used by this class + */ + public int getStringSize() + { + return stringSize; + } + + public int size() + { + return urlHash.size(); + } + +} diff --git 
a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/graph/DistanceCount.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/graph/DistanceCount.java new file mode 100644 index 00000000000..444523ff6b2 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/graph/DistanceCount.java @@ -0,0 +1,875 @@ +package de.lanlab.larm.graph; + +/** + * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c) + * Company: + * + * @author + * @version 1.0 + */ + +import java.io.*; +import java.util.*; + +/** + * Description of the Class + * + * @author Administrator + * @created 30. Januar 2002 + */ +class Node implements Comparable +{ + LinkedList incoming; + // 16 + 4 per entry + //HashSet incomingNodes; // 16 + 16 per entry, 11 x 16 default size = 192 + LinkedList outgoing; + // 16 + 4 per entry + //Object o; + //HashSet outgoingNodes; // 16 + 16 per entry, 11 x 16 default size = 192 + + //LinkedList shortestIncoming; + int id; + // 4 + float distance; + // 8 + String name; + // 4 + String object + String title; + // 4 + String object + float nodeRank[] = new float[2]; + // 16 + // 470 bytes + 2 string objects + /** + * Description of the Field + */ + public static int sortType = 0; + + + /** + * Description of the Method + * + * @param n Description of the Parameter + * @return Description of the Return Value + */ + public int compareTo(Object n) + { + if (sortType < 2) + { + double diff = ((Node) n).nodeRank[sortType] - nodeRank[sortType]; + return diff < 0 ? -1 : diff > 0 ? 
1 : 0; + } + else + { + return (((Node) n).incoming.size() - incoming.size()); + } + } + + + /** + * Constructor for the Node object + * + * @param id Description of the Parameter + * @param name Description of the Parameter + * @param title Description of the Parameter + */ + public Node(int id, String name, String title) + { + this.id = id; + this.name = name; + this.title = title; + this.incoming = new LinkedList(); + this.outgoing = new LinkedList(); + //this.incomingNodes = new HashSet(); + //this.outgoingNodes = new HashSet(); + this.distance = Float.MAX_VALUE; + this.nodeRank[0] = this.nodeRank[1] = 1; + } + + + /** + * Adds a feature to the Incoming attribute of the Node object + * + * @param incomingT The feature to be added to the Incoming attribute + * @return Description of the Return Value + */ + public boolean addIncoming(Transition incomingT) + { + Integer id = new Integer(incomingT.getFrom().id); + if (!incoming.contains(id)) + { + // attn: doesn't scale well, but also saves memory + + incoming.addLast(incomingT); + //incomingNodes.add(id); + return true; + } + else + { + return false; + } + } + + + /** + * Adds a feature to the Outgoing attribute of the Node object + * + * @param outgoingT The feature to be added to the Outgoing attribute + * @return Description of the Return Value + */ + public boolean addOutgoing(Transition outgoingT) + { + Integer id = new Integer(outgoingT.getTo().id); + if (!outgoing.contains(id)) + { + outgoing.addLast(outgoingT); + //outgoingNodes.add(id); + return true; + } + else + { + return false; + } + } + + + /** + * Gets the incoming attribute of the Node object + * + * @return The incoming value + */ + public LinkedList getIncoming() + { + return incoming; + } + + + /** + * Gets the outgoing attribute of the Node object + * + * @return The outgoing value + */ + public LinkedList getOutgoing() + { + return outgoing; + } + + + /** + * Sets the distance attribute of the Node object + * + * @param distance The new 
distance value + */ + public void setDistance(float distance) + { + this.distance = distance; + } + + + /** + * Gets the distance attribute of the Node object + * + * @return The distance value + */ + public float getDistance() + { + return distance; + } + + + /** + * Gets the name attribute of the Node object + * + * @return The name value + */ + public String getName() + { + return name; + } + + + /** + * Sets the title attribute of the Node object + * + * @param title The new title value + */ + public void setTitle(String title) + { + this.title = title; + } + + + /** + * Gets the title attribute of the Node object + * + * @return The title value + */ + public String getTitle() + { + return title; + } + + + /** + * Gets the nodeRank attribute of the Node object + * + * @param idx Description of the Parameter + * @return The nodeRank value + */ + public float getNodeRank(int idx) + { + return nodeRank[idx]; + } + + + /** + * Sets the nodeRank attribute of the Node object + * + * @param nodeRank The new nodeRank value + * @param idx The new nodeRank value + */ + public void setNodeRank(float nodeRank, int idx) + { + this.nodeRank[idx] = nodeRank; + } + +} + +/** + * Description of the Class + * + * @author Administrator + * @created 30. 
Januar 2002 + */ +class Transition +{ + + + Node from; + Node to; + float distance; + float linkRank[] = new float[2]; + boolean isFrame; + + + /** + * Constructor for the Transition object + * + * @param from Description of the Parameter + * @param to Description of the Parameter + * @param isFrame Description of the Parameter + */ + public Transition(Node from, Node to, boolean isFrame) + { + LinkedList l = from.getOutgoing(); + Iterator i = l.iterator(); + while(i.hasNext()) + { + Transition t = (Transition)i.next(); + if(t.getTo() == to) + { + return; // schon enthalten + } + } + this.from = from; + this.to = to; + from.addOutgoing(this); + to.addIncoming(this); + this.distance = Integer.MAX_VALUE; + this.isFrame = isFrame; + this.linkRank[0] = this.linkRank[1] = 1; + } + + + /** + * Gets the to attribute of the Transition object + * + * @return The to value + */ + public Node getTo() + { + return to; + } + + + /** + * Gets the from attribute of the Transition object + * + * @return The from value + */ + public Node getFrom() + { + return from; + } + + + /** + * Gets the distance attribute of the Transition object + * + * @return The distance value + */ + public float getDistance() + { + return distance; + } + + + /** + * Sets the distance attribute of the Transition object + * + * @param distance The new distance value + */ + public void setDistance(float distance) + { + this.distance = distance; + } + + + /** + * Gets the frame attribute of the Transition object + * + * @return The frame value + */ + public boolean isFrame() + { + return isFrame; + } + + + /** + * Gets the linkRank attribute of the Transition object + * + * @param idx Description of the Parameter + * @return The linkRank value + */ + public float getLinkRank(int idx) + { + return linkRank[idx]; + } + + + /** + * Sets the linkRank attribute of the Transition object + * + * @param linkRank The new linkRank value + * @param idx The new linkRank value + */ + public void setLinkRank(float 
linkRank, int idx) + { + this.linkRank[idx] = linkRank; + } +} + +/** + * Description of the Class + * + * @author Administrator + * @created 30. Januar 2002 + */ +public class DistanceCount +{ + + + HashMap nodes = new HashMap(100000); + LinkedList nodesToDo = new LinkedList(); + static int id = 0; + + + /** + * Gets the orCreateNode attribute of the DistanceCount object + * + * @param name Description of the Parameter + * @param title Description of the Parameter + * @return The orCreateNode value + */ + Node getOrCreateNode(String name, String title) + { + Node node = (Node) nodes.get(name); + if (node != null) + { + if (title != null) + { + node.setTitle(title); + } + return node; + } + else + { + node = new Node(id++, name, title); + nodes.put(name, node); + return node; + } + } + + + /** + * Constructor for the DistanceCount object + * + * @param filename Description of the Parameter + * @exception IOException Description of the Exception + */ + public DistanceCount(String filename) + throws IOException + { + System.out.println("reading file..."); + long t1 = System.currentTimeMillis(); + BufferedReader b = new BufferedReader(new FileReader(filename)); + String line; + boolean firstNotFound = true; + Node firstNode = null; + int lines = 0; + while ((line = b.readLine()) != null) + { + lines++; + String title = null; + try + { + //StringTokenizer st = new StringTokenizer(line, " "); + StringTokenizer st = new StringTokenizer(line, "\t"); + String from = st.nextToken(); + if (from.endsWith("/")) + { + from = from.substring(0, from.length() - 1); + } + from = from.toLowerCase(); + String to = st.nextToken(); + if (to.endsWith("/")) + { + to = to.substring(0, to.length() - 1); + } + to = to.toLowerCase(); + boolean isFrame = (Integer.parseInt(st.nextToken()) == 1); + if (st.countTokens() > 3) + { + title = ""; + //StringBuffer sb = new StringBuffer(); + st.nextToken(); + // result + st.nextToken(); + // Mime Type + st.nextToken(); + // Size + /* + * 
while(st.hasMoreTokens()) + * { + * sb.append(st.nextToken()).append(" "); + * } + */ + title = st.nextToken(); + if (title.length() > 2) + { + + title = title.substring(1, title.length() - 1); + int indexOfPara = title.indexOf("\""); + if (indexOfPara > -1) + { + title = title.substring(0, indexOfPara); + } + } + } + Node fromNode = getOrCreateNode(from, null); + Node toNode = getOrCreateNode(to, title); + Transition t = new Transition(fromNode, toNode, isFrame); + /* + * if(firstNotFound && to.equals("http://127.0.0.1")) + * { + * firstNode = toNode; + * firstNotFound = false; + * } + */ + if (lines % 10000 == 0) + { + System.out.println("" + lines + " Lines; " + nodes.size() + " nodes"); + } + } + catch (NoSuchElementException e) + { + System.out.println("Malformed line " + lines + ": field number doesn't match"); + } + catch (NumberFormatException e) + { + System.out.println("Malformed line " + lines + ": NumberFormat wrong"); + } + } + System.out.println("finished; b" + lines + " Lines; " + nodes.size() + " nodes"); + long t2 = System.currentTimeMillis(); + System.out.println("" + (t2 - t1) + " ms"); + + /* + * if(firstNotFound) + * { + * System.out.println("Couldn't find start page"); + * System.exit(-1); + * } + */ + } + + + /** + * Description of the Method + * + * @param firstNode Description of the Parameter + */ + public void calculateShortestDistance(Node firstNode) + { + clearDistances(); + firstNode.setDistance(0); + nodesToDo.addLast(firstNode); + int calculations = 0; + while (!nodesToDo.isEmpty()) + { + if (calculations % 100000 == 0) + { + System.out.println("Calculations: " + calculations + "; nodes to go: " + nodesToDo.size() + " total Mem: " + Runtime.getRuntime().totalMemory() + "; free mem: " + Runtime.getRuntime().freeMemory()); + } + calculations++; + + Node act = (Node) nodesToDo.removeFirst(); + LinkedList outTrans = act.getOutgoing(); + float distance = act.getDistance(); + Iterator i = outTrans.iterator(); + //distance++; + + while 
(i.hasNext()) + { + Transition t = (Transition) i.next(); + float transDistance = t.getDistance(); + /*if (t.isFrame()) + { + System.out.println("Frame from " + t.from.getName() + " to " + t.to.getName()); + }*/ + float newDistance = distance + (t.isFrame() ? 0.25f : 1f); + if (transDistance > newDistance) + { + t.setDistance(newDistance); + Node to = t.getTo(); + if (to.distance > distance) + { + to.setDistance(newDistance); + nodesToDo.addLast(to); + } + } + } + /* + * if(looksGood) + * { + * System.out.println("Node " + act.id + " looks good"); + * } + */ + } + System.out.println("Calculations: " + calculations ); + + } + + + public void clearDistances() + { + System.out.println("Clearing distance data..."); + Iterator it = nodes.values().iterator(); + int nr = 0; + while (it.hasNext()) + { + Node n = (Node) it.next(); + nr++; + n.setDistance(Float.MAX_VALUE); + } + System.out.println("cleared " + nr + " nodes. done"); + + } + /** + * Description of the Method + * + * @param nodeFrom Description of the Parameter + * @param nodeTo Description of the Parameter + */ + public void printDistance(String nodeFrom, String nodeTo) + { + + Node firstNode = (Node) nodes.get(nodeFrom); + if (firstNode == null) + { + System.out.println("FROM node not found"); + return; + } + Node toNode = (Node) nodes.get(nodeTo); + if (toNode == null) + { + System.out.println("TO node not found"); + return; + } + //System.out.println("resetting node distance..."); + //clearDistances(); + + System.out.println("calculating..."); + calculateShortestDistance(firstNode); + + //t1 = System.currentTimeMillis(); + //System.out.println("" + (t1-t2) + " ms"); + + + System.out.println("\nSorting..."); + + /* + * Collection nodeCollection = nodes.values(); + * Object[] nodeArray = nodeCollection.toArray(); + * Arrays.sort(nodeArray); + * t2 = System.currentTimeMillis(); + * System.out.println("" + (t2-t1) + " ms"); + * int from = 0; + * int to = 1; + */ + /* + * /calculate page Rank + * for(int i = 0; 
i< 1; i++) + * { + * from = i%2; + * to = (i+1) % 2; + * for(int j = 0; j 0) + * { + * float linkRank = pageRank / size; + * it = out.iterator(); + * while(it.hasNext()) + * { + * Transition t = (Transition)it.next(); + * t.setLinkRank(linkRank, to); + * } + * } + * } + * } + */ + /* + * System.out.println("\nLink Count:"); + * for(int i=0; i<10; i++) + * { + * Node n = ((Node)nodeArray[i]); + * System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to)); + * } + * for(int i=nodeArray.length/2; i"); + } + else + { + System.out.print(spaces + "+- " + n.name + " (" + (n.getTitle() != null ? n.getTitle().substring(0,Math.min(n.getTitle().length(),25)) : "") + "\") D:" + n.distance + "; L:" + n.getIncoming().size() + "; C:" + linkCount); + Iterator it = n.getIncoming().iterator(); + float dist = n.distance; + if (dist > 10000000) + { + System.out.println(spaces + "\n--no link--"); + return; + } + while (it.hasNext()) + { + Transition t = (Transition) it.next(); + if (t.distance <= dist) + { + if (t.isFrame()) + { + System.out.println(" **F** ->"); + } + else + { + System.out.println(" -> "); + } + printShortestRoute(t.getFrom(), indent + 1, linkCount + n.getIncoming().size()); + } + } + } + //System.out.println(""); + } + + + /** + * this class reads in store.log, constructs a graph of the crawled web and is able + * to perform a breadth-first search for the shortest distance between two nodes
+ * Note: this is experimental stuff. get into the source code to see how it works + * @param args args[0] must point to the store.log file + */ + public static void main(String[] args) + { + // Syntax: DistanceCount + try + { + DistanceCount dc = new DistanceCount(args[0]); + boolean running = true; + BufferedReader in = new BufferedReader(new InputStreamReader(System.in),400); + while (running) + { + System.out.print("\n\nCommand (? for help) > "); + String newL; + String input = ""; + //while((newL = in.readLine()) != null) + //{ + input = in.readLine(); + StringTokenizer st = new StringTokenizer(input," "); + String command; + boolean printHelp = false; + + if (!st.hasMoreTokens()) + { + printHelp = true; + command = "?"; + } + else + { + command = st.nextToken(); + } + + try + { + if ("?".equals(command)) + { + printHelp = true; + } + else if ("d".equals(command)) + { + String from = st.nextToken(); + String to = st.nextToken(); + dc.printDistance(from ,to); + } + else if ("q".equals(command)) + { + running = false; + } + else if ("r".equals(command)) + { + dc.printRandomRoute(); + } + else + { + System.out.println("unknown command '" + command + "'"); + } + } + catch (java.util.NoSuchElementException e) + { + System.out.println("Syntax error"); + e.printStackTrace(); + printHelp = true; + } + catch(Exception e) + { + e.printStackTrace(); + } + + if (printHelp) + { + System.out.println("\nSyntax\n" + + "? print this help message\n" + + "d print shortest route from page1 to page2\n" + + "r print random walk\n" + + "q quit"); + + } + } + + } + catch (IOException e) + { + e.printStackTrace(); + } + catch (ArrayIndexOutOfBoundsException e) + { + System.out.println("Syntax: java ... 
store.log"); + } + + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/AboutDialog.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/AboutDialog.java new file mode 100644 index 00000000000..e2a1137faaa --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/AboutDialog.java @@ -0,0 +1,154 @@ +package de.lanlab.larm.gui; + +/* + A basic extension of the java.awt.Dialog class + */ + +import java.awt.*; + +public class AboutDialog extends Dialog { + + public AboutDialog(Frame parent, boolean modal) + { + super(parent, modal); + + // This code is automatically generated by Visual Cafe when you add + // components to the visual environment. It instantiates and initializes + // the components. To modify the code, only use code syntax that matches + // what Visual Cafe can generate, or Visual Cafe may be unable to back + // parse your Java file into its visual environment. + + //{{INIT_CONTROLS + setLayout(null); + setSize(249,150); + setVisible(false); + label1.setText("LARM - LANLab Retrieval Machine"); + add(label1); + label1.setBounds(12,12,228,24); + okButton.setLabel("OK"); + add(okButton); + okButton.setBounds(95,85,66,27); + label2.setText("(C) 2000 Clemens Marschner"); + add(label2); + label2.setBounds(12,36,228,24); + setTitle("AWT-Anwendung - Info"); + //}} + + //{{REGISTER_LISTENERS + SymWindow aSymWindow = new SymWindow(); + this.addWindowListener(aSymWindow); + SymAction lSymAction = new SymAction(); + okButton.addActionListener(lSymAction); + //}} + + } + + public AboutDialog(Frame parent, String title, boolean modal) + { + this(parent, modal); + setTitle(title); + } + + public void addNotify() + { + // Record the size of the window prior to calling parents addNotify. + Dimension d = getSize(); + + super.addNotify(); + + // Only do this once. 
+ if (fComponentsAdjusted) + return; + + // Adjust components according to the insets + Insets insets = getInsets(); + setSize(insets.left + insets.right + d.width, insets.top + insets.bottom + d.height); + Component components[] = getComponents(); + for (int i = 0; i < components.length; i++) + { + Point p = components[i].getLocation(); + p.translate(insets.left, insets.top); + components[i].setLocation(p); + } + + // Used for addNotify check. + fComponentsAdjusted = true; + } + + public void setVisible(boolean b) + { + if (b) + { + Rectangle bounds = getParent().getBounds(); + Rectangle abounds = getBounds(); + + setLocation(bounds.x + (bounds.width - abounds.width)/ 2, + bounds.y + (bounds.height - abounds.height)/2); + } + + super.setVisible(b); + } + + //{{DECLARE_CONTROLS + java.awt.Label label1 = new java.awt.Label(); + java.awt.Button okButton = new java.awt.Button(); + java.awt.Label label2 = new java.awt.Label(); + //}} + + // Used for addNotify check. + boolean fComponentsAdjusted = false; + + class SymAction implements java.awt.event.ActionListener + { + public void actionPerformed(java.awt.event.ActionEvent event) + { + Object object = event.getSource(); + if (object == okButton) + okButton_ActionPerformed(event); + } + } + + void okButton_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. + + okButton_ActionPerformed_Interaction1(event); + } + + + void okButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + this.dispose(); + } catch (Exception e) { + } + } + + + class SymWindow extends java.awt.event.WindowAdapter + { + public void windowClosing(java.awt.event.WindowEvent event) + { + Object object = event.getSource(); + if (object == AboutDialog.this) + AboutDialog_WindowClosing(event); + } + } + + void AboutDialog_WindowClosing(java.awt.event.WindowEvent event) + { + // to do: code goes here. 
+ + AboutDialog_WindowClosing_Interaction1(event); + } + + + void AboutDialog_WindowClosing_Interaction1(java.awt.event.WindowEvent event) + { + try { + this.dispose(); + } catch (Exception e) { + } + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherFrame.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherFrame.java new file mode 100644 index 00000000000..a3d8dd242ee --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherFrame.java @@ -0,0 +1,485 @@ +package de.lanlab.larm.gui; + +/* + This simple extension of the java.awt.Frame class + contains all the elements necessary to act as the + main window of an application. + */ + +import java.awt.*; +import java.awt.event.ActionListener; +//import com.sun.java.swing.*; + +public class FetcherFrame extends Frame +{ + public FetcherFrame() + { + // This code is automatically generated by Visual Cafe when you add + // components to the visual environment. It instantiates and initializes + // the components. To modify the code, only use code syntax that matches + // what Visual Cafe can generate, or Visual Cafe may be unable to back + // parse your Java file into its visual environment. 
+ + //{{INIT_CONTROLS + setLayout(new BorderLayout(0,0)); + setSize(800,600); + setVisible(false); + openFileDialog1.setMode(FileDialog.LOAD); + openFileDialog1.setTitle("Öffnen"); + //$$ openFileDialog1.move(24,312); + mainPanelWithBorders.setLayout(new BorderLayout(0,0)); + add("Center", mainPanelWithBorders); + mainPanelWithBorders.setBounds(0,0,800,600); + northBorder.setLayout(null); + mainPanelWithBorders.add("North", northBorder); + northBorder.setBackground(java.awt.Color.lightGray); + northBorder.setBounds(0,0,800,3); + southBorder.setLayout(null); + mainPanelWithBorders.add("South", southBorder); + southBorder.setBackground(java.awt.Color.lightGray); + southBorder.setBounds(0,597,800,3); + westBorder.setLayout(null); + mainPanelWithBorders.add("West", westBorder); + westBorder.setBackground(java.awt.Color.lightGray); + westBorder.setBounds(0,3,3,594); + eastBorder.setLayout(null); + mainPanelWithBorders.add("East", eastBorder); + eastBorder.setBackground(java.awt.Color.lightGray); + eastBorder.setBounds(797,3,3,594); + mainPanel.setLayout(new BorderLayout(0,3)); + mainPanelWithBorders.add("Center", mainPanel); + mainPanel.setBackground(java.awt.Color.lightGray); + mainPanel.setBounds(3,3,794,594); + upperPanel.setLayout(new GridLayout(1,2,0,0)); + mainPanel.add("North", upperPanel); + upperPanel.setBounds(0,0,794,150); + preferencesPanel.setLayout(null); + upperPanel.add(preferencesPanel); + preferencesPanel.setBounds(0,0,397,150); + startURLlabel.setText("Start-URL"); + preferencesPanel.add(startURLlabel); + startURLlabel.setBounds(12,0,121,24); + startURL.setText("uni-muenchen.de"); + preferencesPanel.add(startURL); + startURL.setBounds(132,0,133,24); + startButton.setLabel("Start"); + preferencesPanel.add(startButton); + startButton.setFont(new Font("Dialog", Font.BOLD, 12)); + startButton.setBounds(288,36,99,24); + restrictToLabel.setText("Restrict host to"); + preferencesPanel.add(restrictToLabel); + restrictToLabel.setBounds(12,36,121,28); + 
preferencesPanel.add(restrictTo); + restrictTo.setBounds(133,36,133,24); + logPanel.setLayout(new BorderLayout(0,0)); + upperPanel.add(logPanel); + logPanel.setBounds(397,0,397,150); + logPanel.add("Center", logList); + logList.setBackground(java.awt.Color.white); + logList.setBounds(0,0,397,150); + lowerPanel.setLayout(new GridLayout(1,3,3,3)); + mainPanel.add("Center", lowerPanel); + lowerPanel.setBounds(0,153,794,441); + urlQueuePanel.setLayout(new BorderLayout(0,0)); + lowerPanel.add(urlQueuePanel); + urlQueuePanel.setBounds(0,0,196,441); + urlQueueLabel.setText("URLQueue"); + urlQueuePanel.add("North", urlQueueLabel); + urlQueueLabel.setBounds(0,0,196,23); + urlQueuePanel.add("Center", urlQueueList); + urlQueueList.setBackground(java.awt.Color.white); + urlQueueList.setBounds(0,23,196,418); + urlThreadPanel.setLayout(new BorderLayout(0,0)); + lowerPanel.add(urlThreadPanel); + urlThreadPanel.setBounds(199,0,196,441); + urlThreadLabel.setText("URLThreads"); + urlThreadPanel.add("North", urlThreadLabel); + urlThreadLabel.setBounds(0,0,196,23); + urlThreadPanel.add("Center", urlThreadList); + urlThreadList.setBackground(java.awt.Color.white); + urlThreadList.setBounds(0,23,196,418); + docQueuePanel.setLayout(new BorderLayout(0,0)); + lowerPanel.add(docQueuePanel); + docQueuePanel.setBounds(398,0,196,441); + docQueueLabel.setText("DocQueue"); + docQueuePanel.add("North", docQueueLabel); + docQueueLabel.setBounds(0,0,196,23); + docQueuePanel.add("Center", docQueueList); + docQueueList.setBackground(java.awt.Color.white); + docQueueList.setBounds(0,23,196,418); + docThreadPanel.setLayout(new BorderLayout(0,0)); + lowerPanel.add(docThreadPanel); + docThreadPanel.setBounds(597,0,196,441); + docThreadLabel.setText("DocThreads"); + docThreadPanel.add("North", docThreadLabel); + docThreadLabel.setBounds(0,0,196,23); + docThreadPanel.add("Center", docThreadList); + docThreadList.setBackground(java.awt.Color.white); + docThreadList.setBounds(0,23,196,418); + setTitle("LARM 
- Fetcher"); + //}} + + //{{INIT_MENUS + menu1.setLabel("Datei"); + menu1.add(newMenuItem); + newMenuItem.setEnabled(false); + newMenuItem.setLabel("Neu"); + newMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_N,false)); + menu1.add(openMenuItem); + openMenuItem.setLabel("Öffnen..."); + openMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_O,false)); + menu1.add(saveMenuItem); + saveMenuItem.setEnabled(false); + saveMenuItem.setLabel("Speichern"); + saveMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_S,false)); + menu1.add(saveAsMenuItem); + saveAsMenuItem.setEnabled(false); + saveAsMenuItem.setLabel("Speichern unter..."); + menu1.add(separatorMenuItem); + separatorMenuItem.setLabel("-"); + menu1.add(exitMenuItem); + exitMenuItem.setLabel("Beenden"); + mainMenuBar.add(menu1); + menu2.setLabel("Bearbeiten"); + menu2.add(cutMenuItem); + cutMenuItem.setEnabled(false); + cutMenuItem.setLabel("Ausschneiden"); + cutMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_X,false)); + menu2.add(copyMenuItem); + copyMenuItem.setEnabled(false); + copyMenuItem.setLabel("Kopieren"); + copyMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_C,false)); + menu2.add(pasteMenuItem); + pasteMenuItem.setEnabled(false); + pasteMenuItem.setLabel("Einfügen"); + pasteMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_V,false)); + mainMenuBar.add(menu2); + menu3.setLabel("Hilfe"); + menu3.add(aboutMenuItem); + aboutMenuItem.setLabel("Info..."); + mainMenuBar.add(menu3); + //$$ mainMenuBar.move(0,312); + setMenuBar(mainMenuBar); + //}} + + //{{REGISTER_LISTENERS + SymWindow aSymWindow = new SymWindow(); + this.addWindowListener(aSymWindow); + SymAction lSymAction = new SymAction(); + openMenuItem.addActionListener(lSymAction); + exitMenuItem.addActionListener(lSymAction); + aboutMenuItem.addActionListener(lSymAction); + startButton.addActionListener(lSymAction); + //}} + } + + public FetcherFrame(String 
title) + { + this(); + setTitle(title); + } + + /** + * Shows or hides the component depending on the boolean flag b. + * @param b if true, show the component; otherwise, hide the component. + * @see java.awt.Component#isVisible + */ + public void setVisible(boolean b) + { + if(b) + { + setLocation(50, 50); + } + super.setVisible(b); + } + + static public void main(String args[]) + { + try + { + //Create a new instance of our application's frame, and make it visible. + (new FetcherFrame()).setVisible(true); + } + catch (Throwable t) + { + System.err.println(t); + t.printStackTrace(); + //Ensure the application exits with an error condition. + System.exit(1); + } + } + + public void addNotify() + { + // Record the size of the window prior to calling parents addNotify. + Dimension d = getSize(); + + super.addNotify(); + + if (fComponentsAdjusted) + return; + + // Adjust components according to the insets + setSize(getInsets().left + getInsets().right + d.width, getInsets().top + getInsets().bottom + d.height); + Component components[] = getComponents(); + for (int i = 0; i < components.length; i++) + { + Point p = components[i].getLocation(); + p.translate(getInsets().left, getInsets().top); + components[i].setLocation(p); + } + fComponentsAdjusted = true; + } + + // Used for addNotify check. 
+ boolean fComponentsAdjusted = false; + + //{{DECLARE_CONTROLS + java.awt.FileDialog openFileDialog1 = new java.awt.FileDialog(this); + java.awt.Panel mainPanelWithBorders = new java.awt.Panel(); + java.awt.Panel northBorder = new java.awt.Panel(); + java.awt.Panel southBorder = new java.awt.Panel(); + java.awt.Panel westBorder = new java.awt.Panel(); + java.awt.Panel eastBorder = new java.awt.Panel(); + java.awt.Panel mainPanel = new java.awt.Panel(); + java.awt.Panel upperPanel = new java.awt.Panel(); + java.awt.Panel preferencesPanel = new java.awt.Panel(); + java.awt.Label startURLlabel = new java.awt.Label(); + java.awt.TextField startURL = new java.awt.TextField(30); + java.awt.Button startButton = new java.awt.Button(); + java.awt.Label restrictToLabel = new java.awt.Label(); + java.awt.TextField restrictTo = new java.awt.TextField(); + java.awt.Panel logPanel = new java.awt.Panel(); + java.awt.List logList = new java.awt.List(8); + java.awt.Panel lowerPanel = new java.awt.Panel(); + java.awt.Panel urlQueuePanel = new java.awt.Panel(); + java.awt.Label urlQueueLabel = new java.awt.Label(); + java.awt.List urlQueueList = new java.awt.List(5); + java.awt.Panel urlThreadPanel = new java.awt.Panel(); + java.awt.Label urlThreadLabel = new java.awt.Label(); + java.awt.List urlThreadList = new java.awt.List(4); + java.awt.Panel docQueuePanel = new java.awt.Panel(); + java.awt.Label docQueueLabel = new java.awt.Label(); + java.awt.List docQueueList = new java.awt.List(4); + java.awt.Panel docThreadPanel = new java.awt.Panel(); + java.awt.Label docThreadLabel = new java.awt.Label(); + java.awt.List docThreadList = new java.awt.List(4); + //}} + + //{{DECLARE_MENUS + java.awt.MenuBar mainMenuBar = new java.awt.MenuBar(); + java.awt.Menu menu1 = new java.awt.Menu(); + java.awt.MenuItem newMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem openMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem saveMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem 
saveAsMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem separatorMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem exitMenuItem = new java.awt.MenuItem(); + java.awt.Menu menu2 = new java.awt.Menu(); + java.awt.MenuItem cutMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem copyMenuItem = new java.awt.MenuItem(); + java.awt.MenuItem pasteMenuItem = new java.awt.MenuItem(); + java.awt.Menu menu3 = new java.awt.Menu(); + java.awt.MenuItem aboutMenuItem = new java.awt.MenuItem(); + //}} + + class SymWindow extends java.awt.event.WindowAdapter + { + public void windowClosing(java.awt.event.WindowEvent event) + { + Object object = event.getSource(); + if (object == FetcherFrame.this) + FetcherFrame_WindowClosing(event); + } + } + + void FetcherFrame_WindowClosing(java.awt.event.WindowEvent event) + { + // to do: code goes here. + + FetcherFrame_WindowClosing_Interaction1(event); + } + + + void FetcherFrame_WindowClosing_Interaction1(java.awt.event.WindowEvent event) + { + try { + // QuitDialog Create and show as modal + (new QuitDialog(this, true)).setVisible(true); + } catch (Exception e) { + } + } + + + class SymAction implements java.awt.event.ActionListener + { + public void actionPerformed(java.awt.event.ActionEvent event) + { + Object object = event.getSource(); + if (object == openMenuItem) + openMenuItem_ActionPerformed(event); + else if (object == aboutMenuItem) + aboutMenuItem_ActionPerformed(event); + else if (object == exitMenuItem) + exitMenuItem_ActionPerformed(event); + else if (object == startButton) + startButton_ActionPerformed(event); + } + } + + void openMenuItem_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. 
+ + openMenuItem_ActionPerformed_Interaction1(event); + } + + + void openMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + // OpenFileDialog Create and show as modal + int defMode = openFileDialog1.getMode(); + String defTitle = openFileDialog1.getTitle(); + String defDirectory = openFileDialog1.getDirectory(); + String defFile = openFileDialog1.getFile(); + + openFileDialog1 = new java.awt.FileDialog(this, defTitle, defMode); + openFileDialog1.setDirectory(defDirectory); + openFileDialog1.setFile(defFile); + openFileDialog1.setVisible(true); + } catch (Exception e) { + } + } + + + void aboutMenuItem_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. + + aboutMenuItem_ActionPerformed_Interaction1(event); + } + + + void aboutMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + // AboutDialog Create and show as modal + (new AboutDialog(this, true)).setVisible(true); + } catch (Exception e) { + } + } + + + void exitMenuItem_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. + + exitMenuItem_ActionPerformed_Interaction1(event); + } + + + void exitMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + // QuitDialog Create and show as modal + (new QuitDialog(this, true)).setVisible(true); + } catch (Exception e) { + } + } + + + public void startButton_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. 
+ + } + + public void addUrlQueueItem(String item) + { + urlQueueList.add(item); + } + + public void removeUrlQueueItem(String item) + { + urlQueueList.remove(item); + } + public void addDocQueueItem(String item) + { + docQueueList.add(item); + } + + public void removeDocQueueItem(String item) + { + docQueueList.remove(item); + } + + public synchronized int addUrlThreadItem(String item) + { + urlThreadList.add(item); + return urlThreadList.getItemCount(); + } + + public synchronized int addUrlThreadItem(String item, int pos) + { + urlThreadList.add(item,pos); + return urlThreadList.getItemCount(); + } + + public void replaceUrlThreadItem(String item, int index) + { + urlThreadList.replaceItem(item,index); + } + + public synchronized int addDocThreadItem(String item) + { + docThreadList.add(item); + return docThreadList.getItemCount(); + } + + public void replaceDocThreadItem(String item, int index) + { + docThreadList.replaceItem(item,index); + } + + + + public void addLogEntry(String entry) + { + logList.add(entry); + logList.makeVisible(logList.getItemCount()-1); + } + + public void clearLog() + { + logList.removeAll(); + } + + public void addStartButtonListener(ActionListener a) + { + startButton.addActionListener(a); + } + + public String getRestrictTo() + { + return restrictTo.getText(); + } + public void setRestrictTo(String restrictTo) + { + this.restrictTo.setText(restrictTo); + } + public String getStartURL() + { + return startURL.getText(); + } + public void setStartURL(String startURL) + { + this.startURL.setText(startURL); + } + + //public void setInfoText(String text) + //{ + // thi + //} +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherSummaryFrame.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherSummaryFrame.java new file mode 100644 index 00000000000..405f9db7839 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/FetcherSummaryFrame.java @@ -0,0 +1,332 @@ + 
+/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.gui; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; + + +public class FetcherSummaryFrame extends JFrame +{ + JPanel lowerPanel = new JPanel(); + JPanel progressPanel = new JPanel(); + JPanel middlePanel = new JPanel(); + JPanel rightPanel = new JPanel(); + BorderLayout borderLayout1 = new BorderLayout(); + JPanel propertyPanel = new JPanel(); + JLabel hostLabel = new JLabel(); + JLabel urlRestrictionFrame = new JLabel(); + JTextField startURL = new JTextField(); + JTextField restrictTo = new JTextField(); + JButton startButton = new JButton(); + GridLayout gridLayout1 = new GridLayout(); + JProgressBar urlQueuedProgress = new JProgressBar(0,100); + JLabel urlQueuedLabel = new JLabel(); + JLabel scopeFilteredLabel = new JLabel(); + JProgressBar scopeFilteredProgress = new JProgressBar(0,100); + JLabel visitedFilteredLabel = new JLabel(); + JProgressBar visitedFilteredProgress = new JProgressBar(0,100); + JLabel workingThreadsLabel = new JLabel(); + JProgressBar workingThreadsProgress = new JProgressBar(0,100); + JLabel idleThreadsLabel = new JLabel(); + JProgressBar idleThreadsProgress = new JProgressBar(0,100); + JLabel busyThreadsLabel = new JLabel(); + JProgressBar busyThreadsProgress = new JProgressBar(0,100); + JLabel requestQueueLabel = new JLabel(); + JProgressBar requestQueueProgress = new JProgressBar(); + JLabel stalledThreadsLabel = new JLabel(); + JProgressBar stalledThreadsProgress = new JProgressBar(); + JLabel dnsLabel = new JLabel(); + JProgressBar dnsProgress = new JProgressBar(0,100); + JLabel freeMemLabel = new JLabel(); + JLabel freeMemText = new JLabel(); + JLabel totalMemLabel = new JLabel(); + JLabel totalMemText = new JLabel(); + JLabel bpsLabel = new JLabel(); + JLabel bpsText = new JLabel(); + JLabel docsLabel = new JLabel(); + JLabel docsText = new JLabel(); + JLabel docsReadLabel = new JLabel(); + JLabel docsReadText = new JLabel(); + JProgressBar 
urlsCaughtProgress = new JProgressBar(0,100); + JLabel urlsCaughtText = new JLabel(); + JLabel robotsTxtsText = new JLabel(); + JProgressBar robotsTxtsProgress = new JProgressBar(0,100); + + public FetcherSummaryFrame() + { + try + { + jbInit(); + this.setTitle("LARM - LANLab Retrieval Machine"); + this.setSize(new Dimension(640,350)); + this.urlQueuedProgress.setStringPainted(true); + this.urlQueuedProgress.setString("0"); + this.scopeFilteredProgress.setStringPainted(true); + this.scopeFilteredProgress.setString("0"); + this.visitedFilteredProgress.setStringPainted(true); + this.visitedFilteredProgress.setString("0"); + workingThreadsProgress.setStringPainted(true); + workingThreadsProgress.setString("0"); + idleThreadsProgress.setStringPainted(true); + idleThreadsProgress.setString("0"); + busyThreadsProgress.setStringPainted(true); + busyThreadsProgress.setString("0"); + stalledThreadsProgress.setStringPainted(true); + stalledThreadsProgress.setString("0"); + requestQueueProgress.setStringPainted(true); + requestQueueProgress.setString("0"); + dnsProgress.setStringPainted(true); + dnsProgress.setString("0"); + urlsCaughtProgress.setStringPainted(true); + urlsCaughtProgress.setString("0"); + robotsTxtsProgress.setStringPainted(true); + robotsTxtsProgress.setString("0"); + } + catch(Exception e) + { + e.printStackTrace(); + } + } + + private void jbInit() throws Exception + { + this.getContentPane().setLayout(borderLayout1); + propertyPanel.setMinimumSize(new Dimension(10, 70)); + propertyPanel.setPreferredSize(new Dimension(10, 80)); + propertyPanel.setLayout(null); + hostLabel.setText("Startseite"); + hostLabel.setBounds(new Rectangle(18, 15, 76, 17)); + urlRestrictionFrame.setText("URL-Restriction (regul. 
Ausdruck)"); + urlRestrictionFrame.setBounds(new Rectangle(18, 37, 208, 17)); + startURL.setBounds(new Rectangle(224, 14, 281, 21)); + restrictTo.setBounds(new Rectangle(224, 38, 281, 21)); + startButton.setActionCommand("start"); + startButton.setText("Start"); + startButton.setBounds(new Rectangle(528, 14, 79, 47)); + lowerPanel.setLayout(gridLayout1); + urlQueuedLabel.setToolTipText(""); + urlQueuedLabel.setText("URLs queued"); + scopeFilteredLabel.setToolTipText(""); + scopeFilteredLabel.setText("Scope-gefiltert"); + visitedFilteredLabel.setText("Visited gefiltert"); + workingThreadsLabel.setText("Number of Working Threads"); + idleThreadsLabel.setText("Idle Threads"); + busyThreadsLabel.setText("Busy Threads"); + requestQueueLabel.setText("requests queued"); + stalledThreadsLabel.setText("stalled Threads"); + stalledThreadsProgress.setPreferredSize(new Dimension(190, 25)); + requestQueueProgress.setPreferredSize(new Dimension(190, 25)); + busyThreadsProgress.setPreferredSize(new Dimension(190, 25)); + idleThreadsProgress.setPreferredSize(new Dimension(190, 25)); + workingThreadsProgress.setPreferredSize(new Dimension(190, 25)); + urlQueuedProgress.setPreferredSize(new Dimension(190, 25)); + scopeFilteredProgress.setPreferredSize(new Dimension(190, 25)); + visitedFilteredProgress.setPreferredSize(new Dimension(190, 25)); + dnsLabel.setText("DNS Hosts cached"); + dnsProgress.setPreferredSize(new Dimension(190, 25)); + freeMemLabel.setText("Free Mem"); + freeMemLabel.setPreferredSize(new Dimension(60, 17)); + freeMemText.setText("0"); + freeMemText.setPreferredSize(new Dimension(120, 17)); + freeMemText.setMinimumSize(new Dimension(100, 17)); + totalMemLabel.setText("total Mem"); + totalMemLabel.setPreferredSize(new Dimension(60, 17)); + totalMemText.setText("0"); + totalMemText.setPreferredSize(new Dimension(120, 17)); + totalMemText.setMinimumSize(new Dimension(100, 17)); + bpsLabel.setPreferredSize(new Dimension(60, 17)); + bpsLabel.setText("Bytes/s"); + 
bpsText.setMinimumSize(new Dimension(100, 17)); + bpsText.setPreferredSize(new Dimension(120, 17)); + bpsText.setText("0"); + docsLabel.setText("Docs/s"); + docsLabel.setPreferredSize(new Dimension(60, 17)); + docsText.setText("0"); + docsText.setPreferredSize(new Dimension(120, 17)); + docsText.setMinimumSize(new Dimension(100, 17)); + docsReadLabel.setText("Docs read"); + docsReadLabel.setPreferredSize(new Dimension(60, 17)); + docsReadText.setText("0"); + docsReadText.setPreferredSize(new Dimension(120, 17)); + docsReadText.setMinimumSize(new Dimension(100, 17)); + urlsCaughtProgress.setPreferredSize(new Dimension(190, 25)); + urlsCaughtText.setText("URLs caught by Robots.txt"); + robotsTxtsText.setText("Robots.txts found"); + robotsTxtsProgress.setPreferredSize(new Dimension(190, 25)); + this.getContentPane().add(lowerPanel, BorderLayout.CENTER); + lowerPanel.add(progressPanel, null); + progressPanel.add(urlQueuedLabel, null); + progressPanel.add(urlQueuedProgress, null); + progressPanel.add(scopeFilteredLabel, null); + progressPanel.add(scopeFilteredProgress, null); + progressPanel.add(visitedFilteredLabel, null); + progressPanel.add(visitedFilteredProgress, null); + progressPanel.add(dnsLabel, null); + progressPanel.add(dnsProgress, null); + progressPanel.add(robotsTxtsText, null); + progressPanel.add(robotsTxtsProgress, null); + progressPanel.add(urlsCaughtText, null); + progressPanel.add(urlsCaughtProgress, null); + lowerPanel.add(middlePanel, null); + middlePanel.add(workingThreadsLabel, null); + middlePanel.add(workingThreadsProgress, null); + middlePanel.add(idleThreadsLabel, null); + middlePanel.add(idleThreadsProgress, null); + middlePanel.add(busyThreadsLabel, null); + middlePanel.add(busyThreadsProgress, null); + middlePanel.add(requestQueueLabel, null); + middlePanel.add(requestQueueProgress, null); + middlePanel.add(stalledThreadsLabel, null); + middlePanel.add(stalledThreadsProgress, null); + lowerPanel.add(rightPanel, null); + 
rightPanel.add(docsLabel, null); + rightPanel.add(docsText, null); + rightPanel.add(docsReadLabel, null); + rightPanel.add(docsReadText, null); + rightPanel.add(bpsLabel, null); + rightPanel.add(bpsText, null); + rightPanel.add(totalMemLabel, null); + rightPanel.add(totalMemText, null); + rightPanel.add(freeMemLabel, null); + rightPanel.add(freeMemText, null); + this.getContentPane().add(propertyPanel, BorderLayout.NORTH); + propertyPanel.add(urlRestrictionFrame, null); + propertyPanel.add(restrictTo, null); + propertyPanel.add(hostLabel, null); + propertyPanel.add(startButton, null); + propertyPanel.add(startURL, null); + } + + public void setCounterProgressBar(JProgressBar p, int value) + { + int oldMax = p.getMaximum(); + int oldValue = p.getValue(); + + if(value > oldMax) + { + p.setMaximum(oldMax * 2); + } + else if (value < oldMax / 2 && oldValue >= oldMax / 2) + { + p.setMaximum(oldMax / 2); + } + p.setValue(value); + p.setString("" + value); + } + + public void setURLsQueued(int queued) + { + setCounterProgressBar(this.urlQueuedProgress, queued); + } + + public void setScopeFiltered(int filtered) + { + setCounterProgressBar(this.scopeFilteredProgress, filtered); + } + + public void setVisitedFiltered(int filtered) + { + setCounterProgressBar(this.visitedFilteredProgress, filtered); + } + + public void setWorkingThreadsCount(int threads) + { + setCounterProgressBar(this.workingThreadsProgress, threads); + } + + public void setIdleThreadsCount(int threads) + { + setCounterProgressBar(this.idleThreadsProgress, threads); + } + + public void setBusyThreadsCount(int threads) + { + setCounterProgressBar(this.busyThreadsProgress, threads); + } + + public void setRequestQueueCount(int requests) + { + setCounterProgressBar(this.requestQueueProgress, requests); + } + + public void setDNSCount(int count) + { + setCounterProgressBar(this.dnsProgress, count); + } + + public void setURLsCaughtCount(int count) + { + setCounterProgressBar(this.urlQueuedProgress, count); + } 
+ + public void addStartButtonListener(ActionListener a) + { + startButton.addActionListener(a); + } + + + + public String getRestrictTo() + { + return restrictTo.getText(); + } + public void setRestrictTo(String restrictTo) + { + this.restrictTo.setText(restrictTo); + } + public String getStartURL() + { + return startURL.getText(); + } + public void setStartURL(String startURL) + { + this.startURL.setText(startURL); + } + + public void setStalledThreads(int stalled) + { + stalledThreadsProgress.setValue(stalled); + } + + public void setBytesPerSecond(double bps) + { + bpsText.setText("" + bps); + } + + + public void setDocsPerSecond(double docs) + { + bpsText.setText("" + docs); + } + + public void setFreeMem(long freeMem) + { + freeMemText.setText("" + freeMem); + } + + public void setTotalMem(long totalMem) + { + totalMemText.setText("" + totalMem); + } + + public void setRobotsTxtCount(int robotsTxtCount) + { + setCounterProgressBar(robotsTxtsProgress, robotsTxtCount); + } + + public void setDocsRead(int docs) + { + bpsText.setText("" + docs); + } + +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java new file mode 100644 index 00000000000..d06b91642f9 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/gui/QuitDialog.java @@ -0,0 +1,184 @@ +package de.lanlab.larm.gui; +/* + A basic extension of the java.awt.Dialog class + */ + +import java.awt.*; +import java.awt.event.*; + +public class QuitDialog extends Dialog +{ + public QuitDialog(Frame parent, boolean modal) + { + super(parent, modal); + + //Keep a local reference to the invoking frame + frame = parent; + + // This code is automatically generated by Visual Cafe when you add + // components to the visual environment. It instantiates and initializes + // the components. 
To modify the code, only use code syntax that matches + // what Visual Cafe can generate, or Visual Cafe may be unable to back + // parse your Java file into its visual environment. + //{{INIT_CONTROLS + setLayout(null); + setSize(337,135); + setVisible(false); + yesButton.setLabel(" Ja "); + add(yesButton); + yesButton.setFont(new Font("Dialog", Font.BOLD, 12)); + yesButton.setBounds(72,80,79,22); + noButton.setLabel(" Nein "); + add(noButton); + noButton.setFont(new Font("Dialog", Font.BOLD, 12)); + noButton.setBounds(185,80,79,22); + label1.setText("Möchten Sie LARM beenden?"); + label1.setAlignment(java.awt.Label.CENTER); + add(label1); + label1.setBounds(68,33,220,23); + setTitle("LARM - Beenden"); + //}} + + //{{REGISTER_LISTENERS + SymWindow aSymWindow = new SymWindow(); + this.addWindowListener(aSymWindow); + SymAction lSymAction = new SymAction(); + noButton.addActionListener(lSymAction); + yesButton.addActionListener(lSymAction); + //}} + } + + public void addNotify() + { + // Record the size of the window prior to calling parents addNotify. + Dimension d = getSize(); + + super.addNotify(); + + if (fComponentsAdjusted) + return; + + // Adjust components according to the insets + setSize(getInsets().left + getInsets().right + d.width, getInsets().top + getInsets().bottom + d.height); + Component components[] = getComponents(); + for (int i = 0; i < components.length; i++) + { + Point p = components[i].getLocation(); + p.translate(getInsets().left, getInsets().top); + components[i].setLocation(p); + } + fComponentsAdjusted = true; + } + + public QuitDialog(Frame parent, String title, boolean modal) + { + this(parent, modal); + setTitle(title); + } + + /** + * Shows or hides the component depending on the boolean flag b. + * @param b if true, show the component; otherwise, hide the component. 
+ * @see java.awt.Component#isVisible + */ + public void setVisible(boolean b) + { + if(b) + { + Rectangle bounds = getParent().getBounds(); + Rectangle abounds = getBounds(); + + setLocation(bounds.x + (bounds.width - abounds.width)/ 2, + bounds.y + (bounds.height - abounds.height)/2); + Toolkit.getDefaultToolkit().beep(); + } + super.setVisible(b); + } + + // Used for addNotify check. + boolean fComponentsAdjusted = false; + // Invoking frame + Frame frame = null; + + //{{DECLARE_CONTROLS + java.awt.Button yesButton = new java.awt.Button(); + java.awt.Button noButton = new java.awt.Button(); + java.awt.Label label1 = new java.awt.Label(); + //}} + + class SymAction implements java.awt.event.ActionListener + { + public void actionPerformed(java.awt.event.ActionEvent event) + { + Object object = event.getSource(); + if (object == yesButton) + yesButton_ActionPerformed(event); + else if (object == noButton) + noButton_ActionPerformed(event); + } + } + + void yesButton_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. + + yesButton_ActionPerformed_Interaction1(event); + } + + + void yesButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + frame.setVisible(false); // Hide the invoking frame + frame.dispose(); // Free system resources + this.dispose(); // Free system resources + System.exit(0); // close the application + } catch (Exception e) { + } + } + + + void noButton_ActionPerformed(java.awt.event.ActionEvent event) + { + // to do: code goes here. 
+ + noButton_ActionPerformed_Interaction1(event); + } + + + void noButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event) + { + try { + this.dispose(); + } catch (Exception e) { + } + } + + + class SymWindow extends java.awt.event.WindowAdapter + { + public void windowClosing(java.awt.event.WindowEvent event) + { + Object object = event.getSource(); + if (object == QuitDialog.this) + QuitDialog_WindowClosing(event); + } + } + + void QuitDialog_WindowClosing(java.awt.event.WindowEvent event) + { + // to do: code goes here. + + QuitDialog_WindowClosing_Interaction1(event); + } + + + void QuitDialog_WindowClosing_Interaction1(java.awt.event.WindowEvent event) + { + try { + this.dispose(); + } catch (Exception e) { + } + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java new file mode 100644 index 00000000000..b2dd21fc353 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpClientTimeout.java @@ -0,0 +1,136 @@ +package de.lanlab.larm.net; + +// whatever package you want +import sun.net.www.http.HttpClient; +import sun.net.www.MessageHeader; +import sun.net.ProgressEntry; + +import java.net.*; +import java.io.*; + + +/** + * Description of the Class + * + *@author cmarschn + *@created 2. 
Mai 2001 + */ +public class HttpClientTimeout extends HttpClient { + private int timeout = -1; + + + /** + * Constructor for the HttpClientTimeout object + * + *@param url Description of Parameter + *@param proxy Description of Parameter + *@param proxyPort Description of Parameter + *@exception IOException Description of Exception + */ + public HttpClientTimeout(URL url, String proxy, int proxyPort) throws IOException { + super(url, proxy, proxyPort); + } + + + /** + * Constructor for the HttpClientTimeout object + * + *@param url Description of Parameter + *@exception IOException Description of Exception + */ + public HttpClientTimeout(URL url) throws IOException { + super(url, null, -1); + } + + + /** + * Sets the Timeout attribute of the HttpClientTimeout object + * + *@param i The new Timeout value + *@exception SocketException Description of Exception + */ + public void setTimeout(int i) throws SocketException { + this.timeout = -1; + serverSocket.setSoTimeout(i); + } + + + /** + * Gets the Socket attribute of the HttpClientTimeout object + * + *@return The Socket value + */ + public Socket getSocket() { + return serverSocket; + } + + + /** + * Description of the Method + * + *@param header Description of Parameter + *@param entry Description of Parameter + *@return Description of the Returned Value + *@exception java.io.IOException Description of Exception + */ + public boolean parseHTTP(MessageHeader header, ProgressEntry entry) throws java.io.IOException { + if (this.timeout != -1) { + try { + serverSocket.setSoTimeout(this.timeout); + } + catch (SocketException e) { + throw new java.io.IOException("unable to set socket timeout!"); + } + } + return super.parseHTTP(header, entry); + } + + + /** + * Description of the Method + * + *@exception IOException Description of Exception + */ + public void close() throws IOException { + serverSocket.close(); + } + + + /* + * public void SetTimeout(int i) throws SocketException { + * serverSocket.setSoTimeout(i); + * 
} + */ + /* + * This class has no public constructor for HTTP. This method is used to + * get an HttpClient to the specifed URL. If there's currently an + * active HttpClient to that server/port, you'll get that one. + * + * no longer syncrhonized -- it slows things down too much + * synchronize at a higher level + */ + /** + * Gets the New attribute of the HttpClientTimeout class + * + *@param url Description of Parameter + *@return The New value + *@exception IOException Description of Exception + */ + public static HttpClientTimeout getNew(URL url) throws IOException { + /* + * see if one's already around + */ + HttpClientTimeout ret = (HttpClientTimeout) kac.get(url); + if (ret == null) { + ret = new HttpClientTimeout(url); + // CTOR called openServer() + } + else { + ret.url = url; + } + // don't know if we're keeping alive until we parse the headers + // for now, keepingAlive is false + return ret; + } +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java new file mode 100644 index 00000000000..aff661cb6c1 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutFactory.java @@ -0,0 +1,50 @@ +package de.lanlab.larm.net; + +import java.net.*; + +/** + * Description of the Class + * + *@author cmarschn + *@created 2. 
Mai 2001 + */ +public class HttpTimeoutFactory implements URLStreamHandlerFactory { + int fiTimeoutVal; + + + /** + * Constructor for the HttpTimeoutFactory object + * + *@param iT Description of Parameter + */ + public HttpTimeoutFactory(int iT) { + fiTimeoutVal = iT; + } + + + /** + * Description of the Method + * + *@param str Description of Parameter + *@return Description of the Returned Value + */ + public URLStreamHandler createURLStreamHandler(String str) { + return new HttpTimeoutHandler(fiTimeoutVal); + } + + static HttpTimeoutFactory instance = null; + + /** + * gets an instance. only the first call will create it. In subsequent calls the iT + * parameter doesn't have a meaning. + */ + public static HttpTimeoutFactory getInstance(int iT) + { + if(instance == null) + { + instance = new HttpTimeoutFactory(iT); + } + return instance; + } +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java new file mode 100644 index 00000000000..b551e4fa6c2 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpTimeoutHandler.java @@ -0,0 +1,80 @@ +package de.lanlab.larm.net; + +import java.net.*; +import java.io.IOException; + +/** + * Description of the Class + * + *@author cmarschn + *@created 2. 
Mai 2001 + */ +public class HttpTimeoutHandler extends sun.net.www.protocol.http.Handler { + int timeoutVal; + HttpURLConnectionTimeout fHUCT; + + + /** + * Constructor for the HttpTimeoutHandler object + * + *@param iT Description of Parameter + */ + public HttpTimeoutHandler(int iT) { + timeoutVal = iT; + } + + + /** + * Gets the Socket attribute of the HttpTimeoutHandler object + * + *@return The Socket value + */ + public Socket getSocket() { + return fHUCT.getSocket(); + } + + + /** + * Description of the Method + * + *@exception Exception Description of Exception + */ + public void close() throws Exception { + fHUCT.close(); + } + + + /** + * Description of the Method + * + *@param u Description of Parameter + *@return Description of the Returned Value + *@exception IOException Description of Exception + */ + protected java.net.URLConnection openConnection(URL u) throws IOException { + return fHUCT = new HttpURLConnectionTimeout(u, this, timeoutVal); + } + + + /** + * Gets the Proxy attribute of the HttpTimeoutHandler object + * + *@return The Proxy value + */ + String getProxy() { + return proxy; + // breaking encapsulation + } + + + /** + * Gets the ProxyPort attribute of the HttpTimeoutHandler object + * + *@return The ProxyPort value + */ + int getProxyPort() { + return proxyPort; + // breaking encapsulation + } +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java new file mode 100644 index 00000000000..16b07ace098 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/net/HttpURLConnectionTimeout.java @@ -0,0 +1,226 @@ +package de.lanlab.larm.net; + +import java.net.*; +import java.io.*; +import sun.net.www.http.HttpClient; + +/** + * Description of the Class + * + *@author cmarschn + *@created 2. 
Mai 2001 + */
/**
 * HTTP connection that talks through an {@link HttpClientTimeout} so that a
 * socket timeout can be applied, plus a helper that follows redirects only
 * to the same protocol, port and host.
 */
public class HttpURLConnectionTimeout extends sun.net.www.protocol.http.HttpURLConnection {
    /** Timeout value handed to HttpClientTimeout.setTimeout when connecting. */
    int fiTimeoutVal;

    /** Handler that created this connection; stays null for the (URL, host, port) constructor. */
    HttpTimeoutHandler fHandler;

    /** Client used by this connection; null until connect() has obtained one. */
    HttpClientTimeout fClient;


    /**
     * Creates a connection that belongs to the given handler.
     *
     *@param u                URL to connect to
     *@param handler          handler that opened this connection; also the
     *      source of the proxy settings used by connect()
     *@param iTimeout         timeout value applied to the client on connect
     *@exception IOException  if the superclass constructor fails
     */
    public HttpURLConnectionTimeout(URL u, HttpTimeoutHandler handler, int iTimeout) throws IOException {
        super(u, handler);
        fHandler = handler;
        fiTimeoutVal = iTimeout;
    }


    /**
     * Creates a connection without a handler and without a timeout value
     * (fHandler stays null, fiTimeoutVal stays 0).
     *
     *@param u                URL to connect to
     *@param host             passed to the superclass constructor
     *@param port             passed to the superclass constructor
     *@exception IOException  if the superclass constructor fails
     */
    public HttpURLConnectionTimeout(URL u, String host, int port) throws IOException {
        super(u, host, port);
    }


    /**
     * Obtains an HttpClientTimeout for the URL, applies the timeout and
     * opens the output stream. Does nothing when already connected.
     *
     * Both ways of obtaining the client now record it in fClient and apply
     * the timeout; the original did so only for the "http" branch, which
     * left getSocket()/close() without a client on the other path.
     *
     *@exception IOException  if the client cannot be created, the timeout
     *      cannot be set or the output stream cannot be opened
     */
    public void connect() throws IOException {
        if (connected) {
            return;
        }
        if ("http".equals(url.getProtocol())) {
            // the original Sun condition also tested the private failedOnce flag
            // for safety's sake, as reported by KLGroup
            synchronized (url) {
                http = HttpClientTimeout.getNew(url);
            }
        }
        else {
            // construct a new client using the handler's proxy settings
            http = new HttpClientTimeout(url, fHandler.getProxy(), fHandler.getProxyPort());
        }
        fClient = (HttpClientTimeout) http;
        fClient.setTimeout(fiTimeoutVal);
        ps = (PrintStream) http.getOutputStream();
        // this was missing from the original version
        connected = true;
    }


    /**
     * Create a new HttpClient object, bypassing the cache of HTTP client
     * objects/connections. A failure to set the timeout is reported on
     * standard output and otherwise ignored.
     *
     *@param url              the URL being accessed
     *@return                 the new client
     *@exception IOException  if the client cannot be created
     */
    protected HttpClient getNewClient(URL url)
        throws IOException {
        HttpClientTimeout client = new HttpClientTimeout(url, (String) null, -1);
        try {
            client.setTimeout(fiTimeoutVal);
        }
        catch (Exception e) {
            System.out.println("Unable to set timeout value");
        }
        return (HttpClient) client;
    }


    /**
     * Returns the socket of the current client.
     *
     *@return    the client's socket, or null when connect() has not yet
     *      obtained a client
     */
    Socket getSocket() {
        return fClient == null ? null : fClient.getSocket();
    }


    /**
     * Closes the current client; a no-op when connect() has not yet
     * obtained one.
     *
     *@exception Exception  whatever the client's close() throws
     */
    void close() throws Exception {
        if (fClient != null) {
            fClient.close();
        }
    }


    /**
     * Opens a stream allowing redirects only to the same host. A redirect
     * is followed for status codes 300-305 except 304, for at most five
     * hops, and only when protocol, port and host of the target match the
     * redirecting URL.
     *
     *@param c                      connection to open
     *@return                       the input stream of the final response
     *@exception IOException        if a connection or stream cannot be opened
     *@exception SecurityException  if a redirect has no Location header,
     *      leaves the protocol/port/host, or the hop limit is exceeded
     */
    public static InputStream openConnectionCheckRedirects(URLConnection c)
        throws IOException {
        boolean redir;
        int redirects = 0;
        InputStream in = null;

        do {
            if (c instanceof HttpURLConnectionTimeout) {
                ((HttpURLConnectionTimeout) c).setInstanceFollowRedirects(false);
            }

            // We want to open the input stream before
            // getting headers, because getHeaderField()
            // et al swallow IOExceptions.
            in = c.getInputStream();
            redir = false;

            if (c instanceof HttpURLConnectionTimeout) {
                HttpURLConnectionTimeout http = (HttpURLConnectionTimeout) c;
                int stat = http.getResponseCode();
                if (stat >= 300 && stat <= 305 &&
                        stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
                    URL base = http.getURL();
                    String loc = http.getHeaderField("Location");
                    URL target = null;
                    if (loc != null) {
                        target = new URL(base, loc);
                    }
                    http.disconnect();
                    if (target == null
                             || !base.getProtocol().equals(target.getProtocol())
                             || base.getPort() != target.getPort()
                             || !HostsEquals(base, target)
                             || redirects >= 5) {
                        throw new SecurityException("illegal URL redirect");
                    }
                    redir = true;
                    c = target.openConnection();
                    redirects++;
                }
            }
        } while (redir);
        return in;
    }


    /**
     * Tells whether two URLs name the same host (same as
     * java.net.URL.hostsEqual). Host names are compared ignoring case; if
     * they differ, both are resolved and the addresses compared. Lookup
     * and security failures count as "not equal".
     *
     *@param u1  first URL
     *@param u2  second URL
     *@return    true if both hosts are null, equal ignoring case, or
     *      resolve to equal addresses
     */
    static boolean HostsEquals(URL u1, URL u2) {
        final String h1 = u1.getHost();
        final String h2 = u2.getHost();

        if (h1 == null) {
            return h2 == null;
        }
        else if (h2 == null) {
            return false;
        }
        else if (h1.equalsIgnoreCase(h2)) {
            return true;
        }
        // Have to resolve addresses before comparing, otherwise
        // names like tachyon and tachyon.eng would compare different
        final boolean result[] = {false};

        java.security.AccessController.doPrivileged(
            new java.security.PrivilegedAction() {
                public Object run() {
                    try {
                        InetAddress a1 = InetAddress.getByName(h1);
                        InetAddress a2 = InetAddress.getByName(h2);
                        result[0] = a1.equals(a2);
                    }
                    catch (UnknownHostException e) {
                        // unresolved host: leave result[0] false
                    }
                    catch (SecurityException e) {
                        // lookup not permitted: leave result[0] false
                    }
                    return null;
                }
            });
        return result[0];
    }
}
diff --git
a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java new file mode 100644 index 00000000000..5f96063da54 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/LinkHandler.java @@ -0,0 +1,17 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.parser; + +public interface LinkHandler +{ + public void handleLink(String value, boolean isFrame); + public void handleBase(String value); + public void handleTitle(String value); +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java new file mode 100644 index 00000000000..9ccda662ed6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/parser/Tokenizer.java @@ -0,0 +1,1340 @@ +/* + * $Id$ + * + * Copyright 2000 LANLab + * + */ +package de.lanlab.larm.parser; + +import hplb.org.xml.sax.*; +import hplb.xml.*; +import hplb.xml.util.*; + +import java.util.Dictionary; +import java.util.Hashtable; +import java.io.*; +import hplb.misc.ByteArray; +import java.net.URL; + +/** + * This parser is based on HEX, the HTML enabled XML parser, written by + * Anders Kristensen, HP Labs Bristol. + * It was stripped down and specialized to handle links in HTML pages. I removed + * some bugs. And it's FAST, about 10 x faster than the original HEX parser. + * Being some sort of SAX parser it calls the callback functions of the LinkHandler + * when links are found. 
+ * @todo add handling of anchor texts + * + * @author Clemens Marschner + */ +public class Tokenizer implements hplb.org.xml.sax.Parser +{ + /** + * Sets the entityHandler attribute of the Tokenizer object + * + * @param e The new entityHandler value + */ + public void setEntityHandler(hplb.org.xml.sax.EntityHandler e) { } + + + /** + * Sets the errorHandler attribute of the Tokenizer object + * + * @param e The new errorHandler value + */ + public void setErrorHandler(hplb.org.xml.sax.ErrorHandler e) { } + + + /** + * Sets the documentHandler attribute of the Tokenizer object + * + * @param e The new documentHandler value + */ + public void setDocumentHandler(hplb.org.xml.sax.DocumentHandler e) { } + + + /** + * The value of boolean attributes is this string. + */ + public final static String BOOLATTR = Atom.getAtom("BOOLATTR"); + + // FSM states: + final static int ST_START = 1; + final static int ST_TAG_LT = 3; + final static int ST_TAG_NAME = 4; + final static int ST_TAG_WS = 5; + final static int ST_EMPTY_TAG_SLASH = 6; + final static int ST_NAME = 7; + final static int ST_NAME_WS = 8; + final static int ST_EQ = 9; + final static int ST_VALUE = 10; + final static int ST_VALUE_QUOTED = 11; + final static int ST_PCDATA = 21; + final static int ST_COMMENT = 22; + + LinkHandler linkHandler; + + String sysID = "what's this?"; + + /** + * Description of the Field + */ + protected Hashtable noCaseElms; + /** + * Description of the Field + */ + public boolean rcgnzWS = true; + // is white space chars recognized as PCDATA + // even when preceeding tags? 
+ /** + * Description of the Field + */ + public boolean rcgnzEntities = true; + /** + * Description of the Field + */ + public boolean rcgnzCDATA = true; + /** + * Description of the Field + */ + public boolean rcgnzComments = true; + // + /** + * Description of the Field + */ + public boolean atomize = false; + // make element and attr names atoms + + private final static int ATTR_HREF = 1; + private final static int ATTR_SRC = 2; + + private final static int LINKTYPE_NONE = 0; + private final static int LINKTYPE_LINK = 1; + private final static int LINKTYPE_BASE = 2; + private final static int LINKTYPE_FRAME = 3; + + + private byte linkTagType; + private boolean linkAttrFound; + private int linkAttrType; + private String linkValue; + private boolean keepPCData; + private boolean isInTitleTag; + private boolean isInAnchorTag; + + CharBuffer buf = new CharBuffer(); + boolean isStartTag = true; + /** + * Signals whether a non-empty element has any children. If not we must + * generate an artificial empty-string child [characters(buf, 0, 0)]. 
+ */ + boolean noChildren; + CharBuffer tagname = new CharBuffer(); + CharBuffer attrName = new CharBuffer(); + CharBuffer attrValue = new CharBuffer(1000); + CharBuffer pcData = new CharBuffer(8000); + + Reader in; + + /** + * Description of the Field + */ + public final EntityManager entMngr = new EntityManager(this); + /** + * Description of the Field + */ + protected int state = ST_START; + /** + * Description of the Field + */ + protected int qchar; + + + // <'> or <"> when parsing quoted attr values + + + /** + * Constructor for the Tokenizer object + */ + public Tokenizer() { } + + + /** + * Sets the linkHandler attribute of the Tokenizer object + * + * @param handler The new linkHandler value + */ + public void setLinkHandler(LinkHandler handler) + { + linkHandler = handler; + } + + + /** + * Description of the Method + * + * @param publicID Description of the Parameter + * @param sysID Description of the Parameter + * @exception Exception Description of the Exception + */ + public void parse(String publicID, String sysID) + throws Exception + { + this.sysID = sysID; + parse(new URL(sysID).openStream()); + } + + + /** + * Description of the Method + * + * @param in Description of the Parameter + * @exception Exception Description of the Exception + */ + public void parse(InputStream in) + throws Exception + { + parse(new BufferedReader(new InputStreamReader(in))); + } + + + /** + * Description of the Method + * + * @param in Description of the Parameter + * @exception Exception Description of the Exception + */ + public void parse(Reader in) + throws Exception + { + if (linkHandler == null) + { + throw new IllegalStateException("parse called without LinkHandler being set"); + } + + this.in = in; + toStart(); + tokenize(); + } + + + /** + * Description of the Method + * + * @param elementName Description of the Parameter + */ + public void ignoreCase(String elementName) + { + if (noCaseElms == null) + { + noCaseElms = new Hashtable(); + } + 
noCaseElms.put(elementName.toLowerCase(), elementName); + } + + + /** + * Description of the Method + * + * @param b Description of the Parameter + */ + public void rcgnzWS(boolean b) + { + rcgnzWS = b; + } + + + // invoked after doing any Handler callback - resets state + /** + * Description of the Method + */ + protected void toStart() + { + state = ST_START; + buf.reset(); + tagname.reset(); + attrName.reset(); + attrValue.reset(); + pcData.reset(); + //attrs.clear(); + isStartTag = true; + // until proven wrong + + linkTagType = LINKTYPE_NONE; + linkAttrFound = false; + linkAttrType = 0; + linkValue = ""; + //keepPCData= false; + } + + + /** + * Description of the Method + * + * @exception Exception Description of the Exception + */ + public void tokenize() + throws Exception + { + int c; + + + while ((c = read()) != -1) + { + switch (state) + { + case ST_START: + switch (c) + { + case '<': + state = ST_TAG_LT; + linkTagType = LINKTYPE_NONE; + linkAttrFound = false; + linkAttrType = 0; + linkValue = ""; + + isStartTag = true; + keepPCData= false; + + // until proven wrong + tagname.reset(); + break; + case ' ': + case '\t': + case '\r': + case '\n': + if (!rcgnzWS) + { + break; + } + // else fall through + default: + state = ST_PCDATA; + if(keepPCData) + { + pcData.write(c); + } + + } + break; + case ST_PCDATA: + if (c == '<') + { + if(keepPCData) + { + gotPCDATA(true); + keepPCData = false; + } + linkTagType = LINKTYPE_NONE; + linkAttrFound = false; + linkAttrType = 0; + linkValue = ""; + state = ST_TAG_LT; + } + else + { + if(keepPCData) + { + pcData.write(c); + } + } + break; + case ST_TAG_LT: + switch (c) + { + case '/': + isStartTag = false; + state = ST_TAG_NAME; + break; + case '!': + c = read(); + if ((c == '-' && !rcgnzComments) || (c == '[' && !rcgnzCDATA)) + { + state = ST_PCDATA; + pcData.reset(); + pcData.write(c); + break; + } + if (c == '-') + { + state = ST_COMMENT; + } + else if (c == '[') + { + parseCDATA(); + } + else + { + // FIXME: 
shouldn't be delivered as PCDATA + //warning("Bad markup " + buf); + state = ST_PCDATA; + pcData.reset(); + pcData.write(c); + } + break; + case '?': + parsePI(); + break; + case ' ': + case '\t': + case '\r': + case '\n': + state = ST_TAG_WS; + break; + default: + tagname.write(Character.toLowerCase((char) c)); + // ## changed + state = ST_TAG_NAME; + } + break; + case ST_TAG_NAME: + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + state = ST_TAG_WS; + gotTagName(); + // ## changed + break; + case '/': + state = ST_EMPTY_TAG_SLASH; + gotTagName(); + // ## changed + break; + case '>': + gotTagName(); + // ## changed + gotTag(); + break; + default: + tagname.write(Character.toLowerCase((char) c)); + // ## changed + } + break; + case ST_TAG_WS: + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + break; + case '/': + state = ST_EMPTY_TAG_SLASH; + break; + case '>': + gotTag(); + break; + case '?': + // NOTE: if !inXMLDecl we fall through to default case + default: + if (!isStartTag) + { + // bit of a hack this... + //errHandler.warning("Malformed tag: "+buf, sysID, _line, _column); + //err_continue("Malformed tag: "+buf); + toStart(); + // ## changed + if (c == '<') + { + gotPCDATA(true); + keepPCData = false; + state = ST_TAG_LT; + } + else + { + // we get here e.g. if there's an end tag with attributes + state = ST_PCDATA; + pcData.reset(); + } + } + else + { + // FIXME: this accepts way too many first chars for attr name + attrName.write(Character.toLowerCase((char) c)); + state = ST_NAME; + } + } + break; + case ST_EMPTY_TAG_SLASH: + if (c == '>') + { + //tagtype = TAG_EMPTY; + gotTag(); + break; + } + else + { + // ERROR !? - can't throw Exception here - we go to next tag... 
+ state = ST_PCDATA; + pcData.reset(); + } + break; + case ST_NAME: + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + if (attrName.size() > 0) + { + state = ST_NAME_WS; + } + break; + case '>': + if (attrName.size() > 0) + { + gotAttr(); + } + gotTag(); + break; + case '=': + state = ST_EQ; + break; + default: + if (isCtlOrTspecial(c)) + { + state = ST_PCDATA; + pcData.reset(); + } + else + { + attrName.write(Character.toLowerCase((char) c)); + } + } + break; + case ST_NAME_WS: + // white-space between name and '=' + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + break; + case '=': + state = ST_EQ; + break; + case '>': + gotAttr(); + gotTag(); + break; + default: + if (isNameChar(c)) + { + gotAttr(); + attrName.write(Character.toLowerCase((char) c)); + state = ST_TAG_WS; + } + else + { + state = ST_PCDATA; + pcData.reset(); + } + } + break; + case ST_EQ: + // white-space between '=' and value + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + break; + case '"': + qchar = '"'; + state = ST_VALUE_QUOTED; + break; + case '\'': + qchar = '\''; + state = ST_VALUE_QUOTED; + break; + default: + if (isCtlOrTspecial(c)) + { + state = ST_PCDATA; + pcData.reset(); + } + else + { + attrValue.write(c); + state = ST_VALUE; + } + } + break; + case ST_VALUE: + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + gotAttr(); + state = ST_TAG_WS; + break; + case '>': + gotAttr(); + gotTag(); + break; + /* + * case '/': // FIXME: HTML knows things like !! + * gotAttr(); + * state = ST_EMPTY_TAG_SLASH; + * break; + */ + default: + if (isValueBreaker(c)) + { + state = ST_PCDATA; + pcData.reset(); + } + else + { + attrValue.write(c); + } + } + break; + case ST_VALUE_QUOTED: + if (c == qchar) + { + gotAttr(); + state = ST_TAG_WS; + } + else + { + attrValue.write(c); + } + break; + case ST_COMMENT: + // we've seen "...' 
+ gotComment(); + //while (read_ex() != '>') ; + //state = ST_PCDATA; + } + catch (EmptyInputStream ex) + { + gotPCDATA(false); + keepPCData = false; + break; + } + } + } + + // input stream ended - return rest, if any, as PCDATA + if (buf.size() > 0) + { + gotPCDATA(false); + keepPCData = false; + buf.reset(); + } + } + + + // counts lines and columns - used in error reporting + // a line can be a single \r or \n or it can be \r\n - we handle them all + int cc; + + // last char read + + + /** + * Description of the Method + * + * @return Description of the Return Value + * @exception IOException Description of the Exception + */ + public final int read() + throws IOException + { + int c = in.read(); + if (c != -1) + { + buf.write(c); + } + + return c; + } + + + /** + * Description of the Method + * + * @return Description of the Return Value + * @exception IOException Description of the Exception + * @exception EmptyInputStream Description of the Exception + */ + public final int read_ex() + throws IOException, EmptyInputStream + { + int c = read(); + if (c == -1) + { + throw new EmptyInputStream(); + } + return c; + } + + + // HTML allows boolean attributes - attributes without a + // value, or rather an implicit value which is the same as the name. + /** + * Description of the Method + * + * @exception Exception Description of the Exception + */ + protected final void gotAttr() + throws Exception + { + // gotTag has to be called first, setting waitForAtt = ATT_HREF or ATT_SRC + if (!linkAttrFound) + { + char[] attName = attrName.getCharArray(); + int attLength = attrName.getLength(); + boolean gotcha = false; + + switch (attLength) + { + case 4: + if (attName[0] == 'h' && attName[1] == 'r' && attName[2] == 'e' && attName[3] == 'f') + { + gotcha = true; + } + break; + case 3: + if (attName[0] == 's' && attName[1] == 'r' && attName[2] == 'c') + { + gotcha = true; + } + break; + } + if (gotcha) + { + linkValue = (rcgnzEntities ? 
entMngr.entityDecode(attrValue) : + attrValue).toString(); + linkAttrFound = true; + } + else + { + linkValue = ""; + } + } + attrName.reset(); + attrValue.reset(); + //attrs.put(nm, val); + } + + + /** + * Description of the Method + */ + protected void gotTagName() + { + char[] tag = tagname.getCharArray(); + int tagLength = tagname.getLength(); + switch (tagLength) + { + case 1: + // A + if (tag[0] == 'a') + { + linkTagType = LINKTYPE_LINK; + linkAttrType = ATTR_HREF; + + } + break; + // [case 3: // IMG] + case 4: + // BASE, AREA [, LINK] + if(isStartTag) + { + if (tag[0] == 'b' && tag[1] == 'a' && tag[2] == 's' && tag[3] == 'e') + { + linkTagType = LINKTYPE_BASE; + linkAttrType = ATTR_HREF; + } + else if (tag[0] == 'a' && tag[1] == 'r' && tag[2] == 'e' && tag[3] == 'a') + { + linkTagType = LINKTYPE_LINK; + linkAttrType = ATTR_HREF; + } + } + break; + case 5: + // FRAME + if(isStartTag) + { + if (tag[0] == 'f' && tag[1] == 'r' && tag[2] == 'a' && tag[3] == 'm' && tag[4] == 'e') + { + linkTagType = LINKTYPE_FRAME; + linkAttrType = ATTR_SRC; + } + else if (tag[0] == 't' && tag[1] == 'i' && tag[2] == 't' && tag[3] == 'l' && tag[4] == 'e') + { + isInTitleTag = true; + keepPCData = true; + } + } + default: + } + } + + + /** + * Description of the Method + * + * @exception Exception Description of the Exception + */ + protected void gotTag() + throws Exception + { + if (linkAttrFound && isStartTag) + { + switch (linkTagType) + { + case LINKTYPE_LINK: + //System.out.println("got link " + linkValue); + linkHandler.handleLink(linkValue, false); + break; + case LINKTYPE_FRAME: + //System.out.println("got link " + linkValue); + linkHandler.handleLink(linkValue, true); + break; + case LINKTYPE_BASE: + linkHandler.handleBase(linkValue); + break; + } + } + toStart(); + } + + + /** + * Description of the Method + * + * @param attrs Description of the Parameter + */ + public final void keysToLowerCase(SAXAttributeMap attrs) + { + for (int i = 0; i < attrs.n; i++) + { + 
attrs.keys[i] = attrs.keys[i].toLowerCase(); + if (atomize) + { + attrs.keys[i] = Atom.getAtom(attrs.keys[i]); + } + } + } + + + // toomuch true iff we read a '<' of the next token + /** + * Description of the Method + * + * @param toomuch Description of the Parameter + * @exception Exception Description of the Exception + */ + protected void gotPCDATA(boolean toomuch) + throws Exception + { + if(isInTitleTag) + { + linkHandler.handleTitle(pcData.toString()); + isInTitleTag = false; + } + + // ignore it + toStart(); + } + + + /* + * noChildren = false; + * if (toomuch) { + * buf.setLength(buf.size() - 1); + * } + * CharBuffer buf1 = rcgnzEntities ? entMngr.entityDecode(buf) : buf; + * docHandler.characters(buf1.getCharArray(), 0, buf1.size()); + * /handler.gotText(getBuffer()); + * toStart(); + * if (toomuch) { + * buf.write('<'); + * column--; + * } + * } + */ + // XXX: should pass the comment on as docHandler.ignorable() ?? + /** + * Description of the Method + * + * @exception IOException Description of the Exception + * @exception EmptyInputStream Description of the Exception + */ + protected void gotComment() + throws IOException, EmptyInputStream + { + //toStart(); // so an unexpected EOF causes rest to be returned as PCDATA + while (read_ex() != '>') + { + ; + } + toStart(); + } + + + // Processing Instruction + /** + * Description of the Method + * + * @exception Exception Description of the Exception + */ + protected void parsePI() + throws Exception + { + // ignore this + + /* + * int i; + * String target; + * noChildren = false; + * inXMLDecl = false; + * i = buf.size(); + * try { + * while (!isWS(read_ex())) ; + * target = buf.toString(); + * target = target.substring(i, target.length() - 1); + * if ("XML".equals(target)) { + * inXMLDecl = true; + * state = ST_TAG_WS; + * return; + * } + * while (isWS(read_ex())) ; + * i = buf.size() - 1; + * while (true) { + * while (read_ex() != '?') ; + * if (read_ex() == '>') { + * String s = buf.toString(); + * 
docHandler.processingInstruction( + * Atom.getAtom(target), s.substring(i, s.length()-2)); + * /handler.gotPI(Atom.getAtom(target), + * / s.substring(i, s.length()-2)); + * break; + * } + * } + * } catch (EmptyInputStream ex) { + * gotPCDATA(false); + * errHandler.warning("EOF while parsing PI", sysID, _line, _column); + * /err_continue("EOF while parsing PI"); + * } + */ + toStart(); + } + + + // CDATA section + // XXX: should contents be amalgamated with surrounding PCDATA? + /** + * Description of the Method + * + * @exception Exception Description of the Exception + */ + protected void parseCDATA() + throws Exception + { + // we've seen "') + { + ; + } + // docHandler.characters(buf.getCharArray(), i1, buf.size()-3-i1); + } + else + { + warning("Bad CDATA markup"); + state = ST_PCDATA; + pcData.reset(); + } + } + catch (EmptyInputStream ex) + { + warning("EOF while parsing CDATA section"); + //gotPCDATA(false); + } + toStart(); + } + + + /** + * Gets the wS attribute of the Tokenizer object + * + * @param c Description of the Parameter + * @return The wS value + */ + public boolean isWS(int c) + { + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + return true; + default: + return false; + } + } + + + /** + * Gets the valueBreaker attribute of the Tokenizer class + * + * @param c Description of the Parameter + * @return The valueBreaker value + */ + public final static boolean isValueBreaker(int c) + { + switch (c) + { + // control characters (0-31 and 127): + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + case 127: + + // tspecials: + case '>': + case ' ': + return true; + default: + return false; + } + } + + + /** + * Returns true if c is either an 
ascii control character or a tspecial + * according to the HTTP specification. + * + * @param c Description of the Parameter + * @return The ctlOrTspecial value + */ + // private static final boolean[] isCtlOrTSpecial = new boolean[] +// { +// /* 0 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 14 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 28 */ true , true , true , true , true , false, true , false, false, false, false, false, true , true , +// /* 42 */ false, false, true , false, false, true , false, false, false, false, false, false, false, false, +// /* 56 */ false, false, /*FIX: / no control char: true*/ false, true , true , true , true , true , true , false, false, false, false, false, +// /* 70 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 84 */ false, false, false, false, false, false, false, true , true , true , false, false, false, false, +// /* 98 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 112 */ false, false, false, false, false, false, false, false, false, false, false, true , false, true , +// /* 126 */ false, true , false, false, false, false, false, false, false, false, false, false, false, false, +// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 210 */ false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, +// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 238 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 252 */ false, false, false, false +// }; + + public final static boolean isCtlOrTspecial(int c) + { + switch (c) + { + // control characters (0-31 and 127): + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + case 127: + + // tspecials: + case '(': + case ')': + case '<': + case '>': + case '@': + case ',': + case ';': + case ':': + case '\\': + case '"': + /* + * case '/': + */ + case '[': + case ']': + case '?': + case '=': + case '{': + case '}': + case ' ': + // case '\t': + return true; + default: + return false; + } + } + + + /* + * public static void main(String[]) + * { + * System.out.println("private static final boolean[] isCtlOrTSpecial = \n{"); // bzw. isNameChar + * for(int i=0; i<256; i++) + * { + * if(i>0) + * System.out.print(", "); + * if(i % 14 == 0) + * { + * System.out.print("\n/* " + i + " *" + "/ "); + * } + * if(Tokenizer.isCtlOrTspecial(i)) // bzw. isNameChar(i) + * { + * System.out.print("true "); + * } + * else + * { + * System.out.print("false"); + * } + * } + * System.out.print("};\n\n"); + * } + */ +// public static final boolean isCtlOrTspecial(int c) +// { +// return (c < 256 ? 
isCtlOrTSpecial[c] : false); +// } +// +// private static final boolean[] isNameChar = +// { +// /* 0 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 14 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 28 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 42 */ false, false, false, true , true , false, true , true , true , true , true , true , true , true , +// /* 56 */ true , true , false, false, false, false, false, false, false, true , true , true , true , true , +// /* 70 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 84 */ true , true , true , true , true , true , true , false, false, false, false, true , false, true , +// /* 98 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 112 */ true , true , true , true , true , true , true , true , true , true , true , false, false, false, +// /* 126 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 210 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 238 
*/ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 252 */ false, false, false, false +// }; +// public static final boolean isNameChar(int c) +// { +// return (c < 256 ? isNameChar[c] : false); +// } +// + /* + * / I don't think this is a very standard definition of what can + * / go into tag and attribute names. + */ + /** + * Gets the nameChar attribute of the Tokenizer class + * + * @param c Description of the Parameter + * @return The nameChar value + */ + public final static boolean isNameChar(int c) + { + return ('a' <= c && c <= 'z') || + ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9') || + c == '.' || c == '-' || c == '_'; + } + + + + /** + * Description of the Method + * + * @param s Description of the Parameter + * @exception Exception Description of the Exception + */ + protected final void warning(String s) + throws Exception + { + //errHandler.warning(s, sysID, _line, _column); + } + + + /** + * Description of the Method + * + * @param s Description of the Parameter + * @exception Exception Description of the Exception + */ + protected final void fatal(String s) + throws Exception + { + //errHandler.fatal(s, sysID, _line, _column); + } + + + + /** + * The main program for the Tokenizer class + * + * @param argv The command line arguments + */ + public static void main(String[] argv) + { + Tokenizer tok = new Tokenizer(); + tok.setLinkHandler( + new LinkHandler() + { + int nr = 0; + + + public void handleLink(String link, boolean isFrame) + { + System.out.println("found link " + (++nr) + ": " + link); + } + public void handleTitle(String title) + { + System.out.println("found title " + (++nr) + ": " + title); + } + + + public void handleBase(String link) + { + System.out.println("found base " + (++nr) + ": " + link); + } + }); + try + { + tok.parse(new FileReader("C:\\witest.htm")); + /* + * " " + + * "This is some Text\n" + + * "and this is... 
the link" + + * "")); + */ + } + catch (Exception e) + { + System.out.println("Caught Exception: " + e.getClass().getName()); + e.printStackTrace(); + } + } +} + +/** + * Description of the Class + * + * @author Administrator + * @created 29. Dezember 2001 + */ +class EmptyInputStream extends Exception +{ + + + /** + * Constructor for the EmptyInputStream object + */ + EmptyInputStream() { } + +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/DocumentStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/DocumentStorage.java new file mode 100644 index 00000000000..26417c05449 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/DocumentStorage.java @@ -0,0 +1,37 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * + * Description:

+ * + * Copyright: Copyright (c)

+ * + * Company:

+ * + * + * + * @author + * @version 1.0 + */ +package de.lanlab.larm.storage; +import de.lanlab.larm.util.*; + +/** + * This interface stores documents provided by a fetcher task + * @author Clemens Marschner + */ +public interface DocumentStorage +{ + /** + * called once when the storage is supposed to be initialized + */ + public void open(); + + + /** + * called to store a web document + * + * @param doc the document + */ + public void store(WebDocument doc); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java new file mode 100644 index 00000000000..2b6507195c3 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/LogStorage.java @@ -0,0 +1,165 @@ +package de.lanlab.larm.storage; + +import de.lanlab.larm.util.WebDocument; +import de.lanlab.larm.util.SimpleLogger; +import java.io.*; + + +/** + * Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c) + * Company: + * + * @author + * @created 11. Januar 2002 + * @version 1.0 + */ + + + +/** + * this class saves the documents into page files of 50 MB and keeps a record of all + * the positions into a Logger. the log file contains URL, page file number, and + * index within the page file. 
+ * + */ + +public class LogStorage implements DocumentStorage +{ + + SimpleLogger log; + + File pageFile; + FileOutputStream out; + int pageFileCount; + String filePrefix; + int offset; + boolean isValid = false; + /** + * Description of the Field + */ + public final static int MAXLENGTH = 50000000; + boolean logContents = false; + String fileName; + + + /** + * Constructor for the LogStorage object + * + * @param log the logger where index information is saved to + * @param logContents whether all docs are to be stored in page files or not + * @param filePrefix the file name where the page file number is appended + */ + public LogStorage(SimpleLogger log, boolean logContents, String filePrefix) + { + this.log = log; + pageFileCount = 0; + this.filePrefix = filePrefix; + this.logContents = logContents; + if (logContents) + { + openPageFile(); + } + } + + + /** + * Description of the Method + */ + public void open() { } + + + /** + * Description of the Method + */ + public void openPageFile() + { + int id = ++pageFileCount; + fileName = filePrefix + "_" + id + ".pfl"; + try + { + this.offset = 0; + out = new FileOutputStream(fileName); + isValid = true; + } + catch (IOException io) + { + log.logThreadSafe("**ERROR: IOException while opening pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage()); + isValid = false; + } + } + + + /** + * Gets the outputStream attribute of the LogStorage object + * + * @return The outputStream value + */ + public OutputStream getOutputStream() + { + if (offset > MAXLENGTH) + { + try + { + out.close(); + } + catch (IOException io) + { + log.logThreadSafe("**ERROR: IOException while closing pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage()); + } + openPageFile(); + } + return out; + } + + + /** + * Description of the Method + * + * @param bytes Description of the Parameter + * @return Description of the Return Value + */ + public synchronized int writeToPageFile(byte[] bytes) + { + 
try + { + OutputStream out = getOutputStream(); + int oldOffset = this.offset; + out.write(bytes); + this.offset += bytes.length; + return oldOffset; + } + catch (IOException io) + { + log.logThreadSafe("**ERROR: IOException while writing " + bytes.length + " bytes to pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage()); + } + return -1; + } + + + /** + * Sets the logger attribute of the LogStorage object + * + * @param log The new logger value + */ + public void setLogger(SimpleLogger log) + { + this.log = log; + } + + + /** + * stores the document if storing is enabled + * + * @param doc Description of the Parameter + */ + public void store(WebDocument doc) + { + String docInfo = doc.getInfo(); + if (logContents && isValid && doc.getDocumentBytes() != null) + { + int offset = writeToPageFile(doc.getDocumentBytes()); + docInfo = docInfo + "\t" + pageFileCount + "\t" + offset; + } + log.logThreadSafe(docInfo); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java new file mode 100644 index 00000000000..57037ce3d0f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/NullStorage.java @@ -0,0 +1,26 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.storage; +import de.lanlab.larm.util.*; + +/** + * doesn't do a lot + */ +public class NullStorage implements DocumentStorage +{ + + public NullStorage() + { + } + + public void open() {} + public void store(WebDocument doc) {} + +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java new file mode 100644 index 00000000000..522a8760d24 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/storage/SQLServerStorage.java @@ -0,0 +1,176 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.storage; +import java.sql.*; +import de.lanlab.larm.util.*; +import java.util.*; + +/** + * saves the document into an sql table. At this time only in MS SQL (and probably Sybase) + * a table "Document" with the columns DO_URL(varchar), DO_MimeType(varchar) and + * DO_Data2(BLOB) is created after start
+ * notes: experimental; slow + */ +public class SQLServerStorage implements DocumentStorage +{ + + private Vector freeCons; + private Vector busyCons; + + private Vector freeStatements; + private Vector busyStatements; + + private PreparedStatement addDoc; + + public SQLServerStorage(String driver, String connectionString, String account, String password, int nrConnections) + { + try + { + Class.forName(driver); + freeCons = new Vector(nrConnections); + busyCons = new Vector(nrConnections); + freeStatements = new Vector(nrConnections); + busyStatements = new Vector(nrConnections); + + Connection sqlConn; + PreparedStatement statement; + for(int i=0; i + * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.threads; + +public class ThreadFactory +{ + // static int count = 0; + + public ServerThread createServerThread(int count) + { + return new ServerThread(count); + } +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java new file mode 100644 index 00000000000..84c1ef57fa7 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPool.java @@ -0,0 +1,380 @@ + +package de.lanlab.larm.threads; + +//import java.util.Vector; +import java.util.*; + +/** + * if you have many tasks to accomplish, you can do this with one of the + * following strategies: + *

+ * This thread pool is based on an article in Java-Magazin 06/2000. + * synchronizations were removed unless necessary + * + * + */ +public class ThreadPool implements ThreadingStrategy, TaskReadyListener { + private int maxThreads = MAX_THREADS; + /** + * references to all threads are stored here + */ + private HashMap allThreads = new HashMap(); + /** + * this vector takes all idle threads + */ + private Vector idleThreads = new Vector(); + /** + * this vector takes all threads that are in operation (busy) + */ + private Vector busyThreads = new Vector(); + + /** + * if there are no idleThreads, tasks will go here + */ + private TaskQueue queue = new TaskQueue(); + + /** + * thread pool observers will be notified of status changes + */ + private Vector threadPoolObservers = new Vector(); + + private boolean isStopped = false; + + /** + * default maximum number of threads, if not given by the user + */ + public final static int MAX_THREADS = 5; + + /** + * thread was created + */ + public final static String THREAD_CREATE = "T_CREATE"; + /** + * thread was created + */ + public final static String THREAD_START = "T_START"; + /** + * thread is running + */ + public final static String THREAD_RUNNING = "T_RUNNING"; + /** + * thread was stopped + */ + public final static String THREAD_STOP = "T_STOP"; + /** + * thread was destroyed + */ + public final static String THREAD_END = "T_END"; + /** + * thread is idle + */ + public final static String THREAD_IDLE = "T_IDLE"; + + /** + * a task was added to the queue, because all threads were busy + */ + public final static String THREADQUEUE_ADD = "TQ_ADD"; + + /** + * a task was removed from the queue, because a thread had finished and was + * ready + */ + public final static String THREADQUEUE_REMOVE = "TQ_REMOVE"; + + /** + * this factory will create the tasks + */ + ThreadFactory factory; + + + /** + * this constructor will create the pool with MAX_THREADS threads and the + * default factory + */ + public ThreadPool() 
{ + this(MAX_THREADS, new ThreadFactory()); + } + + + /** + * this constructor will create the pool with the default Factory + * + *@param max the maximum number of threads + */ + public ThreadPool(int max) { + this(max, new ThreadFactory()); + } + + + /** + * constructor + * + *@param max maximum number of threads + *@param factory the thread factory with which the threads will be created + */ + public ThreadPool(int max, ThreadFactory factory) { + maxThreads = max; + this.factory = factory; + } + + + /** + * this init method will create the tasks. It must be called by hand + */ + public void init() { + for (int i = 0; i < maxThreads; i++) { + createThread(i); + } + } + + + /** + * Description of the Method + * + *@param i Description of the Parameter + */ + public void createThread(int i) { + ServerThread s = factory.createServerThread(i); + idleThreads.add(s); + allThreads.put(new Integer(i), s); + s.addTaskReadyListener(this); + sendMessage(i, THREAD_CREATE, ""); + s.start(); + sendMessage(i, THREAD_IDLE, ""); + } + + + // FIXME: synchronisationstechnisch buggy + /** + * Description of the Method + * + *@param i Description of the Parameter + */ + public void restartThread(int i) { + sendMessage(i, THREAD_STOP, ""); + ServerThread t = (ServerThread) allThreads.get(new Integer(i)); + idleThreads.remove(t); + busyThreads.remove(t); + allThreads.remove(new Integer(i)); + t.interruptTask(); + t.interrupt(); + //t.join(); + // deprecated, I know, but the only way to overcome SUN's bugs + t = null; + createThread(i); + } + + + /** + * Description of the Method + * + *@param t Description of the Parameter + *@param key Description of the Parameter + */ + public synchronized void doTask(InterruptableTask t, Object key) { + if (!idleThreads.isEmpty()) { + ServerThread s = (ServerThread) idleThreads.firstElement(); + idleThreads.remove(s); + busyThreads.add(s); + sendMessage(s.getThreadNumber(), THREAD_START, t.getInfo()); + s.runTask(t); + 
sendMessage(s.getThreadNumber(), THREAD_RUNNING, t.getInfo()); + } else { + + queue.insert(t); + sendMessage(-1, THREADQUEUE_ADD, t.getInfo()); + } + } + + + /** + * this will interrupt all threads. Therefore the InterruptableTasks must + * attend on the interrupted-flag + */ + public void interrupt() { + Iterator tasks = queue.iterator(); + while (tasks.hasNext()) { + InterruptableTask t = (InterruptableTask) tasks.next(); + t.interrupt(); + sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo()); + // In der Hoffnung, dass alles klappt... + } + queue.clear(); + Iterator threads = busyThreads.iterator(); + while (threads.hasNext()) { + ((ServerThread) threads.next()).interruptTask(); + } + } + + + /** + * this will interrupt the tasks and end all threads + */ + public void stop() { + isStopped = true; + interrupt(); + Iterator threads = idleThreads.iterator(); + while (threads.hasNext()) { + ((ServerThread) threads.next()).interruptTask(); + } + idleThreads.clear(); + } + + + /** + * wird von einem ServerThread aufgerufen, wenn dieser fertig ist + * + *@param s Description of the Parameter + *@param: ServerThread s - der aufrufende Thread + */ + public synchronized void taskReady(ServerThread s) { + if (isStopped) { + s.interrupt(); + sendMessage(s.getThreadNumber(), THREAD_STOP, s.getTask().getInfo()); + busyThreads.remove(s); + } else if (!queue.isEmpty()) { + InterruptableTask t = (InterruptableTask) queue.remove(); + //queue.remove(t); + sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo()); + sendMessage(s.getThreadNumber(), THREAD_START, ""); + s.runTask(t); + sendMessage(s.getThreadNumber(), THREAD_RUNNING, s.getTask().getInfo()); + } else { + sendMessage(s.getThreadNumber(), THREAD_IDLE, ""); + idleThreads.add(s); + busyThreads.remove(s); + } + synchronized (idleThreads) { + idleThreads.notify(); + } + + } + + + /** + * Description of the Method + */ + public void waitForFinish() { + synchronized (idleThreads) { + while (busyThreads.size() != 0) { + 
//System.out.println("busyThreads: " + busyThreads.size()); + try { + idleThreads.wait(); + } catch (InterruptedException e) { + System.out.println("Interrupted: " + e.getMessage()); + } + } + //System.out.println("busyThreads: " + busyThreads.size()); + } + } + + + /** + * Adds a feature to the ThreadPoolObserver attribute of the ThreadPool + * object + * + *@param o The feature to be added to the ThreadPoolObserver attribute + */ + public void addThreadPoolObserver(ThreadPoolObserver o) { + threadPoolObservers.add(o); + } + + + /** + * Description of the Method + * + *@param threadNr Description of the Parameter + *@param action Description of the Parameter + *@param info Description of the Parameter + */ + protected void sendMessage(int threadNr, String action, String info) { + + Iterator Ie = threadPoolObservers.iterator(); + //System.out.println("ThreadPool: Sende " + action + " message an " + threadPoolObservers.size() + " Observers"); + if (threadNr != -1) { + while (Ie.hasNext()) { + ((ThreadPoolObserver) Ie.next()).threadUpdate(threadNr, action, info); + } + } else { + while (Ie.hasNext()) { + ((ThreadPoolObserver) Ie.next()).queueUpdate(info, action); + } + } + } + + + /** + * Gets the queueSize attribute of the ThreadPool object + * + *@return The queueSize value + */ + public synchronized int getQueueSize() { + return this.queue.size(); + } + + + /** + * Gets the idleThreadsCount attribute of the ThreadPool object + * + *@return The idleThreadsCount value + */ + public synchronized int getIdleThreadsCount() { + return this.idleThreads.size(); + } + + + /** + * Gets the busyThreadsCount attribute of the ThreadPool object + * + *@return The busyThreadsCount value + */ + public synchronized int getBusyThreadsCount() { + return this.busyThreads.size(); + } + + + /** + * Gets the threadCount attribute of the ThreadPool object + * + *@return The threadCount value + */ + public synchronized int getThreadCount() { + return this.idleThreads.size() + 
this.busyThreads.size(); + } + + + /** + * Gets the threadIterator attribute of the ThreadPool object + * + *@return The threadIterator value + */ + public Iterator getThreadIterator() { + return allThreads.values().iterator(); + // return allThreads.iterator(); + } + + + /** + * Description of the Method + * + *@param queue Description of the Parameter + */ + public void setQueue(TaskQueue queue) { + this.queue = queue; + } + + public TaskQueue getTaskQueue() + { + return queue; + } + +} + + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java new file mode 100644 index 00000000000..47e11156265 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadPoolObserver.java @@ -0,0 +1,12 @@ +package de.lanlab.larm.threads; + +import de.lanlab.larm.util.Observer; + +/** + * an observer that observes the thread pool... + */ +public interface ThreadPoolObserver extends Observer +{ + public void queueUpdate(String info, String action); + public void threadUpdate(int threadNr, String action, String info); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java new file mode 100644 index 00000000000..ab78ae89dcb --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/threads/ThreadingStrategy.java @@ -0,0 +1,8 @@ +package de.lanlab.larm.threads; + +public interface ThreadingStrategy +{ + public void doTask(InterruptableTask t, Object key); + public void interrupt(); + public void stop(); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java new file mode 100644 index 00000000000..2cb43ba8831 --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/CachingQueue.java @@ -0,0 +1,721 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * + * Description:

+ * + * Copyright: Copyright (c)

+ * + * Company:

/*
 * Support classes of the disk-backed CachingQueue: StoreException and
 * QueueBlock.
 *
 * @version 1.0
 */
package de.lanlab.larm.util;
import java.io.*;
import java.util.*;


/**
 * Unchecked wrapper for the exception that made storing or loading a queue
 * block fail.
 */
class StoreException extends RuntimeException
{
    /** the wrapped exception; may be null if the creator passed null */
    Exception origException;


    /**
     * Constructor for the StoreException object
     *
     * @param e  the exception that caused the failure
     */
    public StoreException(Exception e)
    {
        origException = e;
    }


    /**
     * @return the message of the wrapped exception, or null if there is none
     */
    public String getMessage()
    {
        return origException == null ? null : origException.getMessage();
    }


    /**
     * Makes the wrapped exception visible to code that walks the cause chain.
     *
     * @return the wrapped exception
     */
    public Throwable getCause()
    {
        return origException;
    }


    /**
     * Prints a header line followed by the stack trace of the wrapped
     * exception to System.err.
     */
    public void printStackTrace()
    {
        System.err.println("StoreException occured with reason: " + getMessage());
        if (origException != null)
        {
            origException.printStackTrace();
        }
        else
        {
            super.printStackTrace();
        }
    }
}

/**
 * internal class that represents one block within a queue
 *
 * @author    Clemens Marschner
 * @created   3. Januar 2002
 */
class QueueBlock
{


    /**
     * the elements of the block; set to null while the block is on disk.
     * The elements must be Serializable
     */
    LinkedList elements;

    /**
     * number of elements in the block; a copy of elements.size()
     */
    int size;

    /**
     * maximum block size
     */
    int maxSize;

    /**
     * if set, elements is null and block was written to file
     */
    boolean onDisk;

    /**
     * name of the block
     */
    String name;


    /**
     * initializes the block
     *
     * @param name     the block name (must be unique, otherwise blocks
     *      collide on the file level)
     * @param maxSize  maximum block size. Overflows and underflows are
     *      reported through exceptions
     */
    public QueueBlock(String name, int maxSize)
    {
        this.name = name;
        this.onDisk = false;
        this.elements = new LinkedList();
        this.maxSize = maxSize;
    }


    /**
     * serializes the block and stores it on disk. The in-memory elements are
     * released only after the file was written and closed successfully, so a
     * failed store leaves the block usable in memory.
     *
     * @exception StoreException  if the file could not be written
     */
    public void store()
        throws StoreException
    {
        try
        {
            FileOutputStream file = new FileOutputStream(getFileName());
            try
            {
                ObjectOutputStream o = new ObjectOutputStream(file);
                o.writeObject(elements);
                o.flush();
            }
            finally
            {
                // also reached when serialization fails: never leak the handle
                file.close();
            }
            elements = null;
            onDisk = true;
        }
        catch (IOException e)
        {
            System.err.println("CachingQueue.store: IOException");
            throw new StoreException(e);
        }
    }


    /**
     * @return   the filename of the block
     */
    String getFileName()
    {
        // package protected!

        return "cachingqueue/" + name + ".cqb";
    }


    /**
     * loads the block from disk and deletes the file afterwards.
     *
     * NOTE(review): this deserializes whatever is found in the relative
     * "cachingqueue" directory; that is only safe while nobody else can write
     * there.
     *
     * @exception StoreException  if the file could not be read or did not
     *      contain a block
     */
    public void load()
        throws StoreException
    {
        try
        {
            LinkedList loaded;
            FileInputStream file = new FileInputStream(getFileName());
            try
            {
                ObjectInputStream i = new ObjectInputStream(file);
                loaded = (LinkedList) i.readObject();
            }
            finally
            {
                // close before the delete below, and also when reading fails
                file.close();
            }
            elements = loaded;
            onDisk = false;
            size = elements.size();
            if (!(new File(getFileName()).delete()))
            {
                System.err.println("CachingQueue.load: file could not be deleted");
            }
        }
        catch (Exception e)
        {
            System.err.println("CachingQueue.load: Exception " + e.getClass().getName() + " occured");
            throw new StoreException(e);
        }
    }


    /**
     * inserts an object at the start of the block, loading the block from
     * disk first if necessary. Must be synchronized by the calling class to
     * be thread safe
     *
     * @param o                   the object to insert
     * @exception StoreException  if the block had to be loaded and that failed
     */
    public void insert(Object o)
        throws StoreException
    {
        if (onDisk)
        {
            load();
        }
        if (size >= maxSize)
        {
            throw new OverflowException();
        }
        elements.addFirst(o);
        size++;
    }


    /**
     * returns the last element of the block and removes it, loading the block
     * from disk first if necessary. Must be synchronized by the calling class
     * to be thread safe
     *
     * @return                        the removed element
     * @exception UnderflowException  if the block is empty
     * @exception StoreException      if the block had to be loaded and that
     *      failed
     */
    public Object remove()
        throws UnderflowException, StoreException
    {
        if (onDisk)
        {
            load();
        }
        if (size <= 0)
        {
            throw new UnderflowException();
        }
        size--;
        return elements.removeLast();
    }


    /**
     * @return   the number of elements in the block
     */
    public int size()
    {
        return size;
    }


    /**
     * destructor. Tries to delete the block's file if the block is still on
     * disk, e.g. because the queue was not empty when the program ended or an
     * exception occurred. A failing delete is ignored on purpose.
     */
    public void finalize()
    {
        if (onDisk)
        {
            new File(getFileName()).delete();
        }
    }
}
+ * + *

+ *         +---+---+---+---+-+
+ *  put -> | M | S | S | S |M| -> remove
+ *         +---+---+---+---+-+
+ * 
+ * the maximum number of entries can be specified with the blockSize parameter. Thus, + * the queue actually holds a maximum number of 2 x blockSize objects in main memory, + * plus a few bytes for each block.
+ * The objects contained in the blocks are stored with the standard Java + * serialization mechanism + * The files are named "cachingqueue\\Queuename_BlockNumber.cqb" + * note that the class is not synchronized + * @author Clemens Marschner + * @created 3. Januar 2002 + */ + +public class CachingQueue implements Queue +{ + + + /** + * the Blocks + */ + LinkedList queueBlocks; + + /** + * fast access to the first block + */ + QueueBlock first = null; + + /** + * fast access to the last block + */ + QueueBlock last = null; + + /** + * maximum block size + */ + int blockSize; + + /** + * "primary key" identity count for each block + */ + int blockCount = 0; + + /** + * active blocks + */ + int numBlocks = 0; + + /** + * queue name + */ + String name; + + /** + * total number of objects + */ + int size; + + + /** + * init + * + * @param name the name of the queue, used in files names + * @param blockSize maximum number of objects stored in one block + */ + public CachingQueue(String name, int blockSize) + { + queueBlocks = new LinkedList(); + this.name = name; + this.blockSize = blockSize; + File cq = new File("cachingqueue"); + cq.mkdir(); + } + + + /** + * inserts an object to the front of the queue + * + * @param o the object to be inserted. 
must implement Serializable + * @exception StoreException encapsulates Exceptions that occur when writing to hard disk + */ + public synchronized void insert(Object o) + throws StoreException + { + if (last == null && first == null) + { + first = last = newBlock(); + queueBlocks.addFirst(first); + numBlocks++; + } + if (last == null && first != null) + { + // assert((last==null && first==null) || (last!= null && first!=null)); + System.err.println("Error in CachingQueue: last!=first==null"); + } + + if (first.size() >= blockSize) + { + // save block and create a new one + QueueBlock newBlock = newBlock(); + numBlocks++; + if (last != first) + { + first.store(); + } + queueBlocks.addFirst(newBlock); + first = newBlock; + } + first.insert(o); + size++; + } + + + /** + * returns the last object from the queue + * + * @return the object returned + * + * @exception StoreException Description of the Exception + * @exception UnderflowException if the queue was empty + */ + public synchronized Object remove() + throws StoreException, UnderflowException + { + if (last == null) + { + throw new UnderflowException(); + } + if (last.size() <= 0) + { + queueBlocks.removeLast(); + numBlocks--; + if (numBlocks == 1) + { + last = first; + } + else if (numBlocks == 0) + { + first = last = null; + throw new UnderflowException(); + } + else if (numBlocks < 0) + { + // assert(numBlocks >= 0) + System.err.println("CachingQueue.remove: numBlocks<0!"); + throw new UnderflowException(); + } + else + { + last = (QueueBlock) queueBlocks.getLast(); + } + } + --size; + return last.remove(); + } + + + /** + * not supported + * + * @param c Description of the Parameter + */ + public void insertMultiple(java.util.Collection c) + { + throw new UnsupportedOperationException(); + } + + + /** + * creates a new block + * + * @return Description of the Return Value + */ + private QueueBlock newBlock() + { + return new QueueBlock(name + "_" + blockCount++, blockSize); + } + + + /** + * total number of 
objects contained in the queue + * + * @return Description of the Return Value + */ + public int size() + { + return size; + } + + + /** + * testing + * + * @param args The command line arguments + */ + public static void main(String[] args) + { + System.out.println("Test1: " + CachingQueueTester.testUnderflow()); + System.out.println("Test2: " + CachingQueueTester.testInsert()); + System.out.println("Test3: " + CachingQueueTester.testBufReadWrite()); + System.out.println("Test4: " + CachingQueueTester.testBufReadWrite2()); + System.out.println("Test5: " + CachingQueueTester.testUnderflow2()); + System.out.println("Test6: " + CachingQueueTester.testBufReadWrite3()); + System.out.println("Test7: " + CachingQueueTester.testExceptions()); + } +} + +/** + * Testklasse TODO: auslagern und per JUnit handhaben + * + * @author Administrator + * @created 3. Januar 2002 + */ +class AssertionFailedException extends RuntimeException +{ +} + +/** + * Testklasse. Enthält einige Tests für die Funktionalität der CachingQueue + * + * @author Administrator + * @created 3. 
Januar 2002 + */ +class CachingQueueTester +{ + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testUnderflow() + { + CachingQueue cq = new CachingQueue("testQueue1", 10); + try + { + cq.remove(); + } + catch (UnderflowException e) + { + return true; + } + catch (Exception e) + { + e.printStackTrace(); + } + return false; + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testInsert() + { + CachingQueue cq = new CachingQueue("testQueue2", 10); + String test = "Test1"; + assert(cq.size() == 0); + cq.insert(test); + assert(cq.size() == 1); + return (cq.remove() == test); + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testBufReadWrite() + { + CachingQueue cq = new CachingQueue("testQueue3", 2); + String test1 = "Test1"; + String test2 = "Test2"; + String test3 = "Test3"; + cq.insert(test1); + cq.insert(test2); + cq.insert(test3); + assert(cq.size() == 3); + cq.remove(); + cq.remove(); + assert(cq.size() == 1); + return (cq.remove() == test3); + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testBufReadWrite2() + { + CachingQueue cq = new CachingQueue("testQueue4", 2); + String test1 = "Test1"; + String test2 = "Test2"; + String test3 = "Test3"; + String test4 = "Test4"; + String test5 = "Test5"; + cq.insert(test1); + cq.insert(test2); + cq.insert(test3); + cq.insert(test4); + cq.insert(test5); + assert(cq.size() == 5); + String t = (String) cq.remove(); + assert(t.equals(test1)); + t = (String) cq.remove(); + assert(t.equals(test2)); + t = (String) cq.remove(); + assert(t.equals(test3)); + t = (String) cq.remove(); + assert(t.equals(test4)); + t = (String) cq.remove(); + assert(cq.size() == 0); + return (t.equals(test5)); + } + + + /** + * Description of the Method + * + * @param expr Description of the 
Parameter + */ + public static void assert(boolean expr) + { + if (!expr) + { + throw new AssertionFailedException(); + } + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testUnderflow2() + { + CachingQueue cq = new CachingQueue("testQueue5", 2); + String test1 = "Test1"; + String test2 = "Test2"; + String test3 = "Test3"; + String test4 = "Test4"; + String test5 = "Test5"; + cq.insert(test1); + cq.insert(test2); + cq.insert(test3); + cq.insert(test4); + cq.insert(test5); + assert(cq.remove().equals(test1)); + assert(cq.remove().equals(test2)); + assert(cq.remove().equals(test3)); + assert(cq.remove().equals(test4)); + assert(cq.remove().equals(test5)); + try + { + cq.remove(); + } + catch (UnderflowException e) + { + return true; + } + return false; + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testBufReadWrite3() + { + CachingQueue cq = new CachingQueue("testQueue4", 1); + String test1 = "Test1"; + String test2 = "Test2"; + String test3 = "Test3"; + String test4 = "Test4"; + String test5 = "Test5"; + cq.insert(test1); + cq.insert(test2); + cq.insert(test3); + cq.insert(test4); + cq.insert(test5); + String t = (String) cq.remove(); + assert(t.equals(test1)); + t = (String) cq.remove(); + assert(t.equals(test2)); + t = (String) cq.remove(); + assert(t.equals(test3)); + t = (String) cq.remove(); + assert(t.equals(test4)); + t = (String) cq.remove(); + return (t.equals(test5)); + } + + + /** + * A unit test for JUnit + * + * @return Description of the Return Value + */ + public static boolean testExceptions() + { + System.gc(); + CachingQueue cq = new CachingQueue("testQueue5", 1); + String test1 = "Test1"; + String test2 = "Test2"; + String test3 = "Test3"; + String test4 = "Test4"; + String test5 = "Test5"; + cq.insert(test1); + cq.insert(test2); + cq.insert(test3); + cq.insert(test4); + cq.insert(test5); + try + { + if (!(new 
File("testQueue5_1.cqb").delete())) + { + System.err.println("CachingQueueTester.textExceptions: Store 1 nicht vorhanden. Filename geändert?"); + } + if (!(new File("testQueue5_2.cqb").delete())) + { + System.err.println("CachingQueueTester.textExceptions: Store 2 nicht vorhanden. Filename geändert?"); + } + String t = (String) cq.remove(); + assert(t.equals(test1)); + t = (String) cq.remove(); + assert(t.equals(test2)); + t = (String) cq.remove(); + assert(t.equals(test3)); + t = (String) cq.remove(); + assert(t.equals(test4)); + t = (String) cq.remove(); + assert(t.equals(test5)); + } + catch (StoreException e) + { + return true; + } + finally + { + cq = null; + System.gc(); + // finalizer müssten aufgerufen werden + } + return false; + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java new file mode 100644 index 00000000000..231c17d3f9f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ClassInfo.java @@ -0,0 +1,273 @@ +package de.lanlab.larm.util; + +import java.lang.reflect.*; +import java.io.*; +import java.util.*; + +/** + * Title: LARM Lanlab Retrieval Machine + * Description: + * Copyright: Copyright (c) + * Company: + * @author + * @version 1.0 + */ + +/** + * prints class information with the reflection api + * for debugging only + */ +public class ClassInfo +{ + + public ClassInfo() + { + } + + /** + * Usage: java ClassInfo PackageName.MyNewClassName PackageName.DerivedClassName + */ + public static void main(String[] args) + { + + String name = args[0]; + String derivedName = args[1]; + LinkedList l = new LinkedList(); + ListIterator itry = l.listIterator(); + + try + { + Class cls = Class.forName(name); + name = cls.getName(); + String pkg = getPackageName(name); + String clss = getClassName(name); + + StringWriter importsWriter = new StringWriter(); + PrintWriter imports = new 
PrintWriter(importsWriter); + StringWriter outWriter = new StringWriter(); + PrintWriter out = new PrintWriter(outWriter); + + TreeSet importClasses = new TreeSet(); + importClasses.add(getImportStatement(name)); + + out.println("/**\n * (class description here)\n */\npublic class " + derivedName + " " + (cls.isInterface() ? "implements " : "extends ") + clss + "\n{"); + + Method[] m = cls.getMethods(); + for(int i= 0; i< m.length; i++) + { + Method thism = m[i]; + if((thism.getModifiers() & Modifier.PRIVATE) == 0 && ((thism.getModifiers() & Modifier.FINAL) == 0) + && (thism.getDeclaringClass().getName() != "java.lang.Object")) + { + out.println(" /**"); + out.println(" * (method description here)"); + out.println(" * defined in " + thism.getDeclaringClass().getName()); + + Class[] parameters = thism.getParameterTypes(); + for(int j = 0; j < parameters.length; j ++) + { + if(getPackageName(parameters[j].getName()) != "") + { + importClasses.add(getImportStatement(parameters[j].getName())); + } + out.println(" * @param p" + j + " (parameter description here)"); + } + + if(thism.getReturnType().getName() != "void") + { + String returnPackage = getPackageName(thism.getReturnType().getName()); + if(returnPackage != "") + { + importClasses.add(getImportStatement(thism.getReturnType().getName())); + } + out.println(" * @return (return value description here)"); + } + + out.println(" */"); + + out.print(" " + getModifierString(thism.getModifiers()) + getClassName(thism.getReturnType().getName()) + " "); + out.print(thism.getName() + "("); + + for(int j = 0; j < parameters.length; j ++) + { + if(j>0) + { + out.print(", "); + } + out.print(getClassName(parameters[j].getName()) + " p" + j); + } + out.print(")"); + Class[] exceptions = thism.getExceptionTypes(); + + if (exceptions.length > 0) + { + out.print(" throws "); + } + + for(int k = 0; k < exceptions.length; k++) + { + if(k > 0) + { + out.print(", "); + } + String exCompleteName = exceptions[k].getName(); + String 
exName = getClassName(exCompleteName); + importClasses.add(getImportStatement(exCompleteName)); + + out.print(exName); + } + out.print("\n" + + " {\n" + + " /**@todo: Implement this " + thism.getName() + "() method */\n" + + " throw new UnsupportedOperationException(\"Method " + thism.getName() + "() not yet implemented.\");\n" + + " }\n\n"); + + + } + } + out.println("}"); + + Iterator importIterator = importClasses.iterator(); + while(importIterator.hasNext()) + { + String importName = (String)importIterator.next(); + if(!importName.startsWith("java.lang")) + { + imports.println("import " + importName + ";"); + } + } + + out.flush(); + imports.flush(); + + if(getPackageName(derivedName) != "") + { + System.out.println("package " + getPackageName(derivedName) + ";\n"); + } + System.out.println( "/**\n" + + " * Title: \n" + + " * Description:\n" + + " * Copyright: Copyright (c)\n" + + " * Company:\n" + + " * @author\n" + + " * @version 1.0\n" + + " */\n"); + System.out.println(importsWriter.getBuffer()); + System.out.print(outWriter.getBuffer()); + } + catch(Throwable t) + { + t.printStackTrace(); + } + } + + public static String getPackageName(String className) + { + if(className.charAt(0) == '[') + { + switch(className.charAt(1)) + { + case 'L': + return getPackageName(className.substring(2,className.length()-1)); + default: + return ""; + } + } + String name = className.lastIndexOf(".") != -1 ? className.substring(0, className.lastIndexOf(".")) : ""; + //System.out.println("Package: " + name); + return name; + } + + public static String getClassName(String className) + { + if(className.charAt(0) == '[') + { + switch(className.charAt(1)) + { + case 'L': + return getClassName(className.substring(2,className.length()-1)) + "[]"; + case 'C': + return "char[]"; + case 'I': + return "int[]"; + case 'B': + return "byte[]"; + // rest is missing here + + } + } + String name = (className.lastIndexOf(".") > -1) ? 
className.substring(className.lastIndexOf(".")+1) : className; + //System.out.println("Class: " + name); + return name; + } + + static String getImportStatement(String className) + { + String pack = getPackageName(className); + String clss = getClassName(className); + if(clss.indexOf("[]") > -1) + { + return pack + "." + clss.substring(0,clss.length() - 2); + } + else + { + return pack + "." + clss; + } + } + + public static String getModifierString(int modifiers) + { + StringBuffer mods = new StringBuffer(); + if((modifiers & Modifier.ABSTRACT) != 0) + { + mods.append("abstract "); + } + if((modifiers & Modifier.FINAL) != 0) + { + mods.append("final "); + } + if((modifiers & Modifier.INTERFACE) != 0) + { + mods.append("interface "); + } + if((modifiers & Modifier.NATIVE) != 0) + { + mods.append("native "); + } + if((modifiers & Modifier.PRIVATE) != 0) + { + mods.append("private "); + } + if((modifiers & Modifier.PROTECTED) != 0) + { + mods.append("protected "); + } + if((modifiers & Modifier.PUBLIC) != 0) + { + mods.append("public "); + } + if((modifiers & Modifier.STATIC) != 0) + { + mods.append("static "); + } + if((modifiers & Modifier.STRICT) != 0) + { + mods.append("strictfp "); + } + if((modifiers & Modifier.SYNCHRONIZED) != 0) + { + mods.append("synchronized "); + } + if((modifiers & Modifier.TRANSIENT) != 0) + { + mods.append("transient "); + } + if((modifiers & Modifier.VOLATILE) != 0) + { + mods.append("volatile "); + } + return mods.toString(); + } + + +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java new file mode 100644 index 00000000000..6b0d16fb6d1 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/HashedCircularLinkedList.java @@ -0,0 +1,319 @@ +package de.lanlab.larm.util; + +/** + * Title: + * Description: + * Copyright: Copyright (c) + * 
Company:
 * @author
 * @version 1.0
 */

import java.util.*;

/**
 * simple hashed linked list. It allows for inserting and removing elements like
 * in a hash table (in fact, it uses a HashMap), while still being able to easily
 * traverse the collection like a list. In addition, the iterator is circular. It
 * always returns a next element as long as there are elements in the list. In
 * contrast to the iterator of Sun's collection classes, this class can cope with
 * inserts and removals while traversing the list.<br>
 * Elements are always added to the end of the list, that is, always at the same place<br>
 * All operations should work in near constant time as the list grows. Only the
 * trade-off costs of a hash (memory versus speed) have to be considered.
 * Entries with a null key are rejected by put()
 * @todo put the traversal function into an Iterator
 * @todo implement the class as a derivate from a Hash
 */
public class HashedCircularLinkedList
{


    /**
     * Entry class: one node of the doubly linked ring.
     */
    private static class Entry
    {
        Object key;
        Object element;
        Entry next;
        Entry previous;

        Entry(Object element, Entry next, Entry previous, Object key)
        {
            this.element = element;
            this.next = next;
            this.previous = previous;
            this.key = key;
        }
    }

    /**
     * the list. contains objects. The header is a sentinel that closes the
     * ring and never carries an element
     */
    private transient Entry header = new Entry(null, null, null, null);

    /**
     * the hash. maps keys to entries, which by themselves map to objects
     */
    HashMap keys;

    private transient int size = 0;

    /** the current entry in the traversal */
    Entry current = null;

    /**
     * Constructs an empty list.
     *
     * @param initialCapacity  initial capacity of the underlying HashMap
     * @param loadFactor       load factor of the underlying HashMap
     */
    public HashedCircularLinkedList(int initialCapacity, float loadFactor)
    {
        header.next = header.previous = header;
        keys = new HashMap(initialCapacity, loadFactor);
    }

    /**
     * Returns the number of elements in this list.
     *
     * @return the number of elements in this list.
     */
    public int size()
    {
        return size;
    }

    /**
     * Removes the entry stored under the given key. If that entry is the
     * current traversal position, the position steps back by one entry, so
     * the following next() returns the removed entry's successor.
     *
     * @param o  key of the entry to remove
     * @return   true if an entry was stored under the key
     */
    public boolean removeByKey(Object o)
    {
        // assert(o != null)
        Entry e = (Entry)keys.get(o);
        if(e != null)
        {
            if(e == current)
            {
                if(size > 1)
                {
                    current = previousEntry(current);
                }
                else
                {
                    current = null;
                }
            }
            this.removeEntryFromList(e);
            keys.remove(o);
            size--;
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Removes all of the elements from this list.
     */
    public void clear()
    {
        // list
        header.next = header.previous = header;

        // hash
        keys.clear();

        size = 0;
        current = null;
    }


    /** links a new entry into the ring directly in front of e */
    private Entry addEntryBefore(Object key, Object o, Entry e)
    {
        Entry newEntry = new Entry(o, e, e.previous, key);
        newEntry.previous.next = newEntry;
        newEntry.next.previous = newEntry;
        return newEntry;
    }

    /** unlinks e from the ring; the sentinel itself cannot be removed */
    private void removeEntryFromList(Entry e)
    {
        if(e != null)
        {
            if (e == header)
            {
                throw new NoSuchElementException();
            }

            e.previous.next = e.next;
            e.next.previous = e.previous;
        }
    }


    /**
     * Appends a value to the end of the list and registers it under the key.
     *
     * @param key    the key; must be non-null and not yet present
     * @param value  the element to store
     * @return       true if the element was added, false if the key was null
     *               or already in use
     */
    public boolean put(Object key, Object value)
    {
        if(key != null && !keys.containsKey(key))
        {
            Entry e = addEntryBefore(key, value, header);  // add it as the last element
            keys.put(key, e); // link key to entry
            size++;
            return true;
        }
        else
        {
            return false;
        }
    }


    /**
     * @return true as long as the list is not empty (the traversal is circular)
     */
    public boolean hasNext()
    {
        return (size > 0);
    }

    /** successor of e in the ring, skipping the sentinel; null means "start" */
    private Entry nextEntry(Entry e)
    {
        // assert(e != null)
        if(size > 1)
        {
            if(e == null)
            {
                e = header;
            }
            Entry next = e.next;
            if(next == header)
            {
                next = next.next;
            }
            return next;
        }
        else if(size == 1)
        {
            return header.next;
        }
        else
        {
            return null;
        }
    }



    /** predecessor of e in the ring, skipping the sentinel; null means "start" */
    private Entry previousEntry(Entry e)
    {
        // assert(e != null)
        if(size > 1)
        {
            if(e == null)
            {
                e = header;
            }
            Entry previous = e.previous;
            if(previous == header)
            {
                previous = previous.previous;
            }
            return previous;
        }
        else if(size == 1)
        {
            return header.previous;
        }
        else
        {
            return null;
        }
    }

    /**
     * Advances the circular traversal by one entry.
     *
     * @return the element at the new position, or null if the list is empty
     */
    public Object next()
    {
        current = nextEntry(current);
        if(current != null)
        {
            return current.element;
        }
        else
        {
            return null;
        }
    }

    /**
     * Removes the entry last returned by next(). Does nothing if there is no
     * current entry.
     */
    public void removeCurrent()
    {
        if(current != null)
        {
            // removeByKey keeps size, the hash and the traversal position
            // consistent; unlinking the entry by hand left size too large and
            // current pointing at an entry that is no longer in the ring
            removeByKey(current.key);
        }
    }


    /**
     * @param key  the key to look up
     * @return     the element stored under the key, or null if there is none
     */
    public Object get(Object key)
    {
        Entry e = ((Entry)keys.get(key));
        if(e != null)
        {
            return e.element;
        }
        else
        {
            return null;
        }
    }

    /**
     * testing
     */
    public static void main(String[] args)
    {
        HashedCircularLinkedList h = new HashedCircularLinkedList(20, 0.75f);
        h.put("1", "a");
        h.put("2", "b");
        h.put("3", "c");
        String t;
        System.out.println("size [3]: " + h.size());
        t = (String)h.next();
        System.out.println("2nd element via get [b]: " + h.get("2"));

        System.out.println("next element [a]: " + t);
        t = (String)h.next();
        System.out.println("next element [b]: " + t);
        t = (String)h.next();
        System.out.println("next element [c]: " + t);
        t = (String)h.next();
        System.out.println("1st element after circular traversal [a]: " + t);
        h.removeByKey("1");
        System.out.println("1st element after remove [null]: " + h.get("1"));
        System.out.println("size after removal [2]: " + h.size());
        t = (String)h.next();
        System.out.println("next element [b]: " + t);
        t = (String)h.next();
        System.out.println("next element [c]: " + t);
        t = (String)h.next();
        System.out.println("next element [b]: " + t);
        h.removeCurrent();
        t = (String)h.next();
        System.out.println("next element after 1 removal [c]: " + t);
        t = (String)h.next();
        System.out.println("next element: [c]: " + t);
        h.removeByKey("3");
        System.out.println("size after 3 removals [0]: " + h.size());
        t = (String)h.next();
        System.out.println("next element [null]: " + t);
    }
}


diff --git 
a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java new file mode 100644 index 00000000000..c16940ffac5 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/InputStreamObserver.java @@ -0,0 +1,18 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.util; + +public interface InputStreamObserver +{ + public void notifyOpened(ObservableInputStream in, long timeElapsed); + public void notifyClosed(ObservableInputStream in, long timeElapsed); + public void notifyRead(ObservableInputStream in, long timeElapsed, int nrRead, int totalRead); + public void notifyFinished(ObservableInputStream in, long timeElapsed, int totalRead); +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java new file mode 100644 index 00000000000..2564b661c14 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Logger.java @@ -0,0 +1,19 @@ +/* + * + * + * + */ +package de.lanlab.larm.util; + +import java.io.*; + +public class Logger +{ + private FileOutputStream out; + + public Logger(String fileName) + { + + } + +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java new file mode 100644 index 00000000000..d261d2bd75d --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/ObservableInputStream.java @@ -0,0 +1,101 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

+ * Description:

+ * Copyright: Copyright (c)

+ * Company:

+ * @author + * @version 1.0 + */ +package de.lanlab.larm.util; + +import java.io.*; + +public class ObservableInputStream extends FilterInputStream +{ + private boolean reporting = true; + private long startTime; + private int totalRead = 0; + private int step = 1; + private int nextStep = 0; + + InputStreamObserver observer; + + public ObservableInputStream(InputStream in, InputStreamObserver iso, int reportingStep) + { + super(in); + startTime = System.currentTimeMillis(); + observer = iso; + observer.notifyOpened(this, System.currentTimeMillis() - startTime); + nextStep = step = reportingStep; + } + + public void close() throws IOException + { + super.close(); + observer.notifyClosed(this, System.currentTimeMillis() - startTime); + } + + public void setReporting(boolean reporting) + { + this.reporting = reporting; + } + + public boolean isReporting() + { + return reporting; + } + + public void setReportingStep(int step) + { + this.step = step; + } + + public int read() throws IOException + { + int readByte = super.read(); + if(reporting) + { + notifyObserver(readByte>=0? 
1 : 0); + } + return readByte; + } + + public int read(byte[] b) throws IOException + { + int nrRead = super.read(b); + if(reporting) + { + notifyObserver(nrRead); + } + return nrRead; + } + + private void notifyObserver(int nrRead) + { + if(nrRead > 0) + { + totalRead += nrRead; + if(totalRead > nextStep) + { + nextStep += step; + observer.notifyRead(this, System.currentTimeMillis() - startTime, nrRead, totalRead); + } + } + else + { + observer.notifyFinished(this, System.currentTimeMillis() - startTime, totalRead); + } + } + + public int read(byte[] b, int offs, int size) throws IOException + { + int nrRead = super.read(b, offs, size); + if(reporting) + { + notifyObserver(nrRead); + } + return nrRead; + } +} + diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java new file mode 100644 index 00000000000..a81095094da --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Observer.java @@ -0,0 +1,9 @@ +package de.lanlab.larm.util; + + +/** + * not used + */ +public interface Observer +{ +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java new file mode 100644 index 00000000000..a1f427e667a --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/OverflowException.java @@ -0,0 +1,15 @@ +package de.lanlab.larm.util; + +/** + * Title: LARM + * Description: + * Copyright: Copyright (c) 2001 + * Company: LMU-IP + * @author Clemens Marschner + * @version 1.0 + */ + + +public class OverflowException extends RuntimeException +{ +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java new file mode 100644 index 00000000000..26105c3c333 --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/Queue.java @@ -0,0 +1,20 @@ +package de.lanlab.larm.util; + +/** + * Title: LARM Lanlab Retrieval Machine + * Description: + * Copyright: Copyright (c) + * Company: + * @author + * @version 1.0 + */ + +import java.util.Collection; + +public interface Queue +{ + public Object remove(); + public void insert(Object o); + public void insertMultiple(Collection c); + public int size(); +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java new file mode 100644 index 00000000000..2e1cfd4c903 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleCharArrayReader.java @@ -0,0 +1,285 @@ +/* + * @(#)SimpleCharArrayReader.java 1.35 00/02/02 + * + */ + +package de.lanlab.larm.util; +import java.io.*; + +/** + * A SimpleCharArrayReader contains + * an internal buffer that contains bytes that + * may be read from the stream. An internal + * counter keeps track of the next byte to + * be supplied by the read method. + *
/**
 * A character-array backed {@link Reader} that does no locking at all.
 * <p>
 * In contrast to the original CharArrayReader this version is not thread safe.
 * The monitor on the read() function caused programs to slow down a lot,
 * because that function is called for every character. This class can thus
 * only be used if only one thread is accessing the stream.
 *
 * @author    Clemens Marschner
 * @version   1.00
 * @see       java.io.CharArrayReader
 * @see       java.io.ByteArrayInputStream
 */
public class SimpleCharArrayReader extends Reader
{

    /**
     * Set to true by close(). The flag is only recorded: none of the read
     * methods consult it, so reading after close() keeps working.
     */
    private boolean isClosed = false;

    /**
     * The character array provided by the creator of the stream. Elements
     * buf[0] through buf[count-1] are the only characters that can ever be
     * read from the stream; element buf[pos] is the next character to be read.
     */
    protected char buf[];

    /**
     * The index of the next character to read from the buffer. This value
     * should always be nonnegative and not larger than the value of count.
     */
    protected int pos;

    /**
     * The currently marked position in the stream. Readers are marked at their
     * start position when constructed (zero, or the offset given to the
     * three-argument constructor). mark() moves the marked position and
     * reset() moves pos back to it.
     */
    protected int mark = 0;

    /**
     * The index one greater than the last valid character in the buffer. This
     * value should always be nonnegative and not larger than the length of buf.
     */
    protected int count;

    /**
     * Creates a reader over the whole array. The array is not copied; pos
     * starts at 0 and count is the length of buf.
     *
     * @param buf  the input buffer.
     */
    public SimpleCharArrayReader(char buf[])
    {
        this.buf = buf;
        this.pos = 0;
        this.count = buf.length;
    }

    /**
     * Creates a reader over a section of the array. The array is not copied.
     * pos and the marked position start at offset, and count is offset+length,
     * capped at the length of buf. Reading therefore delivers buf[offset]
     * through buf[count-1], and reset() returns to offset.
     *
     * @param buf     the input buffer.
     * @param offset  the index in the buffer of the first character to read.
     * @param length  the maximum number of characters to read from the buffer.
     */
    public SimpleCharArrayReader(char buf[], int offset, int length)
    {
        this.buf = buf;
        this.pos = offset;
        this.count = Math.min(offset + length, buf.length);
        this.mark = offset;
    }

    /**
     * Reads the next character. The character is returned unchanged as an int
     * in the range 0 to 65535. Earlier versions masked the value with 0xff,
     * which destroyed every character above U+00FF.
     *
     * @return  the next character, or -1 if the end of the buffer has been
     *          reached.
     */
    public int read()
    {
        return (pos < count) ? buf[pos++] : -1;
    }

    /**
     * Copies up to len characters into b, starting at b[off]. The number of
     * characters copied is the smaller of len and count-pos; pos is advanced
     * by that amount. This method cannot block.
     *
     * @param b    the buffer into which the data is read.
     * @param off  the start offset in b.
     * @param len  the maximum number of characters to read.
     * @return     the number of characters copied, 0 if len is 0, or -1 if the
     *             end of the buffer has been reached.
     * @throws NullPointerException       if b is null.
     * @throws IndexOutOfBoundsException  if off and len do not describe a
     *                                    range inside b.
     */
    public int read(char b[], int off, int len)
    {
        if (b == null)
        {
            throw new NullPointerException();
        }
        if ((off < 0) || (off > b.length) || (len < 0) ||
            ((off + len) > b.length) || ((off + len) < 0))
        {
            throw new IndexOutOfBoundsException();
        }
        if (pos >= count)
        {
            return -1;
        }
        // compare against the remainder instead of adding to pos: no overflow
        if (len > count - pos)
        {
            len = count - pos;
        }
        if (len <= 0)
        {
            return 0;
        }
        System.arraycopy(buf, pos, b, off, len);
        pos += len;
        return len;
    }

    /**
     * Skips up to n characters. Fewer characters are skipped if the end of
     * the buffer is reached first; a negative n skips nothing.
     *
     * @param n  the number of characters to be skipped.
     * @return   the actual number of characters skipped.
     */
    public long skip(long n)
    {
        // work with the remainder so that huge values of n cannot overflow
        long remaining = count - pos;
        if (n > remaining)
        {
            n = remaining;
        }
        if (n < 0)
        {
            return 0;
        }
        pos += (int) n;
        return n;
    }

    /**
     * Returns count - pos, the number of characters that remain to be read
     * from the buffer.
     *
     * @return  the number of characters that can still be read.
     */
    public int available()
    {
        return count - pos;
    }

    /**
     * Tells that mark() and reset() are supported.
     *
     * @return  always true.
     */
    public boolean markSupported()
    {
        return true;
    }

    /**
     * Remembers the current position as the marked position.
     *
     * @param readAheadLimit  ignored; the whole buffer stays available.
     */
    public void mark(int readAheadLimit)
    {
        mark = pos;
    }

    /**
     * Moves pos back to the marked position. That is the start position of
     * the reader unless mark() has been called.
     */
    public void reset()
    {
        pos = mark;
    }

    /**
     * Records that the reader has been closed. There are no system resources
     * to release.
     *
     * @throws IOException  never thrown by this implementation; declared
     *                      because Reader.close() declares it.
     */
    public void close() throws IOException
    {
        isClosed = true;
    }

}
+ * It registers with a logger manager, which can be used to flush several loggers + * at once + * @todo: including the date slows down a lot + * + */ +public class SimpleLogger +{ + private SimpleDateFormat formatter = new SimpleDateFormat ("HH:mm:ss:SSSS"); + + Writer logFile; + + StringBuffer buffer = new StringBuffer(1000); + + long startTime = System.currentTimeMillis(); + boolean includeDate; + + public void setStartTime(long startTime) + { + this.startTime = startTime; + } + + public synchronized void logThreadSafe(String text) + { + log(text); + } + + public synchronized void logThreadSafe(Throwable t) + { + log(t); + } + + public void log(String text) + { + try + { + buffer.setLength(0); + if(includeDate) + { + buffer.append(formatter.format(new Date())).append(": ").append(System.currentTimeMillis()-startTime).append(" ms: "); + } + buffer.append(text).append("\n"); + logFile.write(buffer.toString()); + if(flushAtOnce) + { + logFile.flush(); + } + } + catch(IOException e) + { + System.out.println("Couldn't write to logfile"); + } + } + + public void log(Throwable t) + { + t.printStackTrace(new PrintWriter(logFile)); + } + + boolean flushAtOnce = false; + + public void setFlushAtOnce(boolean flush) + { + this.flushAtOnce = flush; + } + + public SimpleLogger(String name) + { + init(name, true); + } + + public SimpleLogger(String name, boolean includeDate) + { + init(name, includeDate); + } + + public void flush() throws IOException + { + logFile.flush(); + } + + private void init(String name, boolean includeDate) + { + try + { + logFile = new BufferedWriter(new FileWriter("logs/" + name + ".log")); + SimpleLoggerManager.getInstance().register(this); + } + catch(IOException e) + { + System.out.println("IOException while creating logfile " + name + ":"); + e.printStackTrace(); + } + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java new file mode 100644 index 00000000000..44717249305 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleLoggerManager.java @@ -0,0 +1,65 @@ +package de.lanlab.larm.util; + +/** + * Title: LARM Lanlab Retrieval Machine + * Description: + * Copyright: Copyright (c) + * Company: + * @author + * @version 1.0 + */ + +import java.util.*; +import java.io.IOException; + +/** + * this singleton manages all loggers. It can be used to flush all SimpleLoggers + * at once + */ +public class SimpleLoggerManager +{ + static SimpleLoggerManager instance = null; + + ArrayList logs; + + private SimpleLoggerManager() + { + logs = new ArrayList(); + } + + public void register(SimpleLogger logger) + { + logs.add(logger); + } + + public void flush() throws IOException + { + Iterator it = logs.iterator(); + IOException ex = null; + while(it.hasNext()) + { + try + { + SimpleLogger logger = (SimpleLogger)it.next(); + logger.flush(); + } + catch(IOException e) + { + ex = e; + } + } + if(ex != null) + { + throw ex; + } + } + + public static SimpleLoggerManager getInstance() + { + if(instance == null) + { + instance = new SimpleLoggerManager(); + } + return instance; + } +} \ No newline at end of file diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java new file mode 100644 index 00000000000..a24f9f2e181 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/SimpleObservable.java @@ -0,0 +1,21 @@ + +/** + * Title: LARM Lanlab Retrieval Machine

/**
 * Title:        LARM Lanlab Retrieval Machine
 * Description:
 * Copyright:    Copyright (c)
 * Company:
 * @author
 * @version 1.0
 */

import java.util.Observable;

/**
 * An Observable whose change flag can be raised from outside the class.
 * Observable declares setChanged() as protected; this subclass re-declares it
 * as public and adds nothing else.
 */
public class SimpleObservable extends Observable
{
    /**
     * Marks this object as changed by delegating to the inherited
     * implementation; hasChanged() returns true afterwards.
     */
    public void setChanged()
    {
        super.setChanged();
    }
}

/**
 * Thread safe state information.
 * <p>
 * The get methods are not synchronized. Clone the state object before using
 * them. If you use a state object in a class, always return a clone:
 * <pre>
 * public class MyClass {
 *     State state = new State("Running");
 *     public State getState() { return state.cloneState(); }
 * }
 * </pre>
 *
 * Note on serialization: if you deserialize a state, the state string will be
 * newly created. That means you then have to compare the states via equals()
 * and not ==.
 */
public class State implements Cloneable, Serializable
{

    /** Name of the current state. */
    private String state;

    /** Time in milliseconds at which the current state was entered. */
    private long stateSince;

    /** Optional extra information attached to the current state; may be null. */
    private Object info;

    /**
     * Creates a state that starts now and carries no extra information.
     * The fields are assigned directly instead of going through setState(),
     * so that no overridable method runs on a half-constructed object.
     *
     * @param state  name of the initial state.
     */
    public State(String state)
    {
        this.state = state;
        this.stateSince = System.currentTimeMillis();
        this.info = null;
    }

    /**
     * Enters a new state without extra information.
     *
     * @param state  name of the new state.
     */
    public void setState(String state)
    {
        setState(state, null);
    }

    /**
     * Enters a new state and restarts its clock. All three fields change
     * while the monitor is held, so a concurrent clone() never observes a
     * mixture of the old and the new state.
     *
     * @param state  name of the new state.
     * @param info   extra information for the new state; may be null.
     */
    public synchronized void setState(String state, Object info)
    {
        this.state = state;
        this.stateSince = System.currentTimeMillis();
        this.info = info;
    }

    /**
     * @return  the name of the current state.
     */
    public String getState()
    {
        return state;
    }

    /**
     * @return  the time in milliseconds at which the current state was entered.
     */
    public long getStateSince()
    {
        return stateSince;
    }

    /**
     * @return  the extra information of the current state, or null.
     */
    public Object getInfo()
    {
        return info;
    }

    /**
     * Returns a snapshot of this state, taken while the monitor is held. The
     * copy is made through super.clone(), so it has the same runtime class as
     * this object. The info object itself is shared, not copied.
     *
     * @return  a copy holding the same state, time and info.
     */
    public synchronized Object clone()
    {
        try
        {
            return super.clone();
        }
        catch (CloneNotSupportedException e)
        {
            // cannot happen: this class implements Cloneable
            throw new InternalError(e.toString());
        }
    }

    /**
     * Typed convenience wrapper around clone().
     *
     * @return  a copy of this state.
     */
    public State cloneState()
    {
        return (State)clone();
    }

}
Januar 2002 + */ +public class URLUtils +{ + /** + * Builds the external string form of a URL from its protocol, authority and file parts, + * leaving out the ref part. Per the original author this does the same as URL.toExternalForm() + * apart from the missing ref (which we would cut off anyway). + * The result consists of the protocol, a colon, the authority preceded by two slashes (written only + * when the authority is non-empty) and the file part; a single slash is written when the file part + * is null or empty. + * The StringBuffer is pre-sized from the lengths of these parts, with the intent that no call of + * expandCapacity() will be necessary while appending. + * Only meaningful if the default URLStreamHandler is used (as is the case with http, https, or shttp). + * + * @param u the URL to be converted; a null value results in a NullPointerException + * @return the URL as String, without the ref part + */ + public static String toExternalFormNoRef(URL u) + { + String protocol = u.getProtocol(); + String authority = u.getAuthority(); + String file = u.getFile(); + + StringBuffer result = new StringBuffer( + (protocol == null ? 0 : protocol.length()) + + (authority == null ? 0 : authority.length()) + + (file == null ? 1 : file.length()) + 3 /* 3 = room for the colon and the two slashes; the 1 used for a null file leaves room for the fallback slash */ + ); + + result.append(protocol); + result.append(":"); + if (u.getAuthority() != null && u.getAuthority().length() > 0) + { + result.append("//"); + result.append(u.getAuthority()); + } + if (u.getFile() != null && u.getFile().length() > 0) + { + result.append(u.getFile()); + } + else + { + result.append("/"); + } + + return result.toString(); + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java new file mode 100644 index 00000000000..e07b63ff58e --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/UnderflowException.java @@ -0,0 +1,15 @@ +package de.lanlab.larm.util; + +/** + * Title: LARM + * Description: unchecked exception (it extends RuntimeException) that declares no members of its own; + * presumably thrown when an element is requested from an empty container - TODO confirm against the callers + * Copyright: Copyright (c) 2001 + * Company: LMU-IP + * @author Clemens Marschner + * @version 1.0 + */ + + +public class UnderflowException extends RuntimeException +{ +} diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java new file mode 100644 index 00000000000..3287fd51f6b --- /dev/null +++ 
b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/util/WebDocument.java @@ -0,0 +1,94 @@ +package de.lanlab.larm.util; + + +import java.net.URL; +import de.lanlab.larm.fetcher.URLMessage; + +/** + * a web document of whatever type. generated by a fetcher task + */ +public class WebDocument extends URLMessage +{ + protected String mimeType; + protected byte[] document; + protected int resultCode; + protected int size; + protected String title; + + public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title) + { + super(url, referer, false); + this.url = url; + this.mimeType = mimeType; + this.document = document; + this.resultCode = resultCode; + this.size = size; + this.title = title; + } + + public String getTitle() + { + return title; + } + + public URL getUrl() + { + return url; + } + + public int getSize() + { + return this.size; + } + + public void setSize(int size) + { + this.size = size; + } + + + public void setDocument(byte[] document) + { + this.document = document; + } + public int getResultCode() + { + return resultCode; + } + + public void setResultCode(int resultCode) + { + this.resultCode = resultCode; + } + + public byte[] getDocumentBytes() + { + return this.document; + } + + public void setUrl(URL url) + { + this.url = url; + } + + public void setMimeType(String mimeType) + { + this.mimeType = mimeType; + } + + public String getMimeType() + { + return mimeType; + } + + public String getInfo() + { + return super.getInfo() + "\t" + + this.resultCode + "\t" + + this.mimeType + "\t" + + this.size + "\t" + + "\"" + this.title.replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\""; + } + + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java b/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java new file mode 100644 index 00000000000..73387d14ec4 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/misc/ByteArray.java 
/**
 * This class is a container for algorithms working on byte arrays and byte
 * streams - some of the algorithms are analogous to those in java.lang.String.
 * <p>
 * Wherever characters and bytes are compared or converted, a byte is treated
 * as one unsigned 8-bit character (values 0-255).
 *
 * @author Anders Kristensen
 */
public class ByteArray {

    /** Size in bytes of the scratch buffers used when reading and copying streams. */
    private static final int CHUNK_SIZE = 1024;

    /**
     * Returns copy of characters in s as a new byte array. Only the low-order
     * eight bits of every character are kept.
     *
     * @param s the string to convert
     * @return a new array with one byte per character of s
     */
    public static final byte[] getBytes(String s) {
        int len = s.length();
        byte[] b = new byte[len];
        for (int i = 0; i < len; i++) {
            // same truncation the deprecated String.getBytes(int,int,byte[],int) performs
            b[i] = (byte) s.charAt(i);
        }
        return b;
    }

    /**
     * Returns contents of file as byte array.
     *
     * @param filename path of the file to read
     * @return the complete file contents
     * @throws IOException if the file cannot be opened or read completely
     */
    public static byte[] loadFromFile(String filename) throws IOException {
        return loadFromFile(new File(filename));
    }

    /**
     * Returns contents of file as byte array.
     *
     * @param file the file to read
     * @return the complete file contents
     * @throws IOException if the file cannot be opened, is larger than an
     *         array can hold, or ends before the expected number of bytes
     *         was read
     */
    public static byte[] loadFromFile(File file) throws IOException {
        long size = file.length();
        if (size > Integer.MAX_VALUE) {
            throw new IOException("File too large to load into a byte array: " + file);
        }
        int len = (int) size;
        byte[] content = new byte[len];

        FileInputStream fin = new FileInputStream(file);
        try {
            int nread = 0;
            while (nread < len) {
                int n = fin.read(content, nread, len - nread);
                if (n == -1) {
                    throw new IOException("Unexpected end of file while loading " + file);
                }
                nread += n;
            }
        } finally {
            // release the file handle on success and on failure alike
            fin.close();
        }

        return content;
    }

    /**
     * Reads n bytes from the specified input stream. It will return
     * fewer bytes if fewer bytes are available on the stream.
     * Hence the application should check the resulting arrays length.
     * The stream is not closed.
     *
     * @param in the stream to read from
     * @param n  the maximum number of bytes to read
     * @return an array holding the bytes read; its length is n, or less if
     *         the end of the stream was reached first
     * @throws IOException if reading fails
     */
    public static byte[] readn(InputStream in, int n) throws IOException {
        byte[] buf = new byte[n];
        int ntotal = 0;

        while (ntotal < n) {
            int nread = in.read(buf, ntotal, n - ntotal);
            if (nread < 0) {
                // we got less than expected - return what we got
                return prefix(buf, ntotal);
            }
            ntotal += nread;
        }
        return buf;
    }

    /**
     * Return contents of a WWW resource identified by a URL. The stream
     * opened on the resource is closed before this method returns.
     *
     * @param url the resource to retrieve
     * @return the resource contents as a byte array
     * @throws IOException if the resource cannot be opened or read
     */
    public static byte[] getContent(URL url) throws IOException {
        URLConnection conn = url.openConnection();
        InputStream in = conn.getInputStream();
        try {
            /*
             * N.B. URLConnection.getContentLength() is buggy for "http" resources
             * (at least in JDK1.0.2) and won't work for "file" URLs either,
             * hence the fallback to the raw header and finally to reading
             * until the end of the stream.
             */
            int length = conn.getContentLength();
            if (length == -1) {
                length = conn.getHeaderFieldInt("Content-Length", -1);
            }
            if (length == -1) {
                return readAll(in);
            }
            return readn(in, length);
        } finally {
            in.close();
        }
    }

    /**
     * Read all input from an InputStream and return as a byte array.
     * This method will not return before the end of the stream is reached.
     * The stream is not closed.
     *
     * @param in the stream to drain
     * @return contents of the stream
     * @throws IOException if reading fails
     */
    public static byte[] readAll(InputStream in) throws IOException {
        byte[] buf = new byte[CHUNK_SIZE];
        int nread, ntotal = 0;

        while ((nread = in.read(buf, ntotal, buf.length - ntotal)) > -1) {
            ntotal += nread;
            if (ntotal == buf.length) {
                // buffer is full: double it so the next read has room
                byte[] newbuf = new byte[buf.length * 2];
                System.arraycopy(buf, 0, newbuf, 0, buf.length);
                buf = newbuf;
            }
        }
        // we cannot have excess space
        return (ntotal < buf.length) ? prefix(buf, ntotal) : buf;
    }

    /**
     * Copies data from the specified input stream to the output stream
     * until end of file is met. Neither stream is closed.
     *
     * @param in  the stream to read from
     * @param out the stream to write to
     * @return the total number of bytes written to the output stream
     * @throws IOException if reading or writing fails
     */
    public static int cpybytes(InputStream in, OutputStream out)
        throws IOException
    {
        byte[] buf = new byte[CHUNK_SIZE];
        int n, ntotal = 0;
        while ((n = in.read(buf)) > -1) {
            out.write(buf, 0, n);
            ntotal += n;
        }
        return ntotal;
    }

    /**
     * Copies data from the specified input stream to the output stream
     * until n bytes have been copied or end of file is met. Neither stream
     * is closed.
     *
     * @param in  the stream to read from
     * @param out the stream to write to
     * @param n   the maximum number of bytes to copy; nothing is copied if
     *            this is zero or negative
     * @return the total number of bytes written to the output stream
     * @throws IOException if reading or writing fails
     */
    public static int cpybytes(InputStream in, OutputStream out, int n)
        throws IOException
    {
        if (n <= 0) {
            return 0;
        }
        int sz = n < CHUNK_SIZE ? n : CHUNK_SIZE;
        byte[] buf = new byte[sz];
        int nread, ntotal = 0;
        int chunk = sz;

        while (ntotal < n && (nread = in.read(buf, 0, chunk)) > -1) {
            out.write(buf, 0, nread);
            ntotal += nread;
            // never ask for more than is still wanted
            chunk = (n - ntotal < sz) ? n - ntotal : sz;
        }
        return ntotal;
    }

    /**
     * Returns the index within the buffer of the first occurrence of the
     * specified character or -1 if the character is not found.
     *
     * @param buf the buffer to search
     * @param ch  the character to search for
     * @return the index of the first match, or -1
     */
    public static final int indexOf(byte[] buf,
                                    int ch) {
        return indexOf(buf, ch, 0, buf.length);
    }

    /**
     * Returns the index within the buffer of the first occurrence of the
     * specified character, starting the search at fromIndex. This method
     * returns -1 if the character is not found.
     * <p>
     * The character may be given either as a signed byte value (-128..127)
     * or as an unsigned one (0..255); any other value cannot occur in a
     * byte array and yields -1.
     *
     * @param buf the buffer to search
     * @param ch the character to search for
     * @param fromIndex the index to start the search from
     * @param toIndex the highest possible index returned plus 1
     * @return the index of the first match, or -1
     */
    public static final int indexOf(byte[] buf,
                                    int ch,
                                    int fromIndex,
                                    int toIndex) {
        if (ch < Byte.MIN_VALUE || ch > 0xff) {
            return -1;
        }
        // compare as bytes so that 0x80..0xff match the sign-extended array elements
        byte target = (byte) ch;

        for (int i = fromIndex; i < toIndex; i++) {
            if (buf[i] == target) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Returns the index of the first occurrence of s in the specified
     * buffer or -1 if this is not found.
     *
     * @param buf the buffer to search
     * @param s   the text to search for
     * @return the index of the first match, or -1
     */
    public static final int indexOf(byte[] buf, String s) {
        return indexOf(buf, s, 0);
    }

    /**
     * Returns the index of the first occurrence of s in the specified
     * buffer. The search starts from fromIndex. This method returns -1
     * if s is not found.
     * <p>
     * Each byte is compared as an unsigned value with the corresponding
     * character, so characters above 255 never match.
     *
     * @param buf       the buffer to search
     * @param s         the text to search for
     * @param fromIndex the index to start the search from
     * @return the index of the first match, or -1
     */
    public static final int indexOf(byte[] buf, String s, int fromIndex) {
        int max_i = buf.length;
        int max_j = s.length();

        for (int i = fromIndex; i + max_j <= max_i; i++) {
            int j = 0;
            while (j < max_j && (buf[i + j] & 0xff) == s.charAt(j)) {
                j++;
            }
            if (j == max_j) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether ch is a blank, tab, line feed or carriage return.
     *
     * @param ch the character to test
     * @return true if ch is one of the four whitespace characters
     */
    public static final boolean isSpace(int ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    /**
     * Skips over whitespace (as defined by {@link #isSpace(int)}).
     *
     * @param buf       the buffer to scan
     * @param fromIndex the index to start at
     * @param toIndex   the index to stop at (exclusive)
     * @return the index of the first non-whitespace byte at or after
     *         fromIndex, or toIndex if there is none
     */
    public static final int skipSpaces(byte[] buf, int fromIndex, int toIndex) {
        int i = fromIndex;
        while (i < toIndex && isSpace(buf[i])) {
            i++;
        }
        return i;
    }

    /**
     * Find byte pattern ptrn in the region of buf that starts at off and is
     * len bytes long.
     * <p>
     * NOTE: if the region ends in the middle of a match - that is, its last
     * bytes equal a proper prefix of ptrn - the index where that partial
     * match starts is returned as well. Callers that need a complete match
     * must check that the returned index plus ptrn.length does not exceed
     * off + len.
     *
     * @param buf  the buffer to search
     * @param off  index of the first byte of the region
     * @param len  length of the region
     * @param ptrn the pattern to look for; must not be empty
     * @return index of first occurrence of ptrn in buf (or of a partial
     *         match at the end of the region), -1 if no occurrence
     * @throws IllegalArgumentException if ptrn is empty
     */
    public static final int findBytes(byte buf[],
                                      int off,
                                      int len,
                                      byte ptrn[]) {
        if (ptrn.length == 0) {
            throw new IllegalArgumentException("pattern must not be empty");
        }

        int buf_len = off + len;
        int ptrn_len = ptrn.length;
        int i = off;    // index into buf

        while (i < buf_len) {
            int j = 0;  // index into ptrn
            while (i < buf_len && j < ptrn_len && buf[i] == ptrn[j]) {
                i++;
                j++;
            }
            if (i == buf_len || j == ptrn_len) {
                // complete match, or partial match cut off by the end of the region
                return i - j;
            }
            // We have to go back a bit as there may be an overlapping
            // match starting a bit later in buf...
            i = i - j + 1;
        }
        return -1;
    }

    /** Returns a new array holding the first len bytes of buf. */
    private static byte[] prefix(byte[] buf, int len) {
        byte[] copy = new byte[len];
        System.arraycopy(buf, 0, copy, 0, len);
        return copy;
    }
}
mode 100644 index 00000000000..17d54913e37 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Comment.java @@ -0,0 +1,13 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Represents the content of comments: <!-- ... --> + */ +public interface Comment extends Node { + public String getData(); + public void setData(String arg); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java new file mode 100644 index 00000000000..75608773cc0 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DOM.java @@ -0,0 +1,13 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Factory for {@link Document} objects of a given type that can also be asked whether a named feature is supported. + */ +public interface DOM { + public Document createDocument(String type); + public boolean hasFeature(String feature); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java new file mode 100644 index 00000000000..7c71b5e18e6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Document.java @@ -0,0 +1,28 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * A document: gives access to its document type, document element and context info, and creates elements, text nodes, comments, processing instructions, attributes and attribute lists. + */ +public interface Document extends DocumentFragment { + public Node getDocumentType(); + public void setDocumentType(Node arg); + + public Element getDocumentElement(); + public void setDocumentElement(Element arg); + + public DocumentContext getContextInfo(); + public void setContextInfo(DocumentContext arg); + + public DocumentContext createDocumentContext(); + public Element createElement(String tagName, AttributeList attributes); + public Text createTextNode(String data); + public Comment createComment(String data); + public PI createPI(String name, String data); + public Attribute createAttribute(String name, Node value); + public AttributeList createAttributeList(); + public NodeIterator getElementsByTagName(); +} diff --git
a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java new file mode 100644 index 00000000000..508c6292249 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentContext.java @@ -0,0 +1,14 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Context object that holds a settable reference to a {@link Document}. + */ +public interface DocumentContext { + + public Document getDocument(); + public void setDocument(Document arg); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java new file mode 100644 index 00000000000..3cae0af68ed --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/DocumentFragment.java @@ -0,0 +1,13 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * A {@link Node} that carries a settable reference to its master {@link Document}. + */ +public interface DocumentFragment extends Node { + public Document getMasterDoc(); + public void setMasterDoc(Document arg); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java new file mode 100644 index 00000000000..8240ffa5e98 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Element.java @@ -0,0 +1,16 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * An element node: a {@link Node} with a tag name and an {@link AttributeList}. + */ +public interface Element extends Node { + public String getTagName(); + public AttributeList attributes(); + public void setAttribute(Attribute newAttr); + public void normalize(); + public NodeIterator getElementsByTagName(); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile new file mode 100644 index 00000000000..946af9eb603 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Makefile @@ -0,0 +1,38 @@ +# This Makefile generated
by hplb.util.jmkmf +# Java package is org.w3c.dom + +.SUFFIXES: .java .class .jj +JPACKAGE = org.w3c.dom +JAVA = java +JAVAC = javac +JAVACC = java COM.sun.labs.javacc.Main +JFLAGS = +OBJS = \ + Attribute.class \ + AttributeList.class \ + Comment.class \ + DOM.class \ + Document.class \ + DocumentContext.class \ + DocumentFragment.class \ + Element.class \ + Node.class \ + NodeIterator.class \ + PI.class \ + Text.class \ + TreeIterator.class +JAVADOCFLAGS = -d ../../../doc/api -author -noindex -notree + +all: $(OBJS) + +doc: + javadoc $(JAVADOCFLAGS) $(JPACKAGE) + +.jj.java: $*.jj + $(JAVACC) $< + +.java.class: $*.java + $(JAVAC) $(JFLAGS) $< + +clean: + rm -f *.class *~ diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java new file mode 100644 index 00000000000..7587fce2830 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Node.java @@ -0,0 +1,29 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Base interface of all tree nodes: declares the node-type constants, navigation to parent, children and siblings, and insertion, replacement and removal of child nodes. + */ +public interface Node { + // NodeType + public static final int DOCUMENT = 1; + public static final int ELEMENT = 2; + public static final int ATTRIBUTE = 3; + public static final int PI = 4; + public static final int COMMENT = 5; + public static final int TEXT = 6; + + public int getNodeType(); + public Node getParentNode(); + public NodeIterator getChildNodes(); + public boolean hasChildNodes(); + public Node getFirstChild(); + public Node getPreviousSibling(); + public Node getNextSibling(); + public Node insertBefore(Node newChild, Node refChild); + public Node replaceChild(Node newChild, Node oldChild); + public Node removeChild(Node oldChild); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java new file mode 100644 index 00000000000..9194fb74d31 --- /dev/null +++
b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/NodeIterator.java @@ -0,0 +1,19 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Cursor over a sequence of {@link Node}s; the to* methods each return a Node (presumably the one at the new position - confirm against the implementation). + */ +public interface NodeIterator { + public int getLength(); + public Node getCurrent(); + public Node toNext(); + public Node toPrevious(); + public Node toFirst(); + public Node toLast(); + public Node toNth(int Nth); + public Node toNode(Node destNode); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java new file mode 100644 index 00000000000..af63d9f94d6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/PI.java @@ -0,0 +1,16 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * Processing Instruction + */ +public interface PI extends Node { + public String getName(); + public void setName(String arg); + + public String getData(); + public void setData(String arg); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java new file mode 100644 index 00000000000..2490c9ecabe --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/Text.java @@ -0,0 +1,19 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * A text node: its character data can be read and set as a whole, and edited in place by offset and count. + */ +public interface Text extends Node { + public String getData(); + public void setData(String arg); + + public void append(String data); + public void insert(int offset, String data); + public void delete(int offset, int count); + public void replace(int offset, int count, String data); + public void splice(Element element, int offset, int count); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java new file mode 100644 index 00000000000..bdb2339c286 --- /dev/null +++
b/sandbox/contributions/webcrawler-LARM/src/hplb/org/w3c/dom/TreeIterator.java @@ -0,0 +1,20 @@ +/* + * $Id$ + */ + +package hplb.org.w3c.dom; + +/** + * A {@link NodeIterator} with additional moves to the parent, to children and to siblings, plus counts of children and siblings. + */ +public interface TreeIterator extends NodeIterator { + public int numChildren(); + public int numPreviousSiblings(); + public int numNextSiblings(); + public Node toParent(); + public Node toPreviousSibling(); + public Node toNextSibling(); + public Node toFirstChild(); + public Node toLastChild(); + public Node toNthChild(); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java new file mode 100644 index 00000000000..ef71ebaccd6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/AttributeMap.java @@ -0,0 +1,146 @@ +// $Id$ + +package hplb.org.xml.sax; + +import java.util.Enumeration; + +/** + * A map of attributes for the current element. + *

This interface is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warranty, real + * or implied.

+ *

This map will be valid only during the invocation of the + * startElement callback: if you need to use attribute + * information elsewhere, you will need to make your own copies.

+ * @author David Megginson, Microstar Software Ltd. + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ +public interface AttributeMap { + + + /** + * Find the names of all available attributes for an element. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return An enumeration of zero or more Strings. + * @see java.util.Enumeration + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public Enumeration getAttributeNames (); + + + /** + * Get the value of an attribute as a String. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The value as a String, or null if the attribute has no value. + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public String getValue (String attributeName); + + + /** + * Check if an attribute value is the name of an entity. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return true if the attribute is an entity name. + * @see #getEntityPublicID + * @see #getEntitySystemID + * @see #getNotationName + * @see #getNotationPublicID + * @see #getNotationSystemID + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public boolean isEntity (String aname); + + + /** + * Check if an attribute value is the name of a notation. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return true if the attribute is a notation name. + * @see #getNotationPublicID + * @see #getNotationSystemID + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public boolean isNotation (String aname); + + + /** + * Check if an attribute value is a unique identifier. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return true if the attribute is a unique identifier. + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public boolean isId (String aname); + + + /** + * Check if an attribute value is a reference to an ID. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return true if the attribute is a reference to an ID. + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public boolean isIdref (String aname); + + + /** + * Get the public identifier for an ENTITY attribute. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The public identifier or null if there is none (or if + * the attribute value is not an entity name) + * @see #isEntity + */ + public String getEntityPublicID (String aname); + + + /** + * Get the system identifier for an ENTITY attribute. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The system identifier or null if there is none (or if + * the attribute value is not an entity name) + * @see #isEntity + */ + public String getEntitySystemID (String aname); + + + /** + * Get the notation name for an ENTITY attribute. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The notation name or null if there is none (or if + * the attribute value is not an entity name) + * @see #isEntity + */ + public String getNotationName (String aname); + + + /** + * Get the notation public ID for an ENTITY or NOTATION attribute. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The public identifier or null if there is none (or if + * the attribute value is not an entity or notation name) + * @see #isEntity + * @see #isNotation + */ + public String getNotationPublicID (String aname); + + + /** + * Get the notation system ID for an ENTITY or NOTATION attribute. + *

This applies to the current element, and can be called only + * during an invocation of startElement.

+ * @return The system identifier or null if there is none (or if + * the attribute value is not an entity or notation name) + * @see #isEntity + * @see #isNotation + */ + public String getNotationSystemID (String aname); + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/DocumentHandler.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/DocumentHandler.java new file mode 100644 index 00000000000..13b83ec3eb1 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/DocumentHandler.java @@ -0,0 +1,129 @@ +// $Id$ + +package hplb.org.xml.sax; + + +/** + * A callback interface for basic XML document events. + *

This interface is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warranty, real + * or implied.

+ *

This is the main handler for basic document events; it provides + * information on roughly the same level as the ESIS in full SGML, + * concentrating on logical structure rather than lexical + * representation.

+ *

If you do not set a document handler, then by default all of these + * events will simply be ignored.

+ * @author David Megginson, Microstar Software Ltd. + * @see hplb.org.xml.sax.Parser#setDocumentHandler + */ +public interface DocumentHandler { + + + /** + * Handle the start of a document. + *

This is the first event called by a + * SAX-conformant parser, so you can use it to allocate and + * initialise new objects for the document.

+ * @exception java.lang.Exception You may throw any exception. + */ + public void startDocument () + throws Exception; + + + /** + * Handle the end of a document. + *

This is the last event called by a + * SAX-conformant parser, so you can use it to finalize and + * clean up objects for the document.

+ * @exception java.lang.Exception You may throw any exception. + */ + public void endDocument () + throws Exception; + + + /** + * Handle the document type declaration. + *

This will appear only if the XML document contains a + * DOCTYPE declaration.

+ * @param name The document type name. + * @param publicID The public identifier of the external DTD subset + * (if any), or null. + * @param systemID The system identifier of the external DTD subset + * (if any), or null. + * @param name The document type name. + * @exception java.lang.Exception You may throw any exception. + */ + public void doctype (String name, String publicID, String systemID) + throws Exception; + + + /** + * Handle the start of an element. + *

Please note that the information in the attributes + * parameter will be accurate only for the duration of this handler: + * if you need to use the information elsewhere, you should copy + * it.

+ * @param name The element type name. + * @param attributes The available attributes. + * @exception java.lang.Exception You may throw any exception. + */ + public void startElement (String name, AttributeMap attributes) + throws Exception; + + + /** + * Handle the end of an element. + * @exception java.lang.Exception You may throw any exception. + */ + public void endElement (String name) + throws Exception; + + + /** + * Handle significant character data. + *

Please note that the contents of the array will be + * accurate only for the duration of this handler: if you need to + * use them elsewhere, you should make your own copy, possible + * by constructing a string:

+ *
+    * String data = new String(ch, start, length);
+    * 
+ * @param ch An array of characters. + * @param start The starting position in the array. + * @param length The number of characters to use in the array. + * @exception java.lang.Exception You may throw any exception. + */ + public void characters (char ch[], int start, int length) + throws Exception; + + + /** + * Handle ignorable whitespace. + *

Please note that the contents of the array will be + * accurate only for the duration of this handler: if you need to + * use them elsewhere, you should make your own copy, possible + * by constructing a string:

+ *
+    * String whitespace = new String(ch, start, length);
+    * 
+ * @param ch An array of whitespace characters. + * @param start The starting position in the array. + * @param length The number of characters to use in the array. + * @exception java.lang.Exception You may throw any exception. + */ + public void ignorable (char ch[], int start, int length) + throws Exception; + + + /** + * Handle a processing instruction. + *

XML processing instructions have two parts: a target, which + * is a name, followed optionally by data.

+ * @exception java.lang.Exception You may throw any exception. + */ + public void processingInstruction (String name, String remainder) + throws Exception; + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/EntityHandler.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/EntityHandler.java new file mode 100644 index 00000000000..93faa3dcf73 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/EntityHandler.java @@ -0,0 +1,48 @@ +// $Id$ + +package hplb.org.xml.sax; + + +/** + * A callback interface for basic XML entity-related events. + *

This interface is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warranty, real + * or implied.

+ *

If you do not set an entity handler, then a parser will + * resolve all entities to the suggested system ID, and will take no + * action for entity changes.

+ * @author David Megginson, Microstar Software Ltd. + * @see hplb.org.xml.sax.Parser#setEntityHandler + */ +public interface EntityHandler { + + + /** + * Resolve a system identifier. + *

Before loading any entity (including the document entity), + * SAX parsers will filter the system identifier through this + * callback, and you can return a different system identifier if you + * wish, or null to prevent the parser from reading any entity.

+ * @param ename The name of the entity, "[document]" for the + * document entity, or "[external DTD]" for the external + * DTD subset. + * @param publicID The public identifier, or null if there is none. + * @param systemID The system identifier suggested in the XML document. + * @return A system identifier, or null to skip the entity. + * @exception java.lang.Exception You may throw any exception. + */ + public String resolveEntity (String ename, String publicID, String systemID) + throws Exception; + + /** + * Handle a change in the current entity. + *

Whenever the parser switches the entity (URI) that it is reading + * from, it will call this handler to report the change.

+ * @param systemID The URI of the new entity. + * @exception java.lang.Exception You may throw any exception. + */ + public void changeEntity (String systemID) + throws Exception; + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/ErrorHandler.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/ErrorHandler.java new file mode 100644 index 00000000000..4c8397029ee --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/ErrorHandler.java @@ -0,0 +1,52 @@ +// $Id$ + +package hplb.org.xml.sax; + + +/** + * A callback interface for basic XML error events. + *

This interface is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warranty, real + * or implied.

+ *

If you do not set an error handler, then a parser will report + * warnings to System.err, and will throw an (unspecified) + * exception for fatal errors.

+ * @author David Megginson, Microstar Software Ltd. + * @see hplb.org.xml.sax.Parser#setErrorHandler + */ +public interface ErrorHandler { + + /** + * Handle a non-fatal warning. + *

A SAX parser will use this callback to report a condition + * that is not serious enough to stop the parse (though you may + * still stop the parse if you wish).

+ * @param message The warning message. + * @param systemID The URI of the entity that caused the warning, or + * null if not available. + * @param line The line number in the entity, or -1 if not available. + * @param column The column number in the entity, or -1 if not available. + * @exception java.lang.Exception You may throw any exception. + */ + public void warning (String message, String systemID, int line, int column) + throws java.lang.Exception; + + /** + * Handle a fatal error. + *

A SAX parser will use this callback to report a condition + * that is serious enough to invalidate the parse, and may not + * report all (or any) significant parse events after this. Ordinarily, + * you should stop immediately with an exception, but you can continue + * to try to collect more errors if you wish.

+ * @param message The error message. + * @param systemID The URI of the entity that caused the error, or + * null if not available. + * @param line The line number in the entity, or -1 if not available. + * @param column The column number in the entity, or -1 if not available. + * @exception java.lang.Exception You may throw any exception. + */ + public void fatal (String message, String systemID, int line, int column) + throws Exception; + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/HandlerBase.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/HandlerBase.java new file mode 100644 index 00000000000..1bf3f2a1099 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/HandlerBase.java @@ -0,0 +1,201 @@ +// $Id$ + +package hplb.org.xml.sax; + + +/** + * A simple base class for deriving SAX event handlers. + *

This class is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warrantee, real + * or implied.

+ *

This class implements the default behaviour when no handler + * is specified (though parsers are not actually required to use + * this class).

+ * @author David Megginson, Microstar Software Ltd. + * @see hplb.org.xml.sax.XmlException + * @see hplb.org.xml.sax.EntityHandler + * @see hplb.org.xml.sax.DocumentHandler + * @see hplb.org.xml.sax.ErrorHandler + */ +public class HandlerBase + implements EntityHandler, DocumentHandler, ErrorHandler +{ + + + ////////////////////////////////////////////////////////////////////// + // Implementation of hplb.org.xml.sax.EntityHandler. + ////////////////////////////////////////////////////////////////////// + + /** + * Resolve an external entity. + *

By default, simply return the system ID supplied.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.EntityHandler#resolveEntity + */ + public String resolveEntity (String ename, String publicID, String systemID) + throws Exception + { + return systemID; + } + + + /** + * Handle an entity-change event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.EntityHandler#changeEntity + */ + public void changeEntity (String systemID) + throws Exception + { + } + + + + ////////////////////////////////////////////////////////////////////// + // Implementation of hplb.org.xml.sax.DocumentHandler. + ////////////////////////////////////////////////////////////////////// + + + /** + * Handle a start document event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#startDocument + */ + public void startDocument () + throws Exception + {} + + + /** + * Handle an end document event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#endDocument + */ + public void endDocument () + throws Exception + {} + + + /** + * Handle a document type declaration event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#doctype + */ + public void doctype (String name, String publicID, String systemID) + throws Exception + {} + + + /** + * Handle a start element event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#startElement + */ + public void startElement (String name, AttributeMap attributes) + throws Exception + {} + + + /** + * Handle an end element event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#endElement + */ + public void endElement (String name) + throws Exception + {} + + + /** + * Handle a character data event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#characters + */ + public void characters (char ch[], int start, int length) + throws Exception + {} + + + /** + * Handle an ignorable whitespace event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#ignorable + */ + public void ignorable (char ch[], int start, int length) + throws Exception + {} + + + /** + * Handle a processing instruction event. + *

By default, do nothing.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.DocumentHandler#processingInstruction + */ + public void processingInstruction (String name, String remainder) + throws Exception + {} + + + + ////////////////////////////////////////////////////////////////////// + // Implementation of ErrorHandler. + ////////////////////////////////////////////////////////////////////// + + + /** + * Handle a non-fatal warning. + *

By default, report the warning to System.err.

+ * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.ErrorHandler#warning + */ + public void warning (String message, String systemID, int line, int column) + throws Exception + { + System.err.println("Warning (" + + systemID + + ',' + + line + + ',' + + column + + "): " + + message); + } + + + /** + * Handle a fatal error. + *

By default, throw an instance of XmlException that carries the message, system ID, line and column passed in.

+ * @exception hplb.org.xml.sax.XmlException A fatal parsing error + * has been found. + * @exception java.lang.Exception When you override this method, + * you may throw any exception. + * @see hplb.org.xml.sax.ErrorHandler#fatal + */ + public void fatal (String message, String systemID, int line, int column) + throws XmlException, Exception + { + throw new XmlException(message, systemID, line, column); + } + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Makefile b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Makefile new file mode 100644 index 00000000000..e2ad29c1f0f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Makefile @@ -0,0 +1,32 @@ +# This Makefile generated by jmkmf +# Java package is org.xml.sax + +.SUFFIXES: .java .class .jj +JPACKAGE = org.xml.sax +JAVA = java +JAVAC = javac +JAVACC = java COM.sun.labs.javacc.Main +JFLAGS = +OBJS = \ + AttributeMap.class \ + DocumentHandler.class \ + EntityHandler.class \ + ErrorHandler.class \ + HandlerBase.class \ + Parser.class \ + XmlException.class +JAVADOCFLAGS = -d ../../../doc/api -author -noindex -notree + +all: $(OBJS) + +doc: + javadoc $(JAVADOCFLAGS) $(JPACKAGE) + +.jj.java: org.xml.sax.jj + $(JAVACC) $< + +.java.class: $*.java + $(JAVAC) $(JFLAGS) $< + +clean: + rm -f *.class *~ diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Parser.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Parser.java new file mode 100644 index 00000000000..3033ef9d37f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/Parser.java @@ -0,0 +1,71 @@ +// $Id$ + +package hplb.org.xml.sax; + + +/** + * A standard interface for event-driven XML parsers. + *

This interface is part of the Java implementation of SAX, + * the Simple API for XML. It is free for both commercial and + * non-commercial use, and is distributed with no warrantee, real + * or implied.

+ *

All SAX-conformant XML parsers (or their front-end SAX drivers) + * must implement this interface, together with a zero-argument + * constructor.

+ *

You can plug three different kinds of callback interfaces into + * a basic SAX parser: one for entity handling, one for basic document + * events, and one for error reporting. It is not an error to start + * a parse without setting any handlers.

+ * @author David Megginson, Microstar Software Ltd. + */ +public interface Parser { + + + /** + * Register the handler for basic entity events. + *

If you begin a parse without setting an entity handler, + * the parser will by default resolve all entities to their + * default system IDs.

+ * @param handler An object to receive callbacks for events. + * @see hplb.org.xml.sax.EntityHandler + */ + public void setEntityHandler (EntityHandler handler); + + + /** + * Register the handler for basic document events. + *

You may begin the parse without setting a handler, but + * in that case no document events will be reported.

+ * @param handler An object to receive callbacks for events. + * @see hplb.org.xml.sax.DocumentHandler + */ + public void setDocumentHandler (DocumentHandler handler); + + + /** + * Register the handler for errors and warnings. + *

If you begin a parse without setting an error handler, + * warnings will be printed to System.err, and errors will + * throw an unspecified exception.

+ * @param handler An object to receive callbacks for errors. + * @see hplb.org.xml.sax.ErrorHandler + */ + public void setErrorHandler (ErrorHandler handler); + + + /** + * Parse an XML document. + *

Nothing exciting will happen unless you have set handlers.

+ * @param publicID The public identifier for the document, or null + * if none is available. + * @param systemID The system identifier (URI) for the document. + * @exception java.lang.Exception This method may throw any exception, + * but the parser itself + * will throw only exceptions derived from java.io.IOException; + * anything else will come from your handlers. + * @see #setEntityHandler + * @see #setDocumentHandler + * @see #setErrorHandler + */ + void parse (String publicID, String systemID) throws java.lang.Exception; +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/XmlException.java b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/XmlException.java new file mode 100644 index 00000000000..f7d4c244fd4 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/org/xml/sax/XmlException.java @@ -0,0 +1,73 @@ +// $Id$ + +package hplb.org.xml.sax;  + + +/** + * An exception for reporting XML parsing errors. + *

/**
 * A checked exception that reports an XML parsing error together with the
 * location (entity URI, line and column) it was constructed with.
 *
 * <p>This class is part of the Java implementation of SAX, the Simple API
 * for XML. It is free for both commercial and non-commercial use, and is
 * distributed with no warranty, real or implied.
 *
 * <p>This exception is not a required part of SAX and none of the core
 * interfaces refer to it; the optional HandlerBase class throws it from
 * its default fatal-error handler.
 *
 * @author David Megginson, Microstar Software Ltd.
 * @see hplb.org.xml.sax.HandlerBase#fatal
 */
public class XmlException extends Exception {

    //
    // Internal state: the location handed to the constructor.
    //

    /** System identifier (URI) of the offending entity; may be null. */
    private String systemID;

    /** Line number of the error; callers pass -1 when it is unknown. */
    private int line;

    /** Column number of the error; callers pass -1 when it is unknown. */
    private int column;

    /**
     * Construct a new exception with information about the location.
     *
     * @param message  the error message, available through getMessage()
     * @param systemID the URI of the entity in which the error occurred, or null
     * @param line     the line number, or -1 if none is available
     * @param column   the column number, or -1 if none is available
     */
    public XmlException (String message, String systemID, int line, int column) {
        super(message);
        this.column = column;
        this.line = line;
        this.systemID = systemID;
    }

    /**
     * Find the system identifier (URI) where the error occurred.
     *
     * @return the URI given at construction, or null if none was supplied
     */
    public String getSystemID () {
        return this.systemID;
    }

    /**
     * Find the line number where the error occurred.
     *
     * @return the line number given at construction (-1 by convention
     *         when none is available)
     */
    public int getLine () {
        return this.line;
    }

    /**
     * Find the column number (line offset) where the error occurred.
     *
     * @return the column number given at construction (-1 by convention
     *         when none is available)
     */
    public int getColumn () {
        return this.column;
    }
}
This is used to + * "intern" element and attribute names which can then be compared using + * the more efficient reference equality, a la "s1==s2". + * + * @author Anders Kristensen + */ +public final class Atom { + /** Holds atoms: element names (GIs), and attribute names. */ + private static final Hashtable atoms = new Hashtable(); + + /** + * Return an atom corresponding to the argument. + */ + public static String getAtom(String s) { + synchronized (atoms) { + String a = (String) atoms.get(s); + if (a == null) { + atoms.put(s, s); + a = s; + } + return a; + } + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrImpl.java new file mode 100644 index 00000000000..4e84b2fb10c --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrImpl.java @@ -0,0 +1,57 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. 
+ */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * + * @author Anders Kristensen + */ +public final class AttrImpl implements Attribute { + protected String name; + protected Node value; + protected boolean specified; + + public AttrImpl(String name, String value) { + this(name, new TextImpl(Node.TEXT, value), true); + } + + public AttrImpl(String name, Node value, boolean specified) { + this.name = name; + this.value = value; + this.specified = specified; + } + + public String getName() { + return name; + } + + public Node getValue() { + return value; + } + + public void setValue(Node arg) { + value = arg; + } + + public boolean getSpecified() { + return specified; + } + + public void setSpecified(boolean arg) { + specified = arg; + } + + public String toString() { + return value.toString(); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrListImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrListImpl.java new file mode 100644 index 00000000000..429626703da --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/AttrListImpl.java @@ -0,0 +1,183 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * An ordered Dictionary. keys() and elements() returns Enumerations + * which enumerate over elements in the order they were inserted. + * Elements are stored linearly. Operations put(), get(), and remove() + * are linear in the number of elements in the Dictionary. + * + *

Allows direct access to elements (as an alternative to using + * Enumerators) for speed. + * + *

Can function as a bag, i.e. it can be created with a mode + * which allows the same key to map to multiple entries. In this case + * operations get() and remove() operate on the first pair in + * the map. Hence to get hold of all values associated with a key it is + * necessary to use the direct access to underlying arrays. + * + * @author Anders Kristensen + */ +public class AttrListImpl implements AttributeList { + protected Attribute[] elms; + + /** + * Number of elements. The elements are held at indices 0 to n in elms. + */ + protected int n = 0; + + public AttrListImpl() { + this(2); + } + + /** + * Create an AttrListImpl with the specififed initial capacity. + */ + public AttrListImpl(int size) { + if (size <= 0) throw new IllegalArgumentException( + "Initial size must be at least 1"); + elms = new Attribute[size]; + } + + /** + * Returns the value to which the key is mapped in this dictionary. + */ + public synchronized Attribute getAttribute(String attrName) { + int i = getIndex(attrName); + return (i < 0 ? null : elms[i]); + } + + protected int getIndex(String name) { + for (int i = 0; i < n; i++) { + if (elms[i].getName().equals(name)) { + return i; + } + } + return -1; + } + + // XXX: what if attrName != attr.getName()??? 
+ public synchronized Attribute setAttribute(Attribute attr) { + int i = getIndex(attr.getName()); + if (i >= 0) { + Attribute old = elms[i]; + elms[i] = attr; + return old; + } + + int len = elms.length; + if (len == n) { + // double size of key,elms arrays + AttrImpl[] e; + e = new AttrImpl[len * 2]; + System.arraycopy(elms, 0, e, 0, len); + elms = e; + } + elms[n] = attr; + n++; + return null; + } + + public synchronized Attribute remove(String attrName) { + int i = getIndex(attrName); + if (i < 0) return null; + Attribute val = elms[i]; + System.arraycopy(elms, i+1, elms, i, n-i-1); + n--; + return val; + } + + public synchronized Attribute item(int index) { + if (index < 0 || index >= n) { + throw new IndexOutOfBoundsException(""+index); + } + return elms[index]; + } + + /** Returns the number of keys in this dictionary. */ + public synchronized int getLength() { + return n; + } + + public synchronized String toString() { + StringBuffer sb = new StringBuffer(); + boolean f = true; + int n = getLength(); + + sb.append("{ "); + for (int i = 0; i < n; i++) { + if (f) { f = false; } + else { sb.append(", "); } + Attribute attr = item(i); + sb.append(attr.getName() + '=' + attr); + } + sb.append(" }"); + return sb.toString(); + } + + /**/ + // for testing + public static void main(String[] args) throws Exception { + AttrListImpl alist; + Attribute attr; + java.io.BufferedReader r; + java.util.StringTokenizer tok; + String op; + + if (args.length > 1) { + alist = new AttrListImpl(Integer.parseInt(args[0])); + } else { + alist = new AttrListImpl(); + } + + System.out.println( + "Enter operations... 
op's are one of\n"+ + "put \n"+ + "get \n"+ + "rem \n"+ + "size\n"+ + "quit\n"); + + r = new java.io.BufferedReader( + new java.io.InputStreamReader(System.in)); + while (true) { + System.out.print("doyourworst> "); + tok = new java.util.StringTokenizer(r.readLine()); + op = tok.nextToken(); + if ("put".equals(op)) { + attr = new AttrImpl(tok.nextToken(), tok.nextToken()); + System.out.println("Value: " + + alist.setAttribute(attr)); + } else if ("get".equals(op)) { + attr = alist.getAttribute(tok.nextToken()); + System.out.println("Value: " + + (attr == null ? "No such element" : attr.toString())); + } else if ("rem".equals(op)) { + attr = alist.remove(tok.nextToken()); + System.out.println("Value: " + attr); + } else if (op.startsWith("s")) { + System.out.println("Size: " + alist.getLength()); + } else if (op.startsWith("q")) { + break; + } else { + System.out.println("Unrecognized op: " + op); + } + + System.out.println("AttributeList: " + alist); + System.out.println("Size: " + alist.getLength()); + System.out.println(); + } + } + //*/ +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/CharBuffer.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/CharBuffer.java new file mode 100644 index 00000000000..5eee304178d --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/CharBuffer.java @@ -0,0 +1,46 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +/** + * A java.io.CharArrayWriter with the additional property that users can get + * to the actual underlying storage. Hence it's very fast (and dangerous). 
+ * @author Anders Kristensen + */ +public final class CharBuffer extends java.io.CharArrayWriter { + /** Creates an empty buffer through the no-argument superclass constructor. */ + public CharBuffer() { + super(); + } + + /** Creates an empty buffer, passing size to the superclass constructor as the initial capacity. */ + public CharBuffer(int size) { + super(size); + } + + /** Truncates the content to size characters; a size that is not below the current count is ignored. */ + // use only to *decrement* size + public void setLength(int size) { + synchronized (lock) { + if (size < count) count = size; + } + } + + /** Returns the backing array itself, not a copy; only its first getLength() characters are content. */ + public char[] getCharArray() { + synchronized (lock) { + return buf; + } + } + + /** Returns the number of characters currently held; this read does not take the lock. */ + public int getLength() + { + return count; + } + + +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DOMImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DOMImpl.java new file mode 100644 index 00000000000..5645c2c1ac7 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DOMImpl.java @@ -0,0 +1,23 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +import hplb.org.w3c.dom.DOM; +import hplb.org.w3c.dom.Document; + +/** + * DOM entry point of this package: hands out new DocumentImpl instances. + */ +public class DOMImpl implements DOM { + /** Returns a new DocumentImpl; the type argument is not used. */ + public Document createDocument(String type) { + return new DocumentImpl(); + } + /** Always returns false, whatever feature is asked for. */ + public boolean hasFeature(String feature) { + return false; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocContextImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocContextImpl.java new file mode 100644 index 00000000000..fc2db0c8eb3 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocContextImpl.java @@ -0,0 +1,25 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT.
+ */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * Minimal DocumentContext: it only stores the Document it is given. + */ +public class DocContextImpl implements DocumentContext { + Document doc; + + public Document getDocument() { + return doc; + } + + public void setDocument(Document arg) { + doc = arg; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocumentImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocumentImpl.java new file mode 100644 index 00000000000..54badf69d2d --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/DocumentImpl.java @@ -0,0 +1,106 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * Document node of this DOM implementation. Besides plain accessors for its + * context, master document, document type and root element, it is the factory + * for the package's element, text, comment, PI and attribute objects. + */ +public class DocumentImpl extends NodeImpl implements Document { + DocumentContext context; + Document masterDoc = this; + Node type; + Element rootNode; + + public DocumentImpl() { + super(Node.DOCUMENT); + } + + public Document getMasterDoc() { + return masterDoc; + } + public void setMasterDoc(Document arg) { + masterDoc = arg; + } + + public Node getDocumentType() { + return type; + } + public void setDocumentType(Node arg) { + type = arg; + } + + public Element getDocumentElement() { + return rootNode; + } + public void setDocumentElement(Element arg) { + rootNode = arg; + } + + public DocumentContext getContextInfo() { + return context; + } + public void setContextInfo(DocumentContext arg) { + context = arg; + } + + public Document createDocument() { + return new DocumentImpl(); + } + + public DocumentContext createDocumentContext() { + return new DocContextImpl(); + } + + public Element createElement(String tagName, AttributeList attributes) { + return new ElementImpl(tagName, attributes); + } + + public Text createTextNode(String data) { + return new TextImpl(Node.TEXT, data); + } + + public Comment
createComment(String data) { + return new TextImpl(Node.COMMENT, data); + } + + /** Creates a PI node as a TextImpl of type Node.PI holding data, then sets its name. */ + public PI createPI(String name, String data) { + PI pi = new TextImpl(Node.PI, data); + pi.setName(name); + return pi; + } + + /** Creates an attribute with the given name and value node, marked as specified. */ + public Attribute createAttribute(String name, Node value) { + return new AttrImpl(name, value, true); + } + + public AttributeList createAttributeList() { + return new AttrListImpl(); + } + + /** Not implemented: always throws a NullPointerException with the message "NOT IMPLEMENTED". */ + public NodeIterator getElementsByTagName() { + throw new NullPointerException("NOT IMPLEMENTED"); + } + + /** Returns the fixed label "ROOT"; the code after the return is commented out and never runs. */ + public String toString() { + return "ROOT"; + /* + if (children == null) return ""; + StringBuffer sb = new StringBuffer(); + int len = children.getLength(); + for (int i = 0; i < len; i++) { + System.out.println(children.item(i)); + } + return sb.toString(); + */ + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/ElementImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/ElementImpl.java new file mode 100644 index 00000000000..98b011cee9f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/ElementImpl.java @@ -0,0 +1,55 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT.
+ */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * + * @author Anders Kristensen + */ +public class ElementImpl extends NodeImpl implements Element { + protected String tagName; + protected AttributeList attrs; // Note: Parser ensures this wont be null + + public ElementImpl(String tagName, AttributeList attributes) { + super(Node.ELEMENT); + this.tagName = tagName; + attrs = attributes; + } + + public String getTagName() { + return tagName; + } + + public AttributeList attributes() { + return attrs; + } + + public void setAttribute(Attribute newAttr) { + if (attrs == null) attrs = new AttrListImpl(); + attrs.setAttribute(newAttr); + } + + public void normalize() {} + + public NodeIterator getElementsByTagName() { + throw new IllegalArgumentException( + "Why wasn't this method defined by the DOM WG to take an arg???"); + } + + public String toString() { + boolean empty = (children == null || children.getLength() == 0); + return "<" + tagName + " " + + (attrs != null ? attrs.toString() : "{}") + + (empty ? " />" : ">"); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/EntityManager.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/EntityManager.java new file mode 100644 index 00000000000..c88b0e27c0a --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/EntityManager.java @@ -0,0 +1,135 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; +import java.util.Hashtable; +import java.io.*; + +/** + * A very simple entity manager. 
+ * @author Anders Kristensen + */ +public class EntityManager { + protected Hashtable entities = new Hashtable(); + private hplb.org.xml.sax.Parser tok; + + public EntityManager(hplb.org.xml.sax.Parser tok) { + this.tok = tok; + entities.put("amp", "&"); + entities.put("lt", "<"); + entities.put("gt", ">"); + entities.put("apos", "'"); + entities.put("quot", "\""); + } + + /** + * Finds entitiy and character references in the provided char array + * and decodes them. The operation is destructive, i.e. the encoded + * string replaces the original - this is atrightforward since the + * new string can only get shorter. + */ + public final CharBuffer entityDecode(CharBuffer buffer) throws Exception { + char[] buf = buffer.getCharArray(); // avoids method calls + int len = buffer.size(); + + // not fastest but certainly simplest: + if (indexOf(buf, '&', 0, len) == -1) return buffer; + CharBuffer newbuf = new CharBuffer(len); + + for (int start = 0; ; ) { + int x = indexOf(buf, '&', start, len); + if (x == -1) { + newbuf.write(buf, start, len - start); + return newbuf; + } else { + newbuf.write(buf, start, x - start); + start = x+1; + x = indexOf(buf, ';', start, len); + if (x == -1) { + //tok.warning("Entity reference not semicolon terminated"); + newbuf.write('&'); + //break; //??????????? + } else { + try { + writeEntityDef(buf, start, x-start, newbuf); + start = x+1; + } catch (Exception ex) { + //tok.warning("Bad entity reference"); + } + } + } + } + } + + // character references are rare enough that we don't care about + // creating a String object for them unnecessarily... 
+ public void writeEntityDef(char[] buf, int off, int len, Writer out) + throws Exception, IOException, NumberFormatException + { + Integer ch; + //System.out.println("Entity: " + new String(buf, off, len) +" "+off+" "+len); + + if (buf[off] == '#') { // character reference + off++; + len--; + if (buf[off] == 'x' || buf[off] == 'X') { + ch = Integer.valueOf(new String(buf, off+1, len-1), 16); + } else { + ch = Integer.valueOf(new String(buf, off, len)); + } + out.write(ch.intValue()); + } else { + String ent = new String(buf, off, len); + String val = (String) entities.get(ent); + if (val != null) { + out.write(val); + } else { + out.write("&" + ent + ";"); + //tok.warning("unknown entity reference: " + ent); + } + } + } + + public String defTextEntity(String entity, String value) { + return (String) entities.put(entity, value); + } + + /** + * Returns the index within this String of the first occurrence of the + * specified character, starting the search at fromIndex. This method + * returns -1 if the character is not found. + * @params buf the buffer to search + * @params ch the character to search for + * @params from the index to start the search from + * @params to the highest possible index returned plus 1 + * @throws IndexOutOfBoundsException if index out of bounds... 
+ */ + public static final int indexOf(char[] buf, int ch, int from, int to) { + int i; + for (i = from; i < to && buf[i] != ch; i++) + ; // do nothing + if (i < to) return i; + else return -1; + } + + // FOR TESTING + /* + public static void main(String[] args) throws Exception { + Parser tok = new Parser(); + tst.xml.TokArgs.args(args, tok); + CharBuffer buf1 = new CharBuffer(); + buf1.write(args[0]); + CharBuffer buf2 = tok.entMngr.entityDecode(buf1); + + System.out.println("Changed: " + (buf1 != buf2)); + System.out.println("Result: [" + buf2 + "]"); + } + */ +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HTML.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HTML.java new file mode 100644 index 00000000000..7884315466a --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HTML.java @@ -0,0 +1,281 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +/** + * Parser customizations for correctly parsing HTML. + * Defines a set of empty elements (<hr>, <br>, etc.) + * and for some elements it defines which other start tags + * implicitly ends them. As an example, an <li> element within + * a <ul> list is terminated by either a </ul> end tag + * or another <li> start tag. 
+ * + * @author Anders Kristensen + */ +public class HTML { + public static String A = a("a");// + public static String ACRONYM = a("acronym");// + public static String ADDRESS = a("address"); + public static String APPLET = a("applet");// + public static String AREA = a("area"); + public static String B = a("b");// + public static String BASE = a("base"); + public static String BASEFONT = a("basefont");// + public static String BDO = a("bdo");// + public static String BIG = a("big");// + public static String BLOCKQUOTE = a("blockquote"); + public static String BODY = a("body");// + public static String BR = a("br"); + public static String BUTTON = a("button");// + public static String CAPTION = a("caption");// + public static String CENTER = a("center"); + public static String CITE = a("cite");// + public static String CODE = a("code");// + public static String COL = a("col"); + public static String COLGROUP = a("colgroup");// + public static String DD = a("dd"); + public static String DEL = a("del");// + public static String DFN = a("dfn");// + public static String DIR = a("dir"); + public static String DIV = a("div"); + public static String DL = a("dl"); + public static String DT = a("dt"); + public static String EM = a("em");// + public static String FIELDSET = a("fieldset"); + public static String FONT = a("font");// + public static String FORM = a("form"); + public static String FRAME = a("frame"); + public static String FRAMESET = a("frameset");// + public static String H1 = a("h1"); + public static String H2 = a("h2"); + public static String H3 = a("h3"); + public static String H4 = a("h4"); + public static String H5 = a("h5"); + public static String H6 = a("h6"); + public static String HEAD = a("head"); + public static String HR = a("hr"); + public static String HTML = a("html"); + public static String I = a("i");// + public static String IFRAME = a("iframe");// + public static String IMG = a("img"); + public static String INPUT = a("input"); + public 
static String INS = a("ins");// + public static String ISINDEX = a("isindex");// + public static String KBD = a("kbd");// + public static String LABEL = a("label");// + public static String LEGEND = a("legend");// + public static String LI = a("li"); + public static String LINK = a("link"); + public static String MAP = a("map");// + public static String MENU = a("menu"); + public static String META = a("meta"); + public static String NOFRAMES = a("noframes");// + public static String NOSCRIPT = a("noscript"); + public static String OBJECT = a("object");// + public static String OL = a("ol"); + public static String OPTION = a("option");// + public static String P = a("p"); + public static String PARAM = a("param"); + public static String PRE = a("pre"); + public static String Q = a("q");// + public static String S = a("s");// + public static String SAMP = a("samp");// + public static String SCRIPT = a("script");// + public static String SELECT = a("select");// + public static String SMALL = a("small");// + public static String SPAN = a("span");// + public static String STRIKE = a("strike");// + public static String STRONG = a("strong");// + public static String STYLE = a("style");// + public static String SUB = a("sub");// + public static String SUP = a("sup");// + public static String TABLE = a("table"); + public static String TBODY = a("tbody");// + public static String TD = a("td");// + public static String TEXTAREA = a("textarea");// + public static String TFOOT = a("tfoot");// + public static String TH = a("th");// + public static String THEAD = a("thead");// + public static String TITLE = a("title");// + public static String TR = a("tr"); + public static String TT = a("tt");// + public static String U = a("u");// + public static String UL = a("ul"); + public static String VAR = a("var");// + + private static String a(String s) { return Atom.getAtom(s); } + + /** The full set of HTML4.0 element names. 
*/ + public static final String[] elements = { + A, ACRONYM, ADDRESS, APPLET, AREA , B, BASE, BASEFONT, BDO, BIG, + BLOCKQUOTE, BODY, BR, BUTTON, CAPTION, CENTER, CITE, CODE, COL, + COLGROUP, DD, DEL, DFN, DIR, DIV, DL, DT, EM, FIELDSET, FONT, FORM, + FRAME, FRAMESET, H1, H2, H3, H4, H5, H6, HEAD, HR, HTML, I, IFRAME, + IMG, INPUT, INS, ISINDEX, KBD, LABEL, LEGEND, LI, LINK, MAP, MENU, + META, NOFRAMES, NOSCRIPT, OBJECT, OL, OPTION, P, PARAM, PRE, Q, S, + SAMP, SCRIPT, SELECT, SMALL, SPAN, STRIKE, STRONG, STYLE, SUB, SUP, + TABLE, TBODY, TD, TEXTAREA, TFOOT, TH, THEAD, TITLE, TR, TT, U, UL, VAR + }; + + // FIXME: the parser kindof supports optional end tags but not + // at all optional start tags (eg , ) + // FIXME: add support for HTML entities not in HTML (lots of those) + + // FIXME: this list probably not complete!!! + /** Empty elements in HTML4.0: br, img, etc. */ + public static final String[] emptyElms = { + AREA, BASE, BR, COL, FRAME, HR, IMG, LINK, META, PARAM }; + + public static final String[] li_terminators = { LI }; + public static final String[] dt_terminators = { DT, DD }; + public static final String[] dd_terminators = dt_terminators; + // terminators: and just about everything else + + /** Block-level HTML4.0 elements. */ + public static final String[] block_level = { + ADDRESS, BLOCKQUOTE, CENTER, DIR, DIV, DL, FIELDSET, FORM, + H1, H2, H3, H4, H5, H6, HR, MENU, NOSCRIPT, OL, P, PRE, TABLE, UL }; + + // The P element can contain any *inline* markup - hence it is + // terminated by any *blocklevel* markup (incl. 
other P elements): + public static final String[] p_terminators = block_level; + + // elements which cannot contain PCDATA don't care about whitespace + // FIXME: ignore_ws probably not complete [don't include empty elements] + public static final String[] ignore_ws = { + HEAD, HTML, OL, MENU, TABLE, TR , UL }; + + public static void applyHacks(Tokenizer tok) { + for (int i = 0; i < elements.length; i++) { + tok.ignoreCase(elements[i]); + } + + EntityManager entMngr = tok.entMngr; + + // standard SGML entities + entMngr.defTextEntity("amp", "&"); // ampersand + entMngr.defTextEntity("gt", ">"); // greater than + entMngr.defTextEntity("lt", "<"); // less than + entMngr.defTextEntity("quot", "\""); // double quote + + // PUBLIC ISO 8879-1986// entities Added Latin 1//EN//HTML + entMngr.defTextEntity("AElig", "\u00c6"); // capital AE diphthong (ligature) + entMngr.defTextEntity("Aacute", "\u00c1"); // capital A, acute accent + entMngr.defTextEntity("Acirc", "\u00c2"); // capital A, circumflex accent + entMngr.defTextEntity("Agrave", "\u00c0"); // capital A, grave accent + entMngr.defTextEntity("Aring", "\u00c5"); // capital A, ring + entMngr.defTextEntity("Atilde", "\u00c3"); // capital A, tilde + entMngr.defTextEntity("Auml", "\u00c4"); // capital A, dieresis or umlaut mark + entMngr.defTextEntity("Ccedil", "\u00c7"); // capital C, cedilla + entMngr.defTextEntity("ETH", "\u00d0"); // capital Eth, Icelandic + entMngr.defTextEntity("Eacute", "\u00c9"); // capital E, acute accent + entMngr.defTextEntity("Ecirc", "\u00ca"); // capital E, circumflex accent + entMngr.defTextEntity("Egrave", "\u00c8"); // capital E, grave accent + entMngr.defTextEntity("Euml", "\u00cb"); // capital E, dieresis or umlaut mark + entMngr.defTextEntity("Iacute", "\u00cd"); // capital I, acute accent + entMngr.defTextEntity("Icirc", "\u00ce"); // capital I, circumflex accent + entMngr.defTextEntity("Igrave", "\u00cc"); // capital I, grave accent + entMngr.defTextEntity("Iuml", "\u00cf"); // 
capital I, dieresis or umlaut mark + entMngr.defTextEntity("Ntilde", "\u00d1"); // capital N, tilde + entMngr.defTextEntity("Oacute", "\u00d3"); // capital O, acute accent + entMngr.defTextEntity("Ocirc", "\u00d4"); // capital O, circumflex accent + entMngr.defTextEntity("Ograve", "\u00d2"); // capital O, grave accent + entMngr.defTextEntity("Oslash", "\u00d8"); // capital O, slash + entMngr.defTextEntity("Otilde", "\u00d5"); // capital O, tilde + entMngr.defTextEntity("Ouml", "\u00d6"); // capital O, dieresis or umlaut mark + entMngr.defTextEntity("THORN", "\u00de"); // capital THORN, Icelandic + entMngr.defTextEntity("Uacute", "\u00da"); // capital U, acute accent + entMngr.defTextEntity("Ucirc", "\u00db"); // capital U, circumflex accent + entMngr.defTextEntity("Ugrave", "\u00d9"); // capital U, grave accent + entMngr.defTextEntity("Uuml", "\u00dc"); // capital U, dieresis or umlaut mark + entMngr.defTextEntity("Yacute", "\u00dd"); // capital Y, acute accent + entMngr.defTextEntity("aacute", "\u00e1"); // small a, acute accent + entMngr.defTextEntity("acirc", "\u00e2"); // small a, circumflex accent + entMngr.defTextEntity("aelig", "\u00e6"); // small ae diphthong (ligature) + entMngr.defTextEntity("agrave", "\u00e0"); // small a, grave accent + entMngr.defTextEntity("aring", "\u00e5"); // small a, ring + entMngr.defTextEntity("atilde", "\u00e3"); // small a, tilde + entMngr.defTextEntity("auml", "\u00e4"); // small a, dieresis or umlaut mark + entMngr.defTextEntity("ccedil", "\u00e7"); // small c, cedilla + entMngr.defTextEntity("eacute", "\u00e9"); // small e, acute accent + entMngr.defTextEntity("ecirc", "\u00ea"); // small e, circumflex accent + entMngr.defTextEntity("egrave", "\u00e8"); // small e, grave accent + entMngr.defTextEntity("eth", "\u00f0"); // small eth, Icelandic + entMngr.defTextEntity("euml", "\u00eb"); // small e, dieresis or umlaut mark + entMngr.defTextEntity("iacute", "\u00ed"); // small i, acute accent + entMngr.defTextEntity("icirc", 
"\u00ee"); // small i, circumflex accent + entMngr.defTextEntity("igrave", "\u00ec"); // small i, grave accent + entMngr.defTextEntity("iuml", "\u00ef"); // small i, dieresis or umlaut mark + entMngr.defTextEntity("ntilde", "\u00f1"); // small n, tilde + entMngr.defTextEntity("oacute", "\u00f3"); // small o, acute accent + entMngr.defTextEntity("ocirc", "\u00f4"); // small o, circumflex accent + entMngr.defTextEntity("ograve", "\u00f2"); // small o, grave accent + entMngr.defTextEntity("oslash", "\u00f8"); // small o, slash + entMngr.defTextEntity("otilde", "\u00f5"); // small o, tilde + entMngr.defTextEntity("ouml", "\u00f6"); // small o, dieresis or umlaut mark + entMngr.defTextEntity("szlig", "\u00df"); // small sharp s, German (sz ligature) + entMngr.defTextEntity("thorn", "\u00fe"); // small thorn, Icelandic + entMngr.defTextEntity("uacute", "\u00fa"); // small u, acute accent + entMngr.defTextEntity("ucirc", "\u00fb"); // small u, circumflex accent + entMngr.defTextEntity("ugrave", "\u00f9"); // small u, grave accent + entMngr.defTextEntity("uuml", "\u00fc"); // small u, dieresis or umlaut mark + entMngr.defTextEntity("yacute", "\u00fd"); // small y, acute accent + entMngr.defTextEntity("yuml", "\u00ff"); // small y, dieresis or umlaut mark + + // Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) + entMngr.defTextEntity("nbsp", "\u00a0"); // non breaking space + entMngr.defTextEntity("reg", "\u00ae"); // registered sign + entMngr.defTextEntity("copy", "\u00a9"); // copyright sign + + // Additional ISO-8859/1 entities listed in rfc1866 (section 14) + entMngr.defTextEntity("iexcl", "\u00a1"); + entMngr.defTextEntity("cent", "\u00a2"); + entMngr.defTextEntity("pound", "\u00a3"); + entMngr.defTextEntity("curren", "\u00a4"); + entMngr.defTextEntity("yen", "\u00a5"); + entMngr.defTextEntity("brvbar", "\u00a6"); + entMngr.defTextEntity("sect", "\u00a7"); + entMngr.defTextEntity("uml", "\u00a8"); + entMngr.defTextEntity("ordf", "\u00aa"); + 
entMngr.defTextEntity("laquo", "\u00ab"); + entMngr.defTextEntity("not", "\u00ac"); + entMngr.defTextEntity("shy", "\u00ad"); // soft hyphen + entMngr.defTextEntity("macr", "\u00af"); + entMngr.defTextEntity("deg", "\u00b0"); + entMngr.defTextEntity("plusmn", "\u00b1"); + entMngr.defTextEntity("sup1", "\u00b9"); + entMngr.defTextEntity("sup2", "\u00b2"); + entMngr.defTextEntity("sup3", "\u00b3"); + entMngr.defTextEntity("acute", "\u00b4"); + entMngr.defTextEntity("micro", "\u00b5"); + entMngr.defTextEntity("para", "\u00b6"); + entMngr.defTextEntity("middot", "\u00b7"); + entMngr.defTextEntity("cedil", "\u00b8"); + entMngr.defTextEntity("ordm", "\u00ba"); + entMngr.defTextEntity("raquo", "\u00bb"); + entMngr.defTextEntity("frac14", "\u00bc"); + entMngr.defTextEntity("frac12", "\u00bd"); + entMngr.defTextEntity("frac34", "\u00be"); + entMngr.defTextEntity("iquest", "\u00bf"); + entMngr.defTextEntity("times", "\u00d7"); + entMngr.defTextEntity("divide", "\u00f7"); + } + + public static void applyHacks(Parser parser) { + parser.addEmptyElms(emptyElms); + parser.setElmTerminators(LI, li_terminators); + parser.setElmTerminators(DT, dt_terminators); + parser.setElmTerminators(DD, dd_terminators); + parser.setElmTerminators(P, p_terminators); + //parser.ignoreWS(ginore_ws); + applyHacks(parser.getTokenizer()); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HtmlXmlParser.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HtmlXmlParser.java new file mode 100644 index 00000000000..d2563367eb5 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/HtmlXmlParser.java @@ -0,0 +1,34 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. 
+ */
+
+package hplb.xml;
+
+/**
+ * The HtmlXmlParser is a Parser with some HTML specific hacks
+ * applied to it, which means it will more or less correctly parse most
+ * HTML pages, even when they contain arbitrary embedded XML markup. It is
+ * very forgiving, as is commonly the case with HTML parsers.
+ *
+ * @author Anders Kristensen
+ */
+public class HtmlXmlParser extends Parser {
+    /**
+     * Creates the parser, applies the customizations from
+     * HTML.applyHacks(Parser) to it and clears the tokenizer's
+     * rcgnzCDATA flag.
+     */
+    public HtmlXmlParser() {
+        super();
+        HTML.applyHacks(this);
+        tok.rcgnzCDATA = false;
+    }
+
+    // for debugging: parses the document read from standard input and
+    // hands the resulting Document to Utils.pp together with System.out
+    public static void main(String[] args) throws Exception {
+        Parser parser = new HtmlXmlParser();
+        hplb.org.w3c.dom.Document doc = parser.parse(System.in);
+        Utils.pp(doc, System.out);
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeImpl.java
new file mode 100644
index 00000000000..60ea04ad890
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeImpl.java
@@ -0,0 +1,88 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file LICENSE.TXT.
+ */
+
+package hplb.xml;
+
+import hplb.org.w3c.dom.*;
+
+/**
+ * Implements management of list of children.
+ * @author Anders Kristensen
+ */
+public abstract class NodeImpl implements Node {
+    /** Node type code supplied to the constructor. */
+    protected int type;
+    /** Node whose child list holds this node; null until attached and after removal. */
+    protected NodeImpl parent;
+    /** The children of this node. */
+    protected NodeListImpl children = new NodeListImpl();
+
+    /** Creates a node of the given type with no parent and no children. */
+    public NodeImpl(int type) {
+        this.type = type;
+    }
+
+    /** Returns the child list object itself, not a copy. */
+    public NodeListImpl getChildren() {
+        return children;
+    }
+
+    /** Returns the type code passed to the constructor. */
+    public int getNodeType() {
+        return type;
+    }
+
+    /** Returns the current parent, or null when this node has none. */
+    public Node getParentNode() {
+        return parent;
+    }
+
+    /** Returns a new iterator over the child list. */
+    public NodeIterator getChildNodes() {
+        return children.getIterator();
+    }
+
+    /** Returns true when the child list holds at least one node. */
+    public boolean hasChildNodes() {
+        int childCount = children.getLength();
+        return childCount > 0;
+    }
+
+    /** Returns the child at position 0 of the child list. */
+    public Node getFirstChild() {
+        return children.item(0);
+    }
+
+    /** Looks this node up in the parent's child list; null without a parent. */
+    public Node getPreviousSibling() {
+        return (parent != null) ? parent.children.getPreviousNode(this) : null;
+    }
+
+    /** Looks this node up in the parent's child list; null without a parent. */
+    public Node getNextSibling() {
+        return (parent != null) ? parent.children.getNextNode(this) : null;
+    }
+
+    /**
+     * Delegates to the child list's insertBefore and, when that yields a
+     * node, records this node as newChild's parent. Both casts require
+     * NodeImpl instances.
+     */
+    public Node insertBefore(Node newChild, Node refChild) {
+        NodeImpl inserted = (NodeImpl) children.insertBefore(newChild, refChild);
+        if (inserted == null) {
+            return null;
+        }
+        ((NodeImpl) newChild).parent = this;
+        return inserted;
+    }
+
+    /**
+     * Delegates to the child list's insertAfter and, when that yields a
+     * node, records this node as newChild's parent.
+     */
+    public Node insertAfter(Node newChild, Node refChild) {
+        NodeImpl inserted = (NodeImpl) children.insertAfter(newChild, refChild);
+        if (inserted == null) {
+            return null;
+        }
+        ((NodeImpl) newChild).parent = this;
+        return inserted;
+    }
+
+    /**
+     * Delegates to the child list's replace. When the list hands back the
+     * replaced node, that node is detached and newChild is attached here.
+     */
+    public Node replaceChild(Node newChild, Node oldChild) {
+        NodeImpl replaced = (NodeImpl) children.replace(newChild, oldChild);
+        if (replaced == null) {
+            return null;
+        }
+        replaced.parent = null;
+        ((NodeImpl) newChild).parent = this;
+        return replaced;
+    }
+
+    /**
+     * Delegates to the child list's remove and clears the parent link of
+     * the node the list hands back, if any.
+     */
+    public Node removeChild(Node oldChild) {
+        NodeImpl removed = (NodeImpl) children.remove(oldChild);
+        if (removed != null) {
+            removed.parent = null;
+        }
+        return removed;
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeListImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeListImpl.java
new file mode 100644
index 00000000000..92271b549df
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/NodeListImpl.java
@@ -0,0 +1,223 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file LICENSE.TXT.
+ */
+
+package hplb.xml;
+
+import hplb.org.w3c.dom.*;
+
+/**
+ * Growable, array-backed list of nodes. The backing array doubles
+ * whenever it is full; only the first count slots hold list elements.
+ *
+ * @author Anders Kristensen
+ */
+public class NodeListImpl {
+    protected Node[] elms;
+    protected int count = 0;
+
+    /** Creates an empty list with room for 5 nodes. */
+    public NodeListImpl() {
+        this(5);
+    }
+
+    /**
+     * Creates an empty list with the given initial capacity.
+     * @throws IllegalArgumentException if size is not positive
+     */
+    public NodeListImpl(int size) {
+        if (size <= 0) throw new IllegalArgumentException(
+            "Initial size must be at least 1");
+        elms = new Node[size];
+    }
+
+    /** Appends node, doubling the backing array first when it is full. */
+    public synchronized void add(Node node) {
+        int len = elms.length;
+        if (len == count) {
+            Node[] e = new Node[len * 2];
+            System.arraycopy(elms, 0, e, 0, len);
+            elms = e;
+        }
+        elms[count++] = node;
+    }
+
+    /**
+     * Replaces the node at index. An index equal to the current length
+     * appends replaceNode instead.
+     * @return the node previously at index, or null when appended
+     * @throws IndexOutOfBoundsException if index is negative or greater
+     *         than the current length
+     */
+    public synchronized Node replace(int index, Node replaceNode) {
+        // The guard used to be "index >= count", which made the append
+        // branch below unreachable; it now matches insert().
+        if (index < 0 || index > count) {
+            throw new IndexOutOfBoundsException(""+index);
+        } else if (index == count) {
+            add(replaceNode);
+            return null;
+        } else {
+            Node node = elms[index];
+            elms[index] = replaceNode;
+            return node;
+        }
+    }
+
+    /**
+     * Inserts newNode at index, shifting later nodes one slot up. An
+     * index equal to the current length appends.
+     * @return the node that previously occupied index, or null when
+     *         newNode was appended
+     * @throws IndexOutOfBoundsException if index is negative or greater
+     *         than the current length
+     */
+    public synchronized Node insert(int index, Node newNode) {
+        if (index < 0 || index > count) {
+            throw new IndexOutOfBoundsException(""+index);
+        }
+        if (index == count) {
+            add(newNode);
+            return null;
+        }
+        // Capture the displaced node before shifting so the result is the
+        // same whether or not the array has to grow (the growing path used
+        // to read the still-empty slot of the new array and return null).
+        Node res = elms[index];
+        int len = elms.length;
+        if (len == count) {
+            Node[] e = new Node[len * 2];
+            System.arraycopy(elms, 0, e, 0, index);
+            System.arraycopy(elms, index, e, index+1, count-index);
+            elms = e;
+        } else {
+            System.arraycopy(elms, index, elms, index+1, count-index);
+        }
+        elms[index] = newNode;
+        count++;
+        return res;
+    }
+
+    /** Returns a new iterator positioned before the first node. */
+    public NodeIterator getIterator() {
+        return new NodeIteratorImpl(this);
+    }
+
+    /**
+     * Removes and returns the node at index, shifting later nodes down.
+     * @throws IndexOutOfBoundsException if index is outside the list
+     */
+    public synchronized Node remove(int index) {
+        if (index < 0 || index >= count) {
+            throw new IndexOutOfBoundsException(""+index);
+        }
+        Node node = elms[index];
+        System.arraycopy(elms, index+1, elms, index, count-index-1);
+        count--;
+        // Clear the vacated tail slot so the array does not keep the last
+        // node reachable after it has left the list.
+        elms[count] = null;
+        return node;
+    }
+
+    /** Returns the node at index, or null when index is outside the list. */
+    public synchronized Node item(int index) {
+        if (index < 0 || index >= count) {
+            return null;
+        }
+        return elms[index];
+    }
+
+    /** Returns the number of nodes in this list. */
+    public synchronized int getLength() {
+        return count;
+    }
+
+    /** Returns the node before the given one; null if it is first or absent. */
+    public Node getPreviousNode(Node node) {
+        for (int i = 1; i < count; i++) {
+            if (elms[i] == node) return elms[i-1];
+        }
+        return null;
+    }
+
+    /** Returns the node after the given one; null if it is last or absent. */
+    public Node getNextNode(Node node) {
+        for (int i = 0; i < count-1; i++) {
+            if (elms[i] == node) return elms[i+1];
+        }
+        return null;
+    }
+
+    /** Inserts node before ref, or appends it when ref is not in the list. */
+    public Node insertBefore(Node node, Node ref) {
+        int idx = index(ref);
+        if (idx > -1) insert(idx, node);
+        else add(node);
+        return node;
+    }
+
+    /** Inserts node after ref, or appends it when ref is not in the list. */
+    public Node insertAfter(Node node, Node ref) {
+        int idx = index(ref);
+        if (idx > -1) insert(idx+1, node);
+        else add(node);
+        return node;
+    }
+
+    /**
+     * Replaces ref with node.
+     * @return the replaced node, or null when ref is not in the list
+     */
+    public Node replace(Node node, Node ref) {
+        // A missing ref used to pass -1 to replace(int, Node) and throw;
+        // returning null mirrors remove(Node).
+        int idx = index(ref);
+        return (idx > -1 ? replace(idx, node) : null);
+    }
+
+    /** Removes node; returns it, or null when it is not in the list. */
+    public Node remove(Node node) {
+        int idx = index(node);
+        return (idx > -1 ? remove(idx) : null);
+    }
+
+    /** Returns the position of node (compared by identity), or -1. */
+    public int index(Node node) {
+        for (int i = 0; i < count; i++) {
+            if (elms[i] == node) return i;
+        }
+        return -1;
+    }
+
+    /** Renders the list as "{ a, b, c }". */
+    public synchronized String toString() {
+        StringBuffer sb = new StringBuffer();
+        boolean f = true;
+        int count = getLength();
+
+        sb.append("{ ");
+        for (int i = 0; i < count; i++) {
+            if (f) { f = false; }
+            else { sb.append(", "); }
+            // append(Object) prints "null" for a null element instead of
+            // throwing as an explicit toString() call would
+            sb.append(item(i));
+        }
+        sb.append(" }");
+        return sb.toString();
+    }
+}
+
+
+// FIXME: doesn't work properly when list changed underneath iterator
+// proper thing would be to use observer pattern on current element--if
+// this is removed we get callback and reposition the cursor... THISISAHACK!
+// FIXME synchronize on the list itself.
+class NodeIteratorImpl implements NodeIterator {
+    NodeListImpl nlist;
+    int index;
+
+    /**
+     * Create iterator over the specified NodeList. The initial position
+     * will be one *before* the first element. Calling toNext() will
+     * position the iterator at the first element.
+     */
+    public NodeIteratorImpl(NodeListImpl list) {
+        nlist = list;
+        index = -1;
+    }
+
+    /** Returns the current length of the underlying list. */
+    public int getLength() {
+        return nlist.getLength();
+    }
+
+    /** Returns the node at the cursor, or null when the cursor is off the list. */
+    public Node getCurrent() {
+        return (index >= 0 && index < nlist.count) ? nlist.item(index) : null;
+    }
+
+    /** Advances the cursor (at most to one past the end) and returns the node there. */
+    public Node toNext() {
+        if (index < nlist.count) index++;
+        return getCurrent();
+    }
+
+    /** Moves the cursor back (at most to one before the start) and returns the node there. */
+    public Node toPrevious() {
+        if (index >= 0) index--;
+        return getCurrent();
+    }
+
+    /** Moves to the first node; returns null for an empty list. */
+    public Node toFirst() {
+        index = 0;
+        return getCurrent();
+    }
+
+    /** Moves to the last node; returns null for an empty list. */
+    public Node toLast() {
+        // The last valid position is count - 1; the former "count" put the
+        // cursor one past the end, so this method always returned null.
+        index = nlist.count - 1;
+        return getCurrent();
+    }
+
+    /** Moves the cursor to Nth without range checking; returns the node there or null. */
+    public Node toNth(int Nth) {
+        index = Nth;
+        return getCurrent();
+    }
+
+    // FIXME: multi-threading problems here... (race condition)
+    /** Moves to destNode if the list contains it; otherwise leaves the cursor alone and returns null. */
+    public Node toNode(Node destNode) {
+        int idx = nlist.index(destNode);
+        return (idx >= 0 ? toNth(idx) : null);
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Parser.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Parser.java
new file mode 100644
index 00000000000..aa76e03f3c6
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Parser.java
@@ -0,0 +1,208 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file LICENSE.TXT.
+ */
+package hplb.xml;
+
+import hplb.org.xml.sax.*;
+import hplb.org.w3c.dom.*;
+import java.util.*;
+import java.io.*;
+
+/**
+ * Parses a stream of MarkupTokens into a tree structure.
+ * Uses Tokenizer.
+ *
+ *

This class has very shallow (no) understanding of HTML. Correct
+ * handling of <p> tags requires some special code as does correct
+ * handling of <li>. This parser doesn't know that an "li" tag can
+ * be terminated by another "li" tag or a "ul" end tag. Hence "li" is
+ * treated as an empty tag here which means that in the generated parse
+ * tree the children of the "li" element are represented as siblings of it.
+ *
+ * @see Tokenizer
+ * @author Anders Kristensen
+ */
+public class Parser implements DocumentHandler {
+    // FIXME: add support for discriminate per-element whitespace handling
+
+    /**
+     * Set of elements which the parser will expect to be empty, i.e. it
+     * will not expect an end tag (e.g. IMG, META HTML elements).
+     * End tags for any of these are ignored...
+     */
+    protected Hashtable emptyElms = new Hashtable();
+
+    /**
+     * Maps element names to a list of names of other elements which
+     * terminate that element. So for example "dt" might be mapped to
+     * ("dt", "dd") and "p" might be mapped to all blocklevel HTML
+     * elements.
+     */
+    protected Hashtable terminators = new Hashtable();
+    protected Tokenizer tok;
+    protected DOM dom;
+    protected Document root;
+    protected Node current;
+
+    /**
+     * Non-fatal errors are written to this PrintStream. Fatal errors
+     * are reported as Exceptions.
+     */
+    PrintStream err = System.err;
+
+    /**
+     * Creates a parser with its own Tokenizer, registered as that
+     * tokenizer's document handler, and a DOMImpl as the node factory.
+     */
+    public Parser() {
+        tok = new Tokenizer();
+        tok.setDocumentHandler(this);
+        dom = new DOMImpl();
+    }
+
+    /**
+     * Replaces the DOM used to create documents.
+     * @param dom the DOM to use from now on
+     * @return the DOM that was in use before this call
+     */
+    public DOM setDOM(DOM dom) {
+        // Read the field explicitly: the parameter shadows it, so the
+        // former "DOM old = dom" returned the new DOM instead of the old one.
+        DOM old = this.dom;
+        this.dom = dom;
+        return old;
+    }
+
+    /** Returns the tokenizer created by the constructor. */
+    public Tokenizer getTokenizer() {
+        return tok;
+    }
+
+    /**
+     * Add the set of HTML empty elements to the set of tags recognized
+     * as empty tags.
+ */ + public void addEmptyElms(String[] elms) { + for (int i = 0; i < elms.length; i++) { + emptyElms.put(elms[i], elms[i]); + } + } + + public void clearEmptyElmSet() { + emptyElms.clear(); + } + + public boolean isEmptyElm(String elmName) { + return emptyElms.get(elmName) != null; + } + + public void setElmTerminators(String elmName, String[] elmTerms) { + terminators.put(elmName, putIds(new Hashtable(), elmTerms)); + } + + public void addTerminator(String elmName, String elmTerm) { + Hashtable h = (Hashtable) terminators.get(elmName); + if (h == null) terminators.put(elmName, h = new Hashtable()); + h.put(elmTerm, elmTerm); + } + + public static final Dictionary putIds(Dictionary dict, String[] sary) { + for (int i = 0; i < sary.length; i++) { + dict.put(sary[i], sary[i]); + } + return dict; + } + + protected Document root() { + return root; + } + + public Document parse(InputStream in) throws Exception { + root = dom.createDocument(null); + current = root; + tok.parse(in); + return root(); + } + + public void startDocument() {} + public void endDocument() {} + + // FIXME: record in root DOCUMENT the id's of elements which have one + + public void doctype(String name, String publicID, String systemID) { + } + + public void startElement(String name, AttributeMap attributes) { + //System.out.println("CURRENT: " + current); + + // does this new element terminate the current element? + if (current != root) { + String tagName = ((Element) current).getTagName(); + if (tagName != null) { + Hashtable terms = (Hashtable) terminators.get(tagName); + if (terms != null && terms.get(name) != null) { + current = current.getParentNode(); // FIXME: could be null + } + } + } + + Element elm = root.createElement(name, getDOMAttrs(attributes)); + // FIXME:


gets written as
- the following line changes + // this tp
which is even wors - we should distinguish between + // those two types of empty elements. + current.insertBefore(elm, null); + if (!isEmptyElm(name)) current = elm; + } + + public void endElement(String name) { + // we go up the parse tree till we find the node which matches + // this end tag. This mechanism elegantly handles "implicitly + // closed" elements such as
  • being terminated by an + // enclosing
      being ended. + + //System.out.println("CURRENT: " + current); + + Node node = current; + for (;;) { + if (node == root) { + err.println("Stray end tag ignored: " + name + + " line " + tok.line + " column " + tok.column); + return; + } else if (name.equals(((Element) node).getTagName())) { + current = node.getParentNode(); + return; + } else { + node = node.getParentNode(); + } + } + } + + public void characters(char[] ch, int start, int length) { + current.insertBefore( + root.createTextNode(new String(ch, start, length)), null); + } + + public void ignorable (char ch[], int start, int length) { + System.out.println("Ignorable ws: " + new String(ch, start, length)); + } + + public void processingInstruction(String target, String remainder) { + // FIXME: the DOM says 2nd arg should be everything between "" + current.insertBefore(root.createPI(target, remainder), null); + } + + public AttributeList getDOMAttrs(AttributeMap attrs) { + String name; + Node value; + Enumeration e; + AttributeList domAttrs = root.createAttributeList(); + + for (e = attrs.getAttributeNames(); e.hasMoreElements(); ) { + name = (String) e.nextElement(); + value = root.createTextNode(attrs.getValue(name)); + domAttrs.setAttribute(root.createAttribute(name, value)); + } + return domAttrs; + } + + // for debugging + public static void main(String[] args) throws Exception { + Parser parser = new Parser(); + Document doc = parser.parse(System.in); + Utils.pp(doc, System.out); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java new file mode 100644 index 00000000000..69bee4117e4 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/SAXAttributeMap.java @@ -0,0 +1,229 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in 
the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +import hplb.org.xml.sax.AttributeMap; +import java.util.Enumeration; + +/** + * An ordered Dictionary. keys() and elements() returns Enumerations + * which enumerate over elements in the order they were inserted. + * Elements are stored linearly. Operations put(), get(), and remove() + * are linear in the number of elements in the Dictionary. + * + *

      Allows direct access to elements (as an alternative to using + * Enumerators) for speed. + * + *

      Can function as a bag, i.e. it can be created with a mode + * which allows the same key to map to multiple entries. In this case + * operations get() and remove() operate on the first pair in + * the map. Hence to get hold of all values associated with a key it is + * necessary to use the direct access to underlying arrays. + * + * @author Anders Kristensen + */ +public class SAXAttributeMap implements AttributeMap { + + /** The list of keys. */ + public String[] keys; + + /** List of values associated with keys. */ + public String[] elms; + + /** + * Number of elements in the Dictionary. + * The elements are held at indices 0 to n in the keys and elms arrays. + */ + public int n = 0; + + public SAXAttributeMap() { + this(5); + } + + /** + * Create a SAXAttributeMap with the specififed initial cpacity. + */ + public SAXAttributeMap(int size) { + if (size <= 0) throw new IllegalArgumentException( + "Initial size must be at least 1"); + keys = new String[size]; + elms = new String[size]; + } + + /** Returns the number of keys in this dictionary. */ + public synchronized int size() { + return n; + } + + /** Returns true if this dictionary maps no keys to value. */ + public synchronized boolean isEmpty() { + return size() == 0; + } + + /** + * Returns an enumeration of the keys in this dictionary. + */ + public Enumeration getAttributeNames() { + return new SAXAttributeEnum(keys, n); + } + + /** + * Returns the value to which the key is mapped in this dictionary. + */ + public synchronized String getValue(String key) { + int i = getIndex(key); + return (i < 0 ? null : elms[i]); + } + + protected int getIndex(String key) { + for (int i = 0; i < n; i++) { + if (keys[i].equals(key)) + return i; + } + return -1; + } + + /** + * Maps the specified key to the specified value in this dictionary. + * Neither the key nor the value can be null. + * + *

      The value can be retrieved by calling the get method with a key + * that is equal to the original key. + * @return the previous value to which the key was mapped in + * this dictionary, or null if the key did not have a + * previous mapping. + * @throws NullPointerException if the key or value is null + */ + public synchronized String put(String key, String value) { + if (value == null) throw new NullPointerException("value is null"); + int i = getIndex(key); + if (i >= 0) { + String old = elms[i]; + elms[i] = value; + return old; + } + int len = keys.length; + if (len == n) { + // double size of key,elms arrays + String[] k, e; + k = new String[len * 2]; + e = new String[len * 2]; + System.arraycopy(keys, 0, k, 0, len); + System.arraycopy(elms, 0, e, 0, len); + keys = k; + elms = e; + } + keys[n] = key; + elms[n] = value; + n++; + return null; + } + + public void clear() { + n = 0; + } + + public boolean isEntity (String aname) { return false; } + public boolean isNotation (String aname) { return false; } + public boolean isId (String aname) { return false; } + public boolean isIdref (String aname) { return false; } + public String getEntityPublicID (String aname) { return null; } + public String getEntitySystemID (String aname) { return null; } + public String getNotationName (String aname) { return null; } + public String getNotationPublicID (String aname) { return null; } + public String getNotationSystemID (String aname) { return null; } + + public synchronized String toString() { + StringBuffer sb = new StringBuffer(); + boolean f = true; + + sb.append("{ "); + for (Enumeration e = getAttributeNames(); e.hasMoreElements(); ) { + if (f) { f = false; } + else { sb.append(", "); } + String key = (String) e.nextElement(); + sb.append("" + key + '=' + getValue(key)); + } + sb.append(" }"); + return sb.toString(); + } + + /* + // for testing + public static void main(String[] args) throws Exception { + SAXAttributeMap d; + java.io.BufferedReader r; + 
java.util.StringTokenizer tok;
+        String op;
+
+        if (args.length > 1) {
+            d = new SAXAttributeMap(Integer.parseInt(args[0]));
+        } else {
+            d = new SAXAttributeMap();
+        }
+
+        System.out.println(
+            "Enter operations... op's are one of\n"+
+            "put \n"+
+            "get \n"+
+            "enum\n"+
+            "size\n"+
+            "quit\n");
+
+        r = new java.io.BufferedReader(
+            new java.io.InputStreamReader(System.in));
+        while (true) {
+            System.out.print("doyourworst> ");
+            tok = new java.util.StringTokenizer(r.readLine());
+            op = tok.nextToken();
+            if ("put".equals(op)) {
+                System.out.println("Value: " +
+                    d.put(tok.nextToken(), tok.nextToken()));
+            } else if ("get".equals(op)) {
+                System.out.println("Value: " + d.getValue(tok.nextToken()));
+            } else if ("enum".equals(op)) {
+                for (Enumeration e = d.getAttributeNames();
+                     e.hasMoreElements(); ) {
+                    System.out.println("" + e.nextElement() + " ");
+                }
+            } else if (op.startsWith("s")) {
+                System.out.println("Size: " + d.size());
+            } else if (op.startsWith("q")) {
+                break;
+            } else {
+                System.out.println("Unrecognized op: " + op);
+            }
+
+            System.out.println("Dictionary: " + d);
+            System.out.println("Size: " + d.size());
+            System.out.println();
+        }
+    }
+    */
+}
+
+/**
+ * Enumeration over the first n entries of a String array. The array is
+ * used as passed in, not copied.
+ */
+class SAXAttributeEnum implements Enumeration {
+    String[] objs;
+    int i = 0, n;
+
+    /**
+     * @param objs array to enumerate
+     * @param n    number of leading entries of objs to hand out
+     */
+    public SAXAttributeEnum(String[] objs, int n) {
+        this.objs = objs;
+        this.n = n;
+    }
+
+    /** Returns true while fewer than n entries have been handed out. */
+    public boolean hasMoreElements() {
+        return i < n;
+    }
+
+    /**
+     * Returns the next entry.
+     * @throws java.util.NoSuchElementException when all n entries have
+     *         already been returned
+     */
+    public Object nextElement() {
+        // Honour the Enumeration contract instead of reading past the
+        // logical end of the array.
+        if (i >= n) {
+            throw new java.util.NoSuchElementException();
+        }
+        return objs[i++];
+    }
+}
diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/TextImpl.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/TextImpl.java
new file mode 100644
index 00000000000..9df683f75a0
--- /dev/null
+++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/TextImpl.java
@@ -0,0 +1,112 @@
+/*
+ * $Id$
+ *
+ * Copyright 1997 Hewlett-Packard Company
+ *
+ * This file may be copied, modified and distributed only in
+ * accordance with the terms of the limited licence contained
+ * in the accompanying file
LICENSE.TXT. + */ + +// FIXME: check parameters reasonable [within bounds] + +package hplb.xml; + +import hplb.org.w3c.dom.*; + +/** + * Class whose instances represent PCDATA, comments, and PIs (processing + * instructions. + * @author Anders Kristensen + */ +public class TextImpl extends NodeImpl implements Text, Comment, PI { + protected String data; + protected String name; // only valid for PIs + + /** + * Construct new leaf node whose value is textual. + * @param type one of Node.PI, Node.COMMENT, and Node.TEXT. + * @param data the PCDATA, CDATA, comment, whatever + */ + public TextImpl(int type, String data) { + super(type); + this.data = data; + } + + // getData/setData common for the three interfaces + public String getData() { + return data; + } + public void setData(String arg) { + data = arg; + } + + // Text specific methods: + + public void append(String data) { + this.data = this.data + data; + } + + public void insert(int offset, String data) { + this.data = this.data.substring(0, offset) + + data + + this.data.substring(offset); + } + + public void delete(int offset, int count) { + this.data = this.data.substring(0, offset) + + this.data.substring(offset + count); + } + + public void replace(int offset, int count, String data) { + this.data = this.data.substring(0, offset) + + data + + this.data.substring(offset + count); + } + + public void splice(Element element, int offset, int count) { + if (offset <= 0) { + parent.insertBefore(element, this); + } else if (offset+count > data.length()) { + parent.insertAfter(element, this); + } else { + Node n; + n = new TextImpl(Node.TEXT, data.substring(offset, offset+count)); + element.insertBefore(n , null); + n = new TextImpl(Node.TEXT, data.substring(offset+count)); + parent.insertAfter(n, this); + data = data.substring(0, offset); + } + } + + // PI specific methods: + public String getName() { + return name; + } + public void setName(String arg) { + name = arg; + } + + protected String typeAsString() { + 
switch (type) { + case Node.PI: return "PI"; + case Node.COMMENT: return "COMMENT"; + case Node.TEXT: return "TEXT"; + default: return "UNKNOWN"; + } + } + + public String toDebugString() { + return typeAsString() + + (data == null ? "" : Utils.compact(data)); + } + + public String toString() { + switch (type) { + case Node.PI: return ""; + case Node.COMMENT: return ""; + case Node.TEXT: return data; + default: return "UNKNOWN"; + } + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Tokenizer.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Tokenizer.java new file mode 100644 index 00000000000..9f77289b04f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Tokenizer.java @@ -0,0 +1,690 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +/* + * FIXME: + * - use java.io.Reader and Unicode chars... + * - recognize PIs and CDATA + * - recognize PEs and CEs (optionally) + * - Do NOT map element and attr names to lower (or upper) case + */ + +package hplb.xml; + +import hplb.org.xml.sax.*; +import java.util.Dictionary; +import java.util.Hashtable; +import java.io.*; +import hplb.misc.ByteArray; +import java.net.URL; + +/** + * This is a hand-written lexical analyzer for XML/HTML Markup. + * The parser is simple, fast and quite robust. + * Element and attribute names are mapped to lower case. + * Comments are returned as (part of) PCDATA tokens. + * Markup elements within comments is not recognized as markup. + * + * @author Anders Kristensen + */ +public class Tokenizer implements hplb.org.xml.sax.Parser { + + /** The value of boolean attributes is this string. 
*/ + public static final String BOOLATTR = Atom.getAtom("BOOLATTR"); + + // FSM states: + static final int ST_START = 1; + static final int ST_TAG_LT = 3; + static final int ST_TAG_NAME = 4; + static final int ST_TAG_WS = 5; + static final int ST_EMPTY_TAG_SLASH = 6; + static final int ST_NAME = 7; + static final int ST_NAME_WS = 8; + static final int ST_EQ = 9; + static final int ST_VALUE = 10; + static final int ST_VALUE_QUOTED = 11; + static final int ST_PCDATA = 21; + static final int ST_COMMENT = 22; + + HandlerBase dfltHandler = new HandlerBase(); + EntityHandler entHandler = dfltHandler; + DocumentHandler docHandler = dfltHandler; + ErrorHandler errHandler = dfltHandler; + SAXAttributeMap attrs = new SAXAttributeMap(); + String sysID; + + protected Hashtable noCaseElms; + public boolean rcgnzWS = true; // is white space chars recognized as PCDATA + // even when preceeding tags? + public boolean rcgnzEntities = true; + public boolean rcgnzCDATA = true; + public boolean rcgnzComments = true; // + public boolean atomize = false; // make element and attr names atoms + + CharBuffer buf = new CharBuffer(); + boolean isStartTag = true; + /** + * Signals whether a non-empty element has any children. If not we + * must generate an artificial empty-string child [characters(buf, 0, 0)]. 
+ */ + boolean noChildren; + CharBuffer tagname = new CharBuffer(); + CharBuffer attrName = new CharBuffer(); + CharBuffer attrValue = new CharBuffer(); + Reader in; + + public final EntityManager entMngr = new EntityManager(this); + protected int state = ST_START; + protected int _line = 1; + protected int _column = 0; + public int line; // can be used in Handler callbacks + public int column; // can be used in Handler callbacks + protected int qchar; // <'> or <"> when parsing quoted attr values + // we recognize attribute name-value pairs for XML PI by setting + // the inXMLDecl flag and going to state ST_TAG_WS + boolean inXMLDecl = false; // see + + public Tokenizer() { + pos(); + } + + public void setEntityHandler(EntityHandler handler) { + entHandler = handler; + } + + public void setDocumentHandler(DocumentHandler handler) { + docHandler = handler; + } + + public void setErrorHandler(ErrorHandler handler) { + errHandler = handler; + } + + public void parse(String publicID, String sysID) throws Exception { + this.sysID = sysID; + parse(new URL(sysID).openStream()); + } + + public void parse(InputStream in) throws Exception + { + parse(new InputStreamReader(in)); + } + + public void parse(Reader in) throws Exception + { + this.in = in; + docHandler.startDocument(); + tokenize(); + docHandler.endDocument(); + } + + // invoked to remember current position + protected void pos() { + line = _line; + column = _column; + } + + public void ignoreCase(String elementName) { + if (noCaseElms == null) noCaseElms = new Hashtable(); + noCaseElms.put(elementName.toLowerCase(), elementName); + } + + public void rcgnzWS(boolean b) { + rcgnzWS = b; + } + + // invoked after doing any Handler callback - resets state + protected void toStart() { + state = ST_START; + buf.reset(); + tagname.reset(); + attrName.reset(); + attrValue.reset(); + attrs.clear(); + isStartTag = true; // until proven wrong + pos(); + } + + public void tokenize() throws Exception { + int c; + + while ((c 
= read()) != -1) { + switch (state) { + case ST_START: + switch (c) { + case '<': + state = ST_TAG_LT; + isStartTag = true; // until proven wrong + tagname.reset(); + break; + case ' ': case '\t': case '\r': case '\n': + if (!rcgnzWS) break; + // else fall through + default: + state = ST_PCDATA; + } + break; + + case ST_PCDATA: + if (c == '<') { + gotPCDATA(true); + state = ST_TAG_LT; + } + break; + + case ST_TAG_LT: + switch (c) { + case '/': + isStartTag = false; + state = ST_TAG_NAME; + break; + case '!': + c = read(); + if ((c == '-' && !rcgnzComments) || (c == '[' && !rcgnzCDATA)) { + state = ST_PCDATA; + break; + } + if (c == '-') state = ST_COMMENT; + else if (c == '[') parseCDATA(); + else { + // FIXME: shouldn't be delivered as PCDATA + warning("Bad markup " + buf); + state = ST_PCDATA; + } + break; + case '?': + parsePI(); + break; + case ' ': case '\t': case '\r': case '\n': + state = ST_TAG_WS; + break; + default: + tagname.write(c); + state = ST_TAG_NAME; + } + break; + + case ST_TAG_NAME: + switch (c) { + case ' ': case '\t': case '\r': case '\n': + state = ST_TAG_WS; + break; + case '/': state = ST_EMPTY_TAG_SLASH; break; + case '>': gotTag(false); break; + default: tagname.write(c); + } + break; + + case ST_TAG_WS: + switch (c) { + case ' ': case '\t': case '\r': case '\n': break; + case '/': state = ST_EMPTY_TAG_SLASH; break; + case '>': gotTag(false); break; + case '?': + if (inXMLDecl) { + if ((c = read()) != '>') { + errHandler.warning("XML PI not terminated properly", + sysID, _line, _column); + //err_continue("XML PI not terminated properly"); + } + //handler.gotXMLDecl(attrs); // FIXME(?) + toStart(); + break; + } + // NOTE: if !inXMLDecl we fall through to default case + default: + if (!isStartTag) { + // bit of a hack this... + errHandler.warning("Malformed tag: "+buf, sysID, _line, _column); + //err_continue("Malformed tag: "+buf); + if (c == '<') { + gotPCDATA(true); + state = ST_TAG_LT; + } else { + // we get here e.g. 
if there's an end tag with attributes + state = ST_PCDATA; + } + } else { + // FIXME: this accepts way too many first chars for attr name + attrName.write(c); + state = ST_NAME; + } + } + break; + + case ST_EMPTY_TAG_SLASH: + if (c == '>') { + //tagtype = TAG_EMPTY; + gotTag(true); + break; + } else { + // ERROR !? - can't throw Exception here - we go to next tag... + state = ST_PCDATA; + } + break; + + case ST_NAME: + switch (c) { + case ' ': case '\t': case '\r': case '\n': + if (attrName.size() > 0) { + state = ST_NAME_WS; + } + break; + case '>': + if (attrName.size() > 0) gotAttr(true); + gotTag(false); + break; + case '=': + state = ST_EQ; + break; + default: + if (isCtlOrTspecial(c)) { + state = ST_PCDATA; + } else { + attrName.write(c); + } + } + break; + + case ST_NAME_WS: // white-space between name and '=' + switch (c) { + case ' ': case '\t': case '\r': case '\n': break; + case '=': state = ST_EQ; break; + case '>': gotAttr(true); gotTag(false); break; + default: + if (isNameChar(c)) { + gotAttr(true); + attrName.write(c); + state = ST_TAG_WS; + } else { + state = ST_PCDATA; + } + } + break; + + case ST_EQ: // white-space between '=' and value + switch (c) { + case ' ': case '\t': case '\r': case '\n': break; + case '"': qchar = '"'; state = ST_VALUE_QUOTED; break; + case '\'': qchar = '\''; state = ST_VALUE_QUOTED; break; + default: + if (isCtlOrTspecial(c)) { + state = ST_PCDATA; + } else { + attrValue.write(c); + state = ST_VALUE; + } + } + break; + + case ST_VALUE: + switch (c) { + case ' ': case '\t': case '\r': case '\n': + gotAttr(false); + state = ST_TAG_WS; + break; + case '>': + gotAttr(false); + gotTag(false); + break; + case '/': + gotAttr(false); + state = ST_EMPTY_TAG_SLASH; + break; + default: + if (isCtlOrTspecial(c)) { + state = ST_PCDATA; + } else { + attrValue.write(c); + } + } + break; + + case ST_VALUE_QUOTED: + if (c == qchar) { + gotAttr(false); + state = ST_TAG_WS; + } else { + attrValue.write(c); + } + break; + + case 
ST_COMMENT: + // we've seen "...' + gotComment(); + //while (read_ex() != '>') ; + //state = ST_PCDATA; + } catch (EmptyInputStream ex) { + gotPCDATA(false); + break; + } + } + } + /* TODO: catch EmptyInputStream exception only here! + } catch (EmptyInputStream ex) { + err_continue("EOF while parsing " + token[state]); + } + */ + + // input stream ended - return rest, if any, as PCDATA + if (buf.size() > 0) { + gotPCDATA(false); + buf.reset(); + } + } + + // counts lines and columns - used in error reporting + // a line can be a single \r or \n or it can be \r\n - we handle them all + int cc; // last char read + public final int read() throws IOException { + int c = in.read(); + if (c != -1) { + buf.write(c); + + switch (c) { + case '\r': _line++; _column = 0; break; + case '\n': + if (cc != '\r') _line++; + _column = 0; + break; + default: + _column++; + } + cc = c; + } + return c; + } + + public final int read_ex() throws IOException, EmptyInputStream { + int c = read(); + if (c == -1) throw new EmptyInputStream(); + return c; + } + + // HTML allows boolean attributes - attributes without a + // value, or rather an implicit value which is the same as the name. + protected final void gotAttr(boolean isBoolean) throws Exception { + String nm = attrName.toString(); + if (atomize) nm = Atom.getAtom(nm); + String val = isBoolean ? BOOLATTR : + (rcgnzEntities ? 
entMngr.entityDecode(attrValue) : + attrValue).toString(); + attrName.reset(); + attrValue.reset(); + attrs.put(nm, val); + } + + protected void gotTag(boolean isEmpty) throws Exception { + String nm = tagname.toString(); + String nm_lc = nm.toLowerCase(); + if (noCaseElms != null && noCaseElms.get(nm_lc) != null) { + nm = nm_lc; + keysToLowerCase(attrs); + } + if (atomize) nm = Atom.getAtom(nm); + if (isStartTag) { + docHandler.startElement(nm, attrs); + //handler.gotSTag(nm, isEmpty, attrs, getBuffer()); + if (isEmpty) docHandler.endElement(nm); + noChildren = !isEmpty; + } else { + if (noChildren) { + docHandler.characters(buf.getCharArray(), 0, 0); + noChildren = false; + } + docHandler.endElement(nm); + //handler.gotETag(nm, getBuffer()); + } + toStart(); + } + + public final void keysToLowerCase(SAXAttributeMap attrs) { + for (int i = 0; i < attrs.n; i++) { + attrs.keys[i] = attrs.keys[i].toLowerCase(); + if (atomize) attrs.keys[i] = Atom.getAtom(attrs.keys[i]); + } + } + + // toomuch true iff we read a '<' of the next token + protected void gotPCDATA(boolean toomuch) throws Exception { + noChildren = false; + if (toomuch) { + buf.setLength(buf.size() - 1); + } + CharBuffer buf1 = rcgnzEntities ? entMngr.entityDecode(buf) : buf; + docHandler.characters(buf1.getCharArray(), 0, buf1.size()); + //handler.gotText(getBuffer()); + toStart(); + if (toomuch) { + buf.write('<'); + column--; + } + } + + // XXX: should pass the comment on as docHandler.ignorable() ?? 
+ protected void gotComment() throws IOException, EmptyInputStream { + //toStart(); // so an unexpected EOF causes rest to be returned as PCDATA + while (read_ex() != '>') ; + toStart(); + } + + // Processing Instruction + protected void parsePI() throws Exception { + int i; + String target; + + noChildren = false; + inXMLDecl = false; + i = buf.size(); + try { + while (!isWS(read_ex())) ; + target = buf.toString(); + target = target.substring(i, target.length() - 1); + + if ("XML".equals(target)) { + inXMLDecl = true; + state = ST_TAG_WS; + return; + } + + while (isWS(read_ex())) ; + i = buf.size() - 1; + while (true) { + while (read_ex() != '?') ; + if (read_ex() == '>') { + String s = buf.toString(); + docHandler.processingInstruction( + Atom.getAtom(target), s.substring(i, s.length()-2)); + //handler.gotPI(Atom.getAtom(target), + // s.substring(i, s.length()-2)); + break; + } + } + } catch (EmptyInputStream ex) { + gotPCDATA(false); + errHandler.warning("EOF while parsing PI", sysID, _line, _column); + //err_continue("EOF while parsing PI"); + } + toStart(); + } + + // CDATA section + // XXX: should contents be amalgamated with surrounding PCDATA? + protected void parseCDATA() throws Exception { + // we've seen "') ; + docHandler.characters(buf.getCharArray(), i1, buf.size()-3-i1); + } else { + warning("Bad CDATA markup"); + state = ST_PCDATA; + } + } catch (EmptyInputStream ex) { + warning("EOF while parsing CDATA section"); + gotPCDATA(false); + } + toStart(); + } + + public boolean isWS(int c) { + switch (c) { + case ' ': case '\t': case '\r': case '\n': return true; + default: return false; + } + } + + /** + * Returns true if c is either an ascii control character or + * a tspecial according to the HTTP specification. 
+ */ + // private static final boolean[] isCtlOrTSpecial = new boolean[] +// { +// /* 0 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 14 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 28 */ true , true , true , true , true , false, true , false, false, false, false, false, true , true , +// /* 42 */ false, false, true , false, false, true , false, false, false, false, false, false, false, false, +// /* 56 */ false, false, true , true , true , true , true , true , true , false, false, false, false, false, +// /* 70 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 84 */ false, false, false, false, false, false, false, true , true , true , false, false, false, false, +// /* 98 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 112 */ false, false, false, false, false, false, false, false, false, false, false, true , false, true , +// /* 126 */ false, true , false, false, false, false, false, false, false, false, false, false, false, false, +// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 210 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 238 */ false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 252 */ false, false, false, false +// }; + + public static final boolean isCtlOrTspecial(int c) { + switch (c) { + // control characters (0-31 and 127): + case 0: case 1: case 2: case 3: case 4: case 5: + case 6: case 7: case 8: case 9: case 10: case 11: + case 12: case 13: case 14: case 15: case 16: case 17: + case 18: case 19: case 20: case 21: case 22: case 23: + case 24: case 25: case 26: case 27: case 28: case 29: + case 30: case 31: case 127: + + // tspecials: + case '(': case ')': case '<': case '>': case '@': + case ',': case ';': case ':': case '\\': case '"': + case '/': case '[': case ']': case '?': case '=': + case '{': case '}': case ' ': // case '\t': + return true; + + default: + return false; + } + } + +/* public static void main(String[]) + { + System.out.println("private static final boolean[] isCtlOrTSpecial = \n{"); // bzw. isNameChar + for(int i=0; i<256; i++) + { + if(i>0) + System.out.print(", "); + if(i % 14 == 0) + { + System.out.print("\n/* " + i + " *" + "/ "); + } + if(Tokenizer.isCtlOrTspecial(i)) // bzw. isNameChar(i) + { + System.out.print("true "); + } + else + { + System.out.print("false"); + } + + + } + System.out.print("};\n\n"); + } + */ + +// public static final boolean isCtlOrTspecial(int c) +// { +// return (c < 256 ? 
isCtlOrTSpecial[c] : false); +// } +// +// private static final boolean[] isNameChar = +// { +// /* 0 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 14 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 28 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 42 */ false, false, false, true , true , false, true , true , true , true , true , true , true , true , +// /* 56 */ true , true , false, false, false, false, false, false, false, true , true , true , true , true , +// /* 70 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 84 */ true , true , true , true , true , true , true , false, false, false, false, true , false, true , +// /* 98 */ true , true , true , true , true , true , true , true , true , true , true , true , true , true , +// /* 112 */ true , true , true , true , true , true , true , true , true , true , true , false, false, false, +// /* 126 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 140 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 154 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 168 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 182 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 196 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 210 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 224 */ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 238 
*/ false, false, false, false, false, false, false, false, false, false, false, false, false, false, +// /* 252 */ false, false, false, false +// }; +// public static final boolean isNameChar(int c) +// { +// return (c < 256 ? isNameChar[c] : false); +// } +// + /* + // I don't think this is a very standard definition of what can + // go into tag and attribute names.*/ + public static final boolean isNameChar(int c) { + return ('a' <= c && c <= 'z') || + ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9') || + c == '.' || c == '-' || c == '_'; + } + + + + protected final void warning(String s) throws Exception { + errHandler.warning(s, sysID, _line, _column); + } + + protected final void fatal(String s) throws Exception { + errHandler.fatal(s, sysID, _line, _column); + } +} + +class EmptyInputStream extends Exception { + EmptyInputStream() {} +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Utils.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Utils.java new file mode 100644 index 00000000000..66eda2511f6 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/Utils.java @@ -0,0 +1,98 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml; + +import hplb.org.w3c.dom.*; +import java.io.*; +import java.util.*; + +public class Utils { + /** Pretty-print elm. 
*/ + public static void pp(Node node, PrintStream out) { + pp(node, out, 0); + } + + public static void pp(Node node, PrintStream out, int indent) { + indent(out, indent); + out.println("" + node); + indent += 2; + + NodeIterator iter = node.getChildNodes(); + Node child; + while ((child = iter.toNext()) != null) { + pp(child, out, indent); + } + } + + public static String compact(String s) { + if (s.length() < 18) { + return "[" + noCRLF(s) + "]"; + } else { + return "[" + noCRLF(s.substring(0, 7)) + "..." + + noCRLF(s.substring(s.length() - 7)) + "]"; + } + } + + public static String noCRLF(String s) { + return s.replace('\r', ' ').replace('\n', ' '); + } + + public static void indent(PrintStream out, int indent) { + for (int i = 0; i < indent; i++) out.print(' '); + } + + /** + * Encode an XML attribute value. Changes <"> to "&quote;". + */ + public static String encAttrVal(String val) { + if (val.indexOf('"') > -1) { + StringBuffer sbuf = new StringBuffer(); + int offset = 0, i; + while ((i = val.indexOf('"', offset)) > -1) { + sbuf.append(val.substring(offset, i)); + sbuf.append(""e;"); + offset = i+1; + } + sbuf.append(val.substring(offset)); + return sbuf.toString(); + } + return val; + } + + /** + * Encode the specified String as XML PCDATA, i.e. "<" is + * encoded as "&lt;" and "&" is encoded as "&amp;". 
+ */ + public static String encPCDATA(String s) { + if (s.indexOf('<') > -1 || s.indexOf('&') > -1) { + StringBuffer sbuf = new StringBuffer(); + int offset = 0; + int i = s.indexOf('<', offset); + int j = s.indexOf('&', offset); + while (i > -1 || j > -1) { + if (i > j) { + sbuf.append(s.substring(offset, i)); + sbuf.append(""e;"); + offset = i+1; + i = s.indexOf('<', offset); + } else { + sbuf.append(s.substring(offset, j)); + sbuf.append(""e;"); + offset = j+1; + j = s.indexOf('&', offset); + } + } + sbuf.append(s.substring(offset)); + return sbuf.toString(); + } + return s; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlObserver.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlObserver.java new file mode 100644 index 00000000000..ab8cbdbac5f --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlObserver.java @@ -0,0 +1,40 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml.util; + +import java.net.URL; + +/** + * A callback interface used in conjunction with UrlScanner. Allows actions + * to be taken whenever the scanner finds a URL in an HTML document. The + * scanner knows about most HTML 4.0 elements which can contain URLs. + * Can be used, for example, to implement robot code which crawls a hypertext + * graph. This interface is similar to Jeff Poskanzer's Acme.HtmlObserver. + * + * @see HtmlScanner + * @author Anders Kristensen + */ +public interface HtmlObserver { + /** Invoked when the scanner finds an <a href=""> URL. */ + public void gotAHref(String urlStr, URL contextUrl, Object data); + + /** Invoked when the scanner finds an <img src=""> URL. 
*/ + public void gotImgSrc(String urlStr, URL contextUrl, Object data); + + /** Invoked when the scanner finds a <base href=""> URL. */ + public void gotBaseHref(String urlStr, URL contextUrl, Object data ); + + /** Invoked when the scanner finds a <area href=""> URL. */ + public void gotAreaHref(String urlStr, URL contextUrl, Object data ); + + /** Invoked when the scanner finds an <frame src=""> URL. */ + public void gotFrameSrc(String urlStr, URL contextUrl, Object data ); +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlScanner.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlScanner.java new file mode 100644 index 00000000000..c1e9a4f142d --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/HtmlScanner.java @@ -0,0 +1,177 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml.util; + +import hplb.org.xml.sax.HandlerBase; +import hplb.org.xml.sax.AttributeMap; +import hplb.org.xml.sax.XmlException; +import hplb.org.xml.sax.ErrorHandler; +import hplb.org.xml.sax.EntityHandler; +import hplb.org.xml.sax.DocumentHandler; +import hplb.xml.*; +import java.net.*; +import java.io.*; + +/** + * The HtmlScanner parses an HTML document for elements containing links. + * For each link found it will invoke a client-provided callback method. + * It knows about most HTML4.0 links and also knows about the <base>. + * + *

      For an example use see UrlScanner. + * + * @see HtmlObserver + * @see UrlScanner + * @author Anders Kristensen + */ +public class HtmlScanner extends HandlerBase { + HtmlObserver observer; + URL contextURL; + Object data; + Tokenizer tok; + Reader in; + + /** + * Parse the input on the specified stream as if it was HTML and + * invoke the provided observer as links are encountered. + * @param url the URL to parse for links + * @param observer the callback object + * @param data client-specific data; this is passed back to the + * client in callbacks; this scanner doesn't use it + * @throws Exception see hplb.org.xml.sax.Parser.parse() + * @see hplb.org.xml.sax.Parser.parse + */ + public HtmlScanner(URL url, HtmlObserver observer ) throws Exception { + this(new BufferedReader(new InputStreamReader(url.openStream())), url, observer); + } + + /** + * Parse the input on the specified stream as if it was HTML and + * invoke the provided observer as links are encountered. + * @param in the input stream + * @param url the URL corresponding to this document + * @param observer the callback object + * @throws Exception see hplb.org.xml.sax.Parser.parse() + * @see hplb.org.xml.sax.Parser.parse + * @deprecated + */ + public HtmlScanner(InputStream in, URL url, HtmlObserver observer) + throws Exception + { + this(new BufferedReader(new InputStreamReader(in)), url, observer, null); + } + + /** + * Parse the input on the specified stream as if it was HTML and + * invoke the provided observer as links are encountered. 
+ * @param in the Reader + * @param url the URL corresponding to this document + * @param observer the callback object + * @throws Exception see hplb.org.xml.sax.Parser.parse() + * @see hplb.org.xml.sax.Parser.parse + */ + public HtmlScanner(Reader in, URL url, HtmlObserver observer) + throws Exception + { + this(in, url, observer, null); + } + + /** + * Parse the input on the specified stream as if it was HTML and + * invoke the provided observer as links are encountered. + * Although not deprecated, this method should not be used. Use HtmlScanner(Reader...) instead + * @deprecated + */ + public HtmlScanner(InputStream in, URL url, HtmlObserver observer, Object data) + throws Exception + { + this(new BufferedReader(new InputStreamReader(in)), url, observer, data); + } + + /** + * Parse the input on the specified stream as if it was HTML and + * invoke the provided observer as links are encountered. + * @param in the input stream + * @param url the URL corresponding to this document + * @param observer the callback object + * @param data client-specific data; this is passed back to the + * client in callbacks; this scanner doesn't use it + * @throws Exception see hplb.org.xml.sax.Parser.parse() + * @see hplb.org.xml.sax.Parser.parse + */ + public HtmlScanner(Reader in, URL url, HtmlObserver observer, Object data) + throws Exception + { + this.in = in; + this.observer = observer; + this.contextURL = url; + this.data = data; + tok = new Tokenizer(); + setDocumentHandler(this); + HTML.applyHacks(tok); + tok.rcgnzEntities = false; + tok.rcgnzCDATA = false; + tok.atomize = true; + } + + public void setDocumentHandler(DocumentHandler doc) + { + tok.setDocumentHandler(doc); + } + + public void setEntityHandler(EntityHandler ent) + { + tok.setEntityHandler(ent); + } + + public void setErrorHandler(ErrorHandler err) + { + tok.setErrorHandler(err); + } + + public void parse() throws Exception + { + tok.parse(in); + } + + public void startElement(String name, AttributeMap 
attributes) { + String val; + + if (name == HTML.A) { + if ((val = attributes.getValue("href")) != null) { + observer.gotAHref(val, contextURL, data); + } + } else if (name == HTML.IMG) { + if ((val = attributes.getValue("src")) != null) { + observer.gotImgSrc(val, contextURL, data); + } + } else if (name == HTML.BASE) { + if ((val = attributes.getValue("href")) != null) { + observer.gotBaseHref(val, contextURL, data); + if (contextURL != null) { + try { + contextURL = new URL(contextURL, val); + } catch (MalformedURLException ex) { + System.err.println("Bad URL: " + val + "."); + System.err.println(ex.getMessage()); + } + } + } + } else if (name == HTML.AREA) { + if ((val = attributes.getValue("href")) != null) { + observer.gotAreaHref(val, contextURL, data); + } + } else if (name == HTML.FRAME) { + if ((val = attributes.getValue("src")) != null) { + observer.gotFrameSrc(val, contextURL, data); + } + } + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/NormalizeHtml.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/NormalizeHtml.java new file mode 100644 index 00000000000..b590edab365 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/NormalizeHtml.java @@ -0,0 +1,143 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml.util; + +import hplb.xml.*; +import hplb.org.w3c.dom.*; +import java.io.*; + +/** + * Reads an HTML document on System.in, "normalizes" it in a couple of ways, and + * writes it to System.out. In the process HTML4.0 element names are converted to + * upper case, attribute names are converted to lower case, all attribute values + * gets enclosed in double quotes, all non-empty elements with an optional and + * omitted end tag are given an end tag. 
+ * + * @author Anders Kristensen + */ +public class NormalizeHtml { + static PrintStream out = System.out; + + public static void usage() { + System.exit(1); + } + + public static void main(String[] args) throws Exception { + /* + Tokenizer tok = new Tokenizer(); + tok.setDocumentHandler(new NormalizeHtml()); + HTML.applyHacks(tok); + //tok.rcgnzEntities = false; + tok.rcgnzCDATA = false; + tok.atomize = true; + tok.parse(System.in); + */ + HtmlXmlParser parser = new HtmlXmlParser(); + Tokenizer tok = parser.getTokenizer(); + tok.rcgnzEntities = false; + tok.rcgnzCDATA = false; + tok.rcgnzComments = false; + tok.atomize = true; + print(parser.parse(System.in)); + } + + public static void print(Document doc) { + //print(doc.getDocumentElement()); + NodeIterator iter = doc.getChildNodes(); + while (iter.toNext() != null) { + printNode(iter.getCurrent()); + } + } + + public static void printNode(Node node) { + if (node instanceof Document) print((Document) node); + else if (node instanceof Element) print((Element) node); + else if (node instanceof Text) print((Text) node); + else System.err.println("Error: non-text, non-element node ignored."); + } + + public static void print(Text text) { + //out.print(encodeText(text.getData(), false)); + out.print(text.getData()); + } + + public static void print(Element elm) { + String tagName = elm.getTagName(); + AttributeList attrs = elm.attributes(); + boolean isHtmlElm = isHtmlElm(tagName); + boolean isEmpty = (elm.getFirstChild() == null); + boolean isHtmlEmptyElm = + (tagName == HTML.AREA + || tagName == HTML.BASE + || tagName == HTML.BR + || tagName == HTML.COL + || tagName == HTML.FRAME + || tagName == HTML.HR + || tagName == HTML.IMG + || tagName == HTML.LINK + || tagName == HTML.META + || tagName == HTML.PARAM); + + if (isHtmlElm) tagName = tagName.toUpperCase(); + + // print start tag and attribute name-value pairs + out.print("<" + tagName); + int len = attrs.getLength(); + for (int i = 0; i < len; i++) { + 
print(attrs.item(i), isHtmlElm); + } + if (isEmpty && !isHtmlEmptyElm) out.print("/"); + out.print(">"); + if (isEmpty) return; + + // print content + NodeIterator iter = elm.getChildNodes(); + while (iter.toNext() != null) { + printNode(iter.getCurrent()); + } + + // print end tag + out.print(""); + } + + public static void print(Attribute attr, boolean toLower) { + String a = attr.getName(); + out.print(" " + (toLower ? a.toLowerCase() : a) + + "=\"" + encodeText(attr.toString(), true) +'"'); + } + + public static String encodeText(String s, boolean attr) { + StringBuffer sb = new StringBuffer(); + int ch, len = s.length(); + + for (int i = 0; i < len; i++) { + ch = s.charAt(i); + if (ch == '"') sb.append("""); + /* cause we don't recognize markup within PCDATA and attr values + else if (ch == '&') sb.append("&"); + else if (!attr && ch == '<') sb.append("<"); + else if (!attr && ch == '>') sb.append(">"); + else if ((" \r\n\t".indexOf((char) ch) != -1) + && (ch <= 31 || ch >= 127)) sb.append("&#"+ch+";"); + */ + else sb.append((char) ch); + } + return sb.toString(); + } + + public static boolean isHtmlElm(String tagName) { + int len = HTML.elements.length; + for (int i = 0; i < len; i++) { + if (tagName == HTML.elements[i]) return true; + } + return false; + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/RmMarkup.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/RmMarkup.java new file mode 100644 index 00000000000..dcb64434834 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/RmMarkup.java @@ -0,0 +1,32 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. 
+ */ + +package hplb.xml.util; + +import hplb.xml.Tokenizer; +import hplb.org.xml.sax.*; +import java.io.*; + +public class RmMarkup extends HandlerBase { + static Tokenizer tok; + static Writer out = new OutputStreamWriter(System.out); + + public void characters (char ch[], int start, int length) throws IOException { + out.write(ch, start, length); + } + + public static void main(String[] args) throws Exception { + tok = new Tokenizer(); + tok.setDocumentHandler(new RmMarkup()); + TokTest.args(args, tok); + tok.parse(System.in); + out.flush(); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/TokTest.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/TokTest.java new file mode 100644 index 00000000000..07d34574cbf --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/TokTest.java @@ -0,0 +1,107 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml.util; + +import hplb.xml.*; +import hplb.org.xml.sax.*; +import java.io.*; + +/** + * Test of Tokenizer. 
+ * Usage: TokTest [-w] < html-file + * @author Anders Kristensen + */ +public class TokTest implements DocumentHandler { + static Tokenizer tok; + static PrintStream out = System.out; + int n = 60; + int n2 = (n-3)/2; + + public void startDocument () { + out.println("START DOC"); + } + + public void endDocument () { + out.println("END DOC"); + } + + public void doctype (String name, String publicID, String systemID) { + out.println("DOC TYPE " + name + ", " + publicID + ", " + systemID); + } + + public void startElement (String name, AttributeMap attributes) { + out.println("START " + name + ", " + attributes); + } + + public void endElement (String name) { + out.println("END " + name); + } + + public void characters (char ch[], int start, int length) { + //out.println("Chars: " + new String(ch, start, length)); + out.println("Chars: " + compact(new String(ch, start, length))); + } + + public void ignorable (char ch[], int start, int length) { + out.println("Ignorable: " + compact(new String(ch, start, length))); + } + + public void processingInstruction (String name, String remainder) { + out.println("PI: " + name + ", " + compact(remainder)); + } + + // Returns short description of PCDATA argument. + public String compact(char[] buf) { + return compact(new String(buf)); + } + + public String compact(String s) { + if (s.length() < n) { + return "[" + noCRLF(s) + "]"; + } else { + return "[" + noCRLF(s.substring(0, n2)) + "..." + + noCRLF(s.substring(s.length() - n2)) + "]"; + } + } + + private static String noCRLF(String s) { + return s.replace('\r', ' ').replace('\n', ' '); + } + + /** + * Process options in 'args' vector and apply to the supplied Tokenizer. + */ + public static void args(String[] args, Tokenizer tok) { + // case mappoing: tags/attr names/attr values, upper/lower/depends... 
+ for (int i = 0; i < args.length; i++) { + if ("-w".equals(args[i])) { + tok.rcgnzWS = true; + } else if ("-d".equals(args[i])) { + tok.rcgnzComments = false; + } else if ("-c".equals(args[i])) { + tok.rcgnzCDATA = false; + } else if ("-e".equals(args[i])) { + tok.rcgnzEntities = false; + } else if ("-h".equals(args[i])) { + HTML.applyHacks(tok); + } else { + System.err.println("Unrecognized option: " + args[i]); + } + } + } + + public static void main(String[] args) throws Exception { + tok = new Tokenizer(); + tok.setDocumentHandler(new TokTest()); + args(args, tok); + tok.parse(System.in); + } +} diff --git a/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/UrlScanner.java b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/UrlScanner.java new file mode 100644 index 00000000000..b86f8f5e078 --- /dev/null +++ b/sandbox/contributions/webcrawler-LARM/src/hplb/xml/util/UrlScanner.java @@ -0,0 +1,166 @@ +/* + * $Id$ + * + * Copyright 1997 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + * + * Copyright 1998 Hewlett-Packard Company + * + * This file may be copied, modified and distributed only in + * accordance with the terms of the limited licence contained + * in the accompanying file LICENSE.TXT. + */ + +package hplb.xml.util; + +import java.net.*; +import java.io.*; +import java.util.Date; + +/** + * Scans an HTML Web object for embedded link and prints them on stdout. + * Usage: + *

      + *  java hplb.xml.util.UrlScanner [-t] [-v] [-h proxy-host] [-p proxy-port] URL
      + *  where -t means test validity of embedded URLs and
      + *        -v means be verbose
      + * 
      + * + * @author Anders Kristensen + */ +public class UrlScanner implements HtmlObserver { + + // should use getenv and/or getProperty for these: + static String proxyHost; + static String proxyPort; + static boolean test; + static boolean verbose; + + public static void usage() { + PrintStream out = System.out; + out.println("Usage: UrlScan [-v] [-t] "); + out.println("Extracts URLs from System.in and writes them on stdout."); + out.println(" -v verbose mode"); + out.println(" -t test links (using HTTP HEAD requests)"); + + System.exit(1); + } + + public static void main(String[] args) throws Exception { + URL url = null; + //HttpClient cl; + //HttpResponse res = null; + + try { + url = new URL(args[args.length-1]); + for (int i = 0; i < args.length - 1; i++) { + if ("-t".equals(args[i])) { + test = true; + } else if ("-v".equals(args[i])) { + verbose = true; + } else if ("-h".equals(args[i])) { + proxyHost = args[++i]; + } else if ("-p".equals(args[i])) { + proxyPort = args[++i]; + } else { + usage(); + } + } + } catch (Exception e) { + usage(); + } + + //cl = new HttpClient(url); + if (proxyHost != null) { + System.getProperties().put("http.proxyHost", proxyHost); + } + if (proxyPort != null) { + System.getProperties().put("http.proxyPort", proxyPort); + } + /* + try { + res = cl.get(); + } catch (UnknownHostException e) { + panic("Couldn't connect to host " + e.getMessage()); + } catch (IOException e) { + panic("I/O exception"); + } catch (Exception e) { + panic("Error: " + e.getMessage()); + } + */ + + new HtmlScanner(url, new UrlScanner()); + } + + public static void panic(String reason) { + System.out.println(reason); + System.exit(1); + } + + public void gotAHref(String urlStr, URL contextUrl, Object data) { + try { + URL url = new URL(contextUrl, urlStr); + System.out.print(url.toExternalForm()); + if (test) testLink(url); + System.out.println(); + } catch (Exception e) { + if (verbose) e.printStackTrace(); + } + } + + /** Invoked when the scanner 
finds an <img src=""> URL. */ + public void gotImgSrc(String urlStr, URL contextUrl, Object data) { + try { + URL url = new URL(contextUrl, urlStr); + System.out.print(url.toExternalForm()); + if (test) testLink(url); + System.out.println(); + } catch (Exception e) { + if (verbose) e.printStackTrace(); + } + } + + /** Invoked when the scanner finds a <base href=""> URL. */ + public void gotBaseHref(String urlStr, URL contextUrl, Object data ) { + if (verbose) { + System.out.println("gotBASEHREF: " + urlStr); + System.out.println(" " + contextUrl); + } + } + + /** Invoked when the scanner finds a <area href=""> URL. */ + public void gotAreaHref(String urlStr, URL contextUrl, Object data ) { + if (verbose) { + System.out.println("gotAreaHref: " + urlStr); + System.out.println(" " + contextUrl); + } + } + + /** Invoked when the scanner finds an <frame src=""> URL. */ + public void gotFrameSrc(String urlStr, URL contextUrl, Object data ) { + try { + URL url = new URL(contextUrl, urlStr); + System.out.print(url.toExternalForm()); + if (test) testLink(url); + System.out.println(); + } catch (Exception e) { + if (verbose) e.printStackTrace(); + } + } + + public static void testLink(URL url) throws IOException { + throw new IOException("Not implemented"); + /* + HttpClient cl = new HttpClient(url); + if (proxyHost != null) + cl.setProxyAddr(proxyHost, proxyPort); + HttpResponse res = cl.head(); + + System.out.print(" " + res.getStatusCode()); + if (verbose) System.out.print(" " + res.getReason()); + */ + } +}