mirror of
synced 2025-03-05 07:49:22 +00:00
Initial revision
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150751 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Executable file
Executable file
@ -0,0 +1,23 @@
# Full rebuild: wipe earlier output, unpack the bundled HTTPClient archive
# into build/, copy the project sources on top of it and compile everything
# into classes/.
echo cleaning
# -f: a directory that does not exist yet (fresh checkout) is not an error
rm -rf build classes cachingqueue logs
echo making build directory
mkdir -p build classes
echo extracting http client
# Extract inside a subshell: if the cd fails, the rest of the script can
# never end up running in the wrong directory. The verbose file listing of
# jar is discarded (/dev/null, not /dev/nul).
(cd build && jar xvf ../lib/HTTPClient.zip >/dev/null) || exit 1
cp -r src/* build || exit 1
echo compiling
javac -g -d classes -sourcepath build build/HTTPClient/*.java
javac -g -classpath ./lib/jakarta-oro-2.0.5.jar -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java
Executable file
Executable file
@ -0,0 +1,5 @@
# Remove the compiler output directories.
# -f keeps the script silent and successful when a directory is already
# gone, so running it twice in a row is harmless.
rm -rf build classes
Executable file
Executable file
@ -0,0 +1,4 @@
# Remove the run-time data directories (logs and the caching queue).
# -f keeps the script silent and successful when a directory is already
# gone, so running it twice in a row is harmless.
rm -rf logs cachingqueue
Executable file
Executable file
@ -0,0 +1,23 @@
# Rebuild variant that does not unpack the HTTPClient archive: wipe earlier
# output, copy the project sources into build/ and compile the entry point.
echo cleaning
# -f: a directory that does not exist yet (fresh checkout) is not an error
rm -rf build classes cachingqueue logs
echo making build directory
mkdir -p build classes
# The extraction step is disabled; it is kept here for reference. It runs in
# a subshell so that re-enabling it cannot change the script's directory.
#echo extracting http client
#(cd build && jar xvf ../lib/HTTPClient.zip >/dev/null)
cp -r src/* build
echo compiling
#javac -g -d classes -sourcepath build build/HTTPClient/*.java
# NOTE(review): no -classpath is passed here, although FetcherMain imports
# org.apache.oro; presumably the CLASSPATH environment variable is expected
# to provide jakarta-oro - confirm before relying on this script.
javac -g -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java
Executable file
Executable file
@ -0,0 +1,4 @@
# Start a crawl with an empty logs directory.
rm -rf logs
mkdir -p logs
# -Xmx expects a k/m/g size suffix; "400mb" is rejected by the JVM.
# The -restrictto expression is single-quoted so the shell neither
# glob-expands it nor strips the backslashes that escape the dots.
java -server -Xmx400m -classpath classes:lib/jakarta-oro-2.0.5.jar de.lanlab.larm.fetcher.FetcherMain -start http://www.cis.uni-muenchen.de/ -restrictto 'http://[^/]*\.uni-muenchen\.de.*' -threads 15
@ -0,0 +1,278 @@
* @(#)ContentEncodingModule.java 0.3-3 06/05/2001
* This file is part of the HTTPClient package
* Copyright (C) 1996-2001 Ronald Tschalär
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307, USA
* For questions, suggestions, bug-reports, enhancement-requests etc.
* I may be contacted at:
* ronald@innovation.ch
* The HTTPClient's home page is located at:
* http://www.innovation.ch/java/HTTPClient/
package HTTPClient;
import java.io.IOException;
import java.util.Vector;
import java.util.zip.InflaterInputStream;
import java.util.zip.GZIPInputStream;
* This module handles the Content-Encoding response header. It currently
* handles the "gzip", "deflate", "compress" and "identity" tokens.
* @author Ronald Tschalär
* @created 29. Dezember 2001
* @version 0.3-3 06/05/2001
public class ContentEncodingModule implements HTTPClientModule
// Methods
* Invoked by the HTTPClient.
* @param req Description of the Parameter
* @param resp Description of the Parameter
* @return Description of the Return Value
* @exception ModuleException Description of the Exception
public int requestHandler(Request req, Response[] resp)
throws ModuleException
// parse Accept-Encoding header
int idx;
NVPair[] hdrs = req.getHeaders();
for (idx = 0; idx < hdrs.length; idx++)
if (hdrs[idx].getName().equalsIgnoreCase("Accept-Encoding"))
Vector pae;
if (idx == hdrs.length)
hdrs = Util.resizeArray(hdrs, idx + 1);
pae = new Vector();
pae = Util.parseHeader(hdrs[idx].getValue());
catch (ParseException pe)
throw new ModuleException(pe.toString());
// done if "*;q=1.0" present
HttpHeaderElement all = Util.getElement(pae, "*");
if (all != null)
NVPair[] params = all.getParams();
for (idx = 0; idx < params.length; idx++)
if (params[idx].getName().equalsIgnoreCase("q"))
if (idx == params.length)
// no qvalue, i.e. q=1.0
if (params[idx].getValue() == null ||
params[idx].getValue().length() == 0)
throw new ModuleException("Invalid q value for \"*\" in " +
"Accept-Encoding header: ");
if (Float.valueOf(params[idx].getValue()).floatValue() > 0.)
catch (NumberFormatException nfe)
throw new ModuleException("Invalid q value for \"*\" in " +
"Accept-Encoding header: " + nfe.getMessage());
// Add gzip, deflate and compress tokens to the Accept-Encoding header
if (!pae.contains(new HttpHeaderElement("deflate")))
pae.addElement(new HttpHeaderElement("deflate"));
if (!pae.contains(new HttpHeaderElement("gzip")))
pae.addElement(new HttpHeaderElement("gzip"));
if (!pae.contains(new HttpHeaderElement("x-gzip")))
pae.addElement(new HttpHeaderElement("x-gzip"));
if (!pae.contains(new HttpHeaderElement("compress")))
pae.addElement(new HttpHeaderElement("compress"));
if (!pae.contains(new HttpHeaderElement("x-compress")))
pae.addElement(new HttpHeaderElement("x-compress"));
hdrs[idx] = new NVPair("Accept-Encoding", Util.assembleHeader(pae));
* Invoked by the HTTPClient.
* @param resp Description of the Parameter
* @param req Description of the Parameter
public void responsePhase1Handler(Response resp, RoRequest req)
* Invoked by the HTTPClient.
* @param resp Description of the Parameter
* @param req Description of the Parameter
* @return Description of the Return Value
public int responsePhase2Handler(Response resp, Request req)
* Invoked by the HTTPClient.
* @param resp Description of the Parameter
* @param req Description of the Parameter
* @exception IOException Description of the Exception
* @exception ModuleException Description of the Exception
public void responsePhase3Handler(Response resp, RoRequest req)
throws IOException, ModuleException
String ce = resp.getHeader("Content-Encoding");
if (ce == null || req.getMethod().equals("HEAD") ||
resp.getStatusCode() == 206)
Vector pce;
pce = Util.parseHeader(ce);
catch (ParseException pe)
throw new ModuleException(pe.toString());
if (pce.size() == 0)
String encoding = ((HttpHeaderElement) pce.firstElement()).getName();
if (encoding.equalsIgnoreCase("gzip") ||
Log.write(Log.MODS, "CEM: pushing gzip-input-stream");
resp.inp_stream = new GZIPInputStream(resp.inp_stream);
pce.removeElementAt(pce.size() - 1);
else if (encoding.equalsIgnoreCase("deflate"))
Log.write(Log.MODS, "CEM: pushing inflater-input-stream");
resp.inp_stream = new InflaterInputStream(resp.inp_stream);
pce.removeElementAt(pce.size() - 1);
else if (encoding.equalsIgnoreCase("compress") ||
Log.write(Log.MODS, "CEM: pushing uncompress-input-stream");
resp.inp_stream = new UncompressInputStream(resp.inp_stream);
pce.removeElementAt(pce.size() - 1);
else if (encoding.equalsIgnoreCase("identity"))
Log.write(Log.MODS, "CEM: ignoring 'identity' token");
pce.removeElementAt(pce.size() - 1);
Log.write(Log.MODS, "CEM: Unknown content encoding '" +
encoding + "'");
if (pce.size() > 0)
resp.setHeader("Content-Encoding", Util.assembleHeader(pce));
* Invoked by the HTTPClient.
* @param resp Description of the Parameter
* @param req Description of the Parameter
public void trailerHandler(Response resp, RoRequest req)
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,38 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c) <p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
* contains all global constants used in this package
public class Constants
* user agent string a fetcher task gives to the corresponding server
public static final String USER_AGENT = "Mozilla/4.06 [en] (WinNT; I)";
* Crawler Identification
public static final String CRAWLER_AGENT = "Fetcher/0.95";
* size of the temporary buffer to read web documents in
public final static int FETCHERTASK_READSIZE = 4096;
* don't read more than... bytes
public final static int FETCHERTASK_MAXFILESIZE = 2000000;
@ -0,0 +1,73 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
import java.util.*;
import java.net.*;
* filter class; gets IP addresses from host names and forwards them to
* the other parts of the application
* since URLs cache their IP addresses themselves, and HTTP 1.1 needs the
* host names to be sent to the server, this class is not used anymore
public class DNSResolver implements MessageListener
HashMap ipCache = new HashMap();
public DNSResolver()
public void notifyAddedToMessageHandler(MessageHandler m)
this.messageHandler = m;
MessageHandler messageHandler;
public Message handleRequest(Message message)
if(message instanceof URLMessage)
URL url = ((URLMessage)message).getUrl();
String host = url.getHost();
InetAddress ip;
/*InetAddress ip = (InetAddress)ipCache.get(host);
if(ip == null)
ip = InetAddress.getByName(host);
ipCache.put(host, ip);
//System.out.println("DNSResolver: new Cache Entry \"" + host + "\" = \"" + ip.getHostAddress() + "\"");*/
catch(UnknownHostException e)
ip = null;
return null;
//System.out.println("DNSResolver: unknown host \"" + host + "\"");
//System.out.println("DNSResolver: Cache hit: " + ip.getHostAddress());
return message;
@ -0,0 +1,224 @@
* LARM - LANLab Retrieval Machine
* $history: $
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.ThreadPool;
import de.lanlab.larm.threads.ThreadPoolObserver;
import de.lanlab.larm.threads.InterruptableTask;
import de.lanlab.larm.storage.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import de.lanlab.larm.fetcher.FetcherTask;
* filter class; the Fetcher is the main class which keeps the ThreadPool that
* gets the documents. It should be placed at the very end of the MessageQueue,
* so that all filtering can be made beforehand.
* @author Clemens Marschner
public class Fetcher implements MessageListener
* holds the threads
ThreadPool fetcherPool;
* total number of docs read
int docsRead = 0;
* the storage where the docs are saved to
DocumentStorage storage;
* the host manager keeps track of host information
HostManager hostManager;
* initializes the fetcher with the given number of threads in the thread
* pool and a document storage.
* @param maxThreads the number of threads in the ThreadPool
* @param storage the storage where all documents are stored
* @param hostManager the host manager
public Fetcher(int maxThreads, DocumentStorage storage, HostManager hostManager)
this.storage = storage;
fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager));
fetcherPool.setQueue(new FetcherTaskQueue());
docsRead = 0;
this.hostManager = hostManager;
* initializes the pool with default values (5 threads, NullStorage)
public void init()
* initializes the pool with a NullStorage and the given number of threads
* @param maxThreads the number of threads in the thread pool
public void init(int maxThreads)
docsRead = 0;
* this function will be called by the message handler each time a URL
* passes all filters and gets to the fetcher. From here, it will be
* distributed to the FetcherPool, a thread pool which carries out the task,
* that is to fetch the document from the web.
* @param message the message, which should actually be a URLMessage
* @return Description of the Return Value
public Message handleRequest(Message message)
URLMessage urlMessage = (URLMessage) message;
fetcherPool.doTask(new FetcherTask(urlMessage), "");
// eat the message
return null;
* called by the message handler when this object is added to it
* @param handler the message handler
public void notifyAddedToMessageHandler(MessageHandler handler)
this.messageHandler = handler;
MessageHandler messageHandler;
* the thread pool observer will be called each time a thread changes its
* state, e.g. from IDLE to RUNNING, and each time the number of thread
* queue entries changes.
* this just wraps the thread pool method
* @param t the class that implements the ThreadPoolObserver interface
public void addThreadPoolObserver(ThreadPoolObserver t)
* returns the number of tasks queued. Should return 0 if there are any idle
* threads. this method just wraps the ThreadPool method
* @return The queueSize value
public int getQueueSize()
return fetcherPool.getQueueSize();
* get the total number of threads.
* this method just wraps the ThreadPool method
* @return The workingThreadsCount value
public int getWorkingThreadsCount()
return fetcherPool.getIdleThreadsCount() + fetcherPool.getBusyThreadsCount();
* get the number of threads that are currently idle.
* this method just wraps the ThreadPool method
* @return The idleThreadsCount value
public int getIdleThreadsCount()
return fetcherPool.getIdleThreadsCount();
* get the number of threads that are currently busy.
* this method just wraps the ThreadPool method
* @return The busyThreadsCount value
public int getBusyThreadsCount()
return fetcherPool.getBusyThreadsCount();
* Gets the threadPool attribute of the Fetcher object
* beware: the original object is returned
* @TODO remove this / make it private if possible
* @return The threadPool value
public ThreadPool getThreadPool()
return fetcherPool;
* Gets the total number of docs read
* @return number of docs read
public int getDocsRead()
return docsRead;
* returns the (original) task queue
* @TODO remove this if possible
* @return The taskQueue value
public FetcherTaskQueue getTaskQueue()
return (FetcherTaskQueue) this.fetcherPool.getTaskQueue();
@ -0,0 +1,150 @@
package de.lanlab.larm.fetcher;
import java.awt.event.ActionListener;
import java.awt.event.ActionEvent;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.awt.event.*;
import de.lanlab.larm.gui.*;
import de.lanlab.larm.threads.*;
* this was used to connect the GUI to the fetcher
* @TODO put this into the GUI package, probably?
public class FetcherGUIController implements ActionListener
FetcherMain fetcherMain;
FetcherSummaryFrame fetcherFrame;
public FetcherGUIController(FetcherMain fetcherMainPrg, FetcherSummaryFrame fetcherFrameWin, String defaultStartURL)
this.fetcherMain = fetcherMainPrg;
this.fetcherFrame = fetcherFrameWin;
new ThreadPoolObserver()
public void threadUpdate(int threadNr, String action, String info)
String status = threadNr + ": " + action + ": " + info;
public void queueUpdate(String info, String action)
fetcherMain.monitor.addObserver(new Observer()
public void update(Observable o, Object arg)
// the ThreadMonitor has been updated
//fetcherFrame.setStalledThreads(fetcherMain.monitor.getStalledThreadCount(10, 500.0));
// fetcherFrame.setDocsPerSecond(fetcherMain.monitor.getDocsPerSecond(5));
// we take the opportunity to print the current memory usage
/* fetcherMain.reFilter.addObserver(
new Observer()
public void update(Observable o, Object arg)
fetcherMain.messageHandler.addMessageQueueObserver(new Observer()
public void update(Observable o, Object arg)
// a message has been added or deleted
// this observer will be called if a filter has decided to throw a
// message away.
fetcherMain.messageHandler.addMessageProcessorObserver(new Observer()
public void update(Observable o, Object arg)
if(arg == fetcherMain.urlScopeFilter)
else if(arg == fetcherMain.urlVisitedFilter)
else if(arg == fetcherMain.reFilter)
else // it's the fetcher
new WindowAdapter()
public void windowClosed(WindowEvent e)
System.out.println("window Closed");
* will be called when the start button is pressed
public void actionPerformed(ActionEvent e)
System.out.println("Füge Start-URL ein");
// urlVisitedFilter.printAllURLs();
// urlVisitedFilter.clearHashtable();
fetcherMain.putURL(new URL(fetcherFrame.getStartURL()), false);
catch(Exception ex)
System.out.println("actionPerformed: Exception: " + ex.getMessage());
@ -0,0 +1,362 @@
* LARM - LANLab Retrieval Machine
* $history: $
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.ThreadPoolObserver;
import de.lanlab.larm.threads.ThreadPool;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*;
import javax.swing.UIManager;
import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException;
* ENTRY POINT: this class contains the main()-method of the application, does
* all the initializing and optionally connects the fetcher with the GUI.
* @author Clemens Marschner
* @created December 16, 2000
public class FetcherMain
* the main message pipeline
protected MessageHandler messageHandler;
* this filter records all incoming URLs and filters everything it already
* knows
protected URLVisitedFilter urlVisitedFilter;
* the scope filter filters URLs that fall out of the scope given by the
* regular expression
protected URLScopeFilter urlScopeFilter;
* The DNS resolver was supposed to hold the host addresses for all hosts
* this is done by URL itself today
* protected DNSResolver dnsResolver;
* the robot exclusion filter looks if a robots.txt is present on a host
* before it is first accessed
protected RobotExclusionFilter reFilter;
* the host manager keeps track of all hosts and is used by the filters.
protected HostManager hostManager;
* this rather flaky filter just filters out some URLs, e.g. the different
* views produced by the Apache DirIndex module. Has to be made
* configurable in the near future
protected KnownPathsFilter knownPathsFilter;
* this is the main document fetcher. It contains a thread pool that fetches the
* documents and stores them
protected Fetcher fetcher;
* the thread monitor once was only a monitoring tool, but now has become a
* vital part of the system that computes statistics and
* flushes the log file buffers
protected ThreadMonitor monitor;
* the storage is a central class that puts all fetched documents somewhere.
* Several different implementations exist.
protected DocumentStorage storage;
* the URL length filter filters URLs that are too long, e.g. because of errors
* in the implementation of dynamic web sites
protected URLLengthFilter urlLengthFilter;
* initializes all classes and registers anonymous adapter classes as
* listeners for fetcher events.
* @param nrThreads number of fetcher threads to be created
public FetcherMain(int nrThreads)
// to make things clear, this method is commented a bit better than
// the rest of the program...
// this is the main message queue. handlers are registered with
// the queue, and whenever a message is put in it, they are passed to the
// filters in a "chain of responsibility" manner. Every listener can decide
// to throw the message away
messageHandler = new MessageHandler();
// the storage is the class which saves a WebDocument somewhere, no
// matter how it does it, whether it's in a file, in a database or
// whatever
// example for the (very slow) SQL Server storage:
// this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
// the LogStorage used here does extensive logging. It logs all links and
// document information.
// it also saves all documents to page files. Probably this single storage
// could also be replaced by a pipeline; or even incorporated into the
// existing message pipeline
SimpleLogger log = new SimpleLogger("store", false);
this.storage = new LogStorage(log, true, "logs/pagefile");
// a third example would be the NullStorage, which converts the documents into
// heat, which evaporates above the processor
// NullStorage();
// create the filters and add them to the message queue
urlScopeFilter = new URLScopeFilter();
urlVisitedFilter = new URLVisitedFilter(100000, log);
// dnsResolver = new DNSResolver();
hostManager = new HostManager(1000);
reFilter = new RobotExclusionFilter(hostManager);
fetcher = new Fetcher(nrThreads, storage, hostManager);
knownPathsFilter = new KnownPathsFilter();
urlLengthFilter = new URLLengthFilter(255);
// prevent message box popups
// prevent GZipped files from being decoded
// initialize the threads
// the thread monitor watches the thread pool.
monitor = new ThreadMonitor(urlLengthFilter,
5000 // wake up every 5 seconds
// add all filters to the handler.
/* uncomment this to enable HTTPClient logging
HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
catch (Exception e)
* Sets the RexString attribute of the FetcherMain object
* @param restrictTo The new RexString value
public void setRexString(String restrictTo) throws MalformedPatternException
* Description of the Method
* @param url Description of Parameter
* @param isFrame Description of the Parameter
* @exception java.net.MalformedURLException Description of Exception
public void putURL(URL url, boolean isFrame)
throws java.net.MalformedURLException
messageHandler.putMessage(new URLMessage(url, null, isFrame));
catch (Exception e)
System.out.println("Exception: " + e.getMessage());
//System.out.println("URLs geschrieben");
* Description of the Method
public void startMonitor()
* the GUI is not working at this time. It was used in the very beginning, but
* synchronous updates turned out to slow down the program a lot, even when the
* GUI was turned off. Thus, a lot
* of Observer messages were removed later. Nonetheless, it's quite cool to see
* it working...
* @param f Description of Parameter
* @param startURL Description of Parameter
public void initGui(FetcherMain f, String startURL)
// if we're on a windows platform, make it look a bit more convenient
catch (Exception e)
// well, then we just do without it...
System.out.println("Init FetcherFrame");
FetcherSummaryFrame fetcherFrame;
fetcherFrame = new FetcherSummaryFrame();
fetcherFrame.setSize(640, 450);
FetcherGUIController guiController = new FetcherGUIController(f, fetcherFrame, startURL);
* The main program. parsed
* @param args The command line arguments
public static void main(String[] args)
int nrThreads = 10;
String startURL = "";
String restrictTo = "*";
boolean gui = false;
boolean showInfo = false;
System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
for (int i = 0; i < args.length; i++)
if (args[i].equals("-start"))
startURL = args[i];
System.out.println("Start-URL set to: " + startURL);
else if (args[i].equals("-restrictto"))
restrictTo = args[i];
System.out.println("Restricting URLs to " + restrictTo);
else if (args[i].equals("-threads"))
nrThreads = Integer.parseInt(args[i]);
System.out.println("Threads set to " + nrThreads);
else if (args[i].equals("-gui"))
gui = true;
else if (args[i].equals("-?"))
showInfo = true;
System.out.println("Unknown option: " + args[i] + "; use -? to get syntax");
//URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
// replaced by HTTPClient
FetcherMain f = new FetcherMain(nrThreads);
if (showInfo || (startURL.equals("") && gui == false))
System.out.println("Usage: FetcherMain -start <URL> -restrictto <RegEx> [-threads <nr=10>]"); // [-gui]
if (gui)
// f.initGui(f, startURL);
f.putURL(new URL(startURL), false);
catch (MalformedURLException e)
System.out.println("Malformed URL");
catch (MalformedPatternException e)
System.out.println("Wrong RegEx syntax. Must be a valid PERL RE");
@ -0,0 +1,617 @@
* LARM - LANLab Retrieval Machine
* $history: $
package de.lanlab.larm.fetcher;
import java.net.URL;
import de.lanlab.larm.threads.*;
import de.lanlab.larm.util.InputStreamObserver;
import de.lanlab.larm.util.ObservableInputStream;
import de.lanlab.larm.util.WebDocument;
import de.lanlab.larm.util.SimpleCharArrayReader;
import de.lanlab.larm.storage.DocumentStorage;
import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLogger;
import de.lanlab.larm.net.HttpTimeoutFactory;
import HTTPClient.*;
import java.net.*;
import java.io.*;
import java.util.*;
import java.text.*;
import de.lanlab.larm.parser.Tokenizer;
import de.lanlab.larm.parser.LinkHandler;
* this class gets the documents from the web. It connects to the server given
* by the IP address in the URLMessage, gets the document, and forwards it to
* the storage. If it's an HTML document, it will be parsed and all links will
* be put into the message handler again.
* @author Clemens Marschner
public class FetcherTask
implements InterruptableTask, LinkHandler, Serializable
protected volatile boolean isInterrupted = false;
* each task has its own number. the class variable counts up if an instance
* of a fetcher task is created
static volatile int taskIdentity = 0;
* the number of this object
int taskNr;
* the BASE Href (defaults to contextUrl, may be changed with a <base> tag);
* only valid within a doTask call
private volatile URL base;
* the URL of the document
* only valid within a doTask call
private volatile URL contextUrl;
* the message handler the URL message comes from; same for all tasks
protected static volatile MessageHandler messageHandler;
* actual number of bytes read
* only valid within a doTask call
private volatile long bytesRead = 0;
* the storage this task will put the document to
private static volatile DocumentStorage storage;
* task state IDs. comparisons will be done by their references, so always
* use the IDs
public final static String FT_IDLE = "idle";
public final static String FT_STARTED = "started";
public final static String FT_OPENCONNECTION = "opening connection";
public final static String FT_CONNECTING = "connecting";
public final static String FT_GETTING = "getting";
public final static String FT_READING = "reading";
public final static String FT_SCANNING = "scanning";
public final static String FT_STORING = "storing";
public final static String FT_READY = "ready";
public final static String FT_CLOSING = "closing";
public final static String FT_EXCEPTION = "exception";
public final static String FT_INTERRUPTED = "interrupted";
private volatile State taskState = new State(FT_IDLE);
* the URLs found will be stored and only added to the message handler in the very
* end, to avoid too many synchronizations
private volatile LinkedList foundUrls;
* the URL to be fetched
protected volatile URLMessage actURLMessage;
* the document title, if present
private volatile String title;
* headers for HTTPClient
private static volatile NVPair headers[] = new NVPair[1];
headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
* Gets a copy of the current taskState
* @return The taskState value
public State getTaskState()
return taskState.cloneState();
* Constructor for the FetcherTask object
* @param urlMessage Description of the Parameter
public FetcherTask(URLMessage urlMessage)
actURLMessage = urlMessage;
* Gets the uRLMessages attribute of the FetcherTask object
* @return The uRLMessages value
public URLMessage getActURLMessage()
return this.actURLMessage;
* Sets the document storage
* @param storage The new storage
public static void setStorage(DocumentStorage storage)
FetcherTask.storage = storage;
* Sets the messageHandler
* @param messageHandler The new messageHandler
public static void setMessageHandler(MessageHandler messageHandler)
FetcherTask.messageHandler = messageHandler;
* @return the URL as a string
public String getInfo()
return actURLMessage.getURLString();
* Gets the uRL attribute of the FetcherTask object
* @return The uRL value
public URL getURL()
return actURLMessage.getUrl();
SimpleLogger log;
SimpleLogger errorLog;
//private long startTime;
* this will be called by the fetcher thread and will do all the work
* @TODO probably split this up into different processing steps
* @param thread Description of the Parameter
public void run(ServerThread thread)
taskState.setState(FT_STARTED); // state information is always set to make the thread monitor happy
log = thread.getLog();
HostManager hm = ((FetcherThread)thread).getHostManager();
errorLog = thread.getErrorLog();
// startTime = System.currentTimeMillis();
int threadNr = ((FetcherThread) thread).getThreadNumber();
base = contextUrl = actURLMessage.getUrl();
String urlString = actURLMessage.getURLString();
String host = contextUrl.getHost();
int hostPos = urlString.indexOf(host);
int hostLen = host.length();
HostInfo hi = hm.getHostInfo(host); // get and create
// we make this check as late as possible to get the most current information
log.log("Bad Host: " + contextUrl + "; returning");
System.out.println("[" + threadNr + "] bad host: " + this.actURLMessage.getUrl());
taskState.setState(FT_READY, null);
foundUrls = new java.util.LinkedList();
HTTPConnection conn = null;
title = "*untitled*";
int size = 1;
InputStream in = null;
bytesRead = 0;
URL ipURL = contextUrl;
taskState.setState(FT_OPENCONNECTION, urlString);
log.log("connecting to " + ipURL.getHost());
taskState.setState(FT_CONNECTING, ipURL);
conn = new HTTPConnection(host);
// 75 s
taskState.setState(this.FT_GETTING, ipURL);
HTTPResponse response = conn.Get(ipURL.getFile(), "", headers);
int statusCode = response.getStatusCode();
byte[] fullBuffer = null;
String contentType = "";
int contentLength = 0;
if (statusCode != 404 && statusCode != 403)
// read up to Constants.FETCHERTASK_MAXFILESIZE bytes into a byte array
taskState.setState(FT_READING, ipURL);
contentType = response.getHeader("Content-Type");
String length = response.getHeader("Content-Length");
if (length != null)
contentLength = Integer.parseInt(length);
fullBuffer = response.getData(Constants.FETCHERTASK_MAXFILESIZE); // max. 2 MB
if (fullBuffer != null)
contentLength = fullBuffer.length;
this.bytesRead += contentLength;
//conn.stop(); // close connection. todo: Do some caching...
* conn.disconnect();
if (isInterrupted)
System.out.println("FetcherTask: interrupted while reading. File truncated");
log.log("interrupted while reading. File truncated");
if (fullBuffer != null)
taskState.setState(FT_SCANNING, ipURL);
log.log("read file (" + fullBuffer.length + " bytes). Now scanning.");
if (contentType.startsWith("text/html"))
// ouch. I haven't found a better solution yet. just slower ones.
char[] fullCharBuffer = new char[contentLength];
new InputStreamReader(new ByteArrayInputStream(fullBuffer)).read(fullCharBuffer);
Tokenizer tok = new Tokenizer();
tok.parse(new SimpleCharArrayReader(fullCharBuffer));
// System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
taskState.setState(FT_STORING, ipURL);
storage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title));
catch (InterruptedIOException e)
// timeout while reading this file
System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
errorLog.log("error: Timeout: " + this.actURLMessage.getUrl());
catch (FileNotFoundException e)
System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
catch(NoRouteToHostException e)
// router is down or a firewall prevents the connection
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
catch(ConnectException e)
// no server is listening at this port
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
catch (SocketException e)
System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
catch(UnknownHostException e)
// IP address could not be determined
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
catch (IOException e)
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());
catch (OutOfMemoryError ome)
System.out.println("[" + threadNr + "] Task " + this.taskNr + " OutOfMemory after " + size + " bytes");
errorLog.log("error: OutOfMemory after " + size + " bytes");
catch (Throwable e)
System.out.println("[" + threadNr + "] " + e.getMessage() + " type: " + e.getClass().getName());
System.out.println("[" + threadNr + "]: stopping");
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage() + "; stopping");
if (isInterrupted)
System.out.println("Task was interrupted");
if (isInterrupted)
System.out.println("Task: closed everything");
* }
foundUrls = null;
* the interrupt method. not in use since the change to HTTPClient
* @TODO decide if we need this anymore
// Marks this task as interrupted: prints a notice to stdout and sets the
// isInterrupted flag. The visible statements do not close any connection or
// stream; the lines that follow this method header are disabled legacy cleanup.
public void interrupt()
System.out.println("FetcherTask: interrupted!");
// only the flag is set here; code that reads isInterrupted decides how to react
this.isInterrupted = true;
* try
* {
* if (conn != null)
* {
* ((HttpURLConnection) conn).disconnect();
* System.out.println("FetcherTask: disconnected URL Connection");
* conn = null;
* }
* if (in != null)
* {
* in.close();
* / possibly hangs at close() .> KeepAliveStream.close() -> MeteredStream.skip()
* System.out.println("FetcherTask: Closed Input Stream");
* in = null;
* }
* }
* catch (IOException e)
* {
* System.out.println("IOException while interrupting: ");
* e.printStackTrace();
* }
* System.out.println("FetcherTask: Set all IOs to null");
* this is called whenever a link was found in the current document.
* Don't create too many objects here, this will be called
* millions of times
* @param link Description of the Parameter
// Callback for one link found in the current document. Strips the fragment
// ("#...") part, resolves the link to a URL (against the document base when it
// is not an "http:" link) and wraps it in a URLMessage. Malformed links are
// logged as warnings instead of being propagated.
//
// link    - the raw link text as found in the document
// isFrame - passed through unchanged to the URLMessage constructor
public void handleLink(String link, boolean isFrame)
// strip the fragment part
int refPart = link.indexOf("#");
// NOTE(review): no statement is visible for the "link is only a fragment" case
// (refPart == 0) -- confirm what the full source does here.
if (refPart == 0)
else if (refPart > 0)
link = link.substring(0, refPart);
URL url = null;
if (link.startsWith("http:"))
// absolute URL: parse it on its own
url = new URL(link);
// relative URL: resolve against the document's base URL
url = new URL(base, link);
URLMessage urlMessage = new URLMessage(url, contextUrl, isFrame);
String urlString = urlMessage.getURLString();
//messageHandler.putMessage(new actURLMessage(url)); // put them in the very end
catch (MalformedURLException e)
//log.log("malformed url: base:" + base + " -+- link:" + link);
log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
// any other failure is logged the same way
catch (Exception e)
log.log("warning: " + e.getClass().getName() + ": " + e.getMessage());
// e.printStackTrace();
* called when a BASE tag was found
* @param base the HREF attribute
// Callback for a BASE tag: replaces the base URL used by handleLink() to
// resolve relative links. If the HREF is not a valid URL the failure is logged
// as a warning that includes the offending value and the document URL.
//
// base - the HREF attribute of the BASE tag
public void handleBase(String base)
this.base = new URL(base);
catch (MalformedURLException e)
log.log("warning: " + e.getClass().getName() + ": " + e.getMessage() + " while converting '" + base + "' to URL in document " + contextUrl);
* called when a TITLE tag was found
* @param title the string between <title> and </title>
// Callback for a TITLE tag: remembers the title text in the title field.
public void handleTitle(String title)
this.title = title;
* public void notifyOpened(ObservableInputStream in, long timeElapsed)
* {
* }
* public void notifyClosed(ObservableInputStream in, long timeElapsed)
* {
* }
* public void notifyRead(ObservableInputStream in, long timeElapsed, int nrRead, int totalRead)
* {
* if(totalRead / ((double)timeElapsed) < 0.3) // weniger als 300 bytes/s
* {
* System.out.println("Task " + this.taskNr + " stalled at pos " + totalRead + " with " + totalRead / (timeElapsed / 1000.0) + " bytes/s");
* }
* }
* public void notifyFinished(ObservableInputStream in, long timeElapsed, int totalRead)
* {
* /System.out.println("Task " + this.taskNr + " finished (" + totalRead + " bytes in " + timeElapsed + " ms with " + totalRead / (timeElapsed / 1000.0) + " bytes/s)");
* }
// Returns the current value of the bytesRead counter of this task.
public long getBytesRead()
return bytesRead;
* do nothing if a warning occurs within the html parser
* @param message Description of the Parameter
* @param systemID Description of the Parameter
* @param line Description of the Parameter
* @param column Description of the Parameter
* @exception java.lang.Exception Description of the Exception
// Parser warning callback; intentionally empty, so warnings are ignored.
// The throws clause is declared but the empty body never throws.
public void warning(String message, String systemID, int line, int column)
throws java.lang.Exception { }
* do nothing if a fatal error occurs...
* @param message Description of the Parameter
* @param systemID Description of the Parameter
* @param line Description of the Parameter
* @param column Description of the Parameter
* @exception Exception Description of the Exception
// Parser fatal-error callback: reports the message on stdout and in the log.
// Only the message is reported; systemID, line and column are not used by the
// visible statements.
public void fatal(String message, String systemID, int line, int column)
throws Exception
System.out.println("fatal error: " + message);
log.log("fatal error: " + message);
@ -0,0 +1,198 @@
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.*;
import de.lanlab.larm.util.*;
import java.util.*;
import java.net.URL;
* this special kind of task queue reorders the incoming tasks so that every subsequent
* task is for a different host.
* This is done by a "HashedCircularLinkedList" which allows random adding while
* a different thread iterates through the collection circularly.
* @author Clemens Marschner
* @created 23. November 2001
// Task queue that keeps one sub-queue per host name. insert() files a
// FetcherTask under the host of its URL; remove() takes the next task from the
// sub-queue that servers.next() hands out. None of the methods synchronize.
public class FetcherTaskQueue extends TaskQueue
* this is a hash that contains an entry for each server, which by itself is a
* CachingQueue that stores all tasks for this server
* @TODO probably link this to the host info structure
HashedCircularLinkedList servers = new HashedCircularLinkedList(100, 0.75f);
// counter reported by size() and tested by isEmpty()
int size = 0;
* Constructor for the FetcherTaskQueue object. Does nothing
public FetcherTaskQueue() { }
* true if no task is queued
* @return true if the size counter is 0
public boolean isEmpty()
return (size == 0);
* clear the queue. not synchronized.
public void clear()
* puts task into Queue.
* Warning: not synchronized
* @param t the task to be added. must be a FetcherTask
public void insert(Object t)
// assert (t != null && t.getURL() != null)
// the host name of the task's URL selects the per-host sub-queue
URLMessage um = ((FetcherTask)t).getActURLMessage();
URL act = um.getUrl();
String host = act.getHost();
Queue q;
q = ((Queue) servers.get(host));
if (q == null)
// add a new host to the queue
//String host2 = host.replace(':', '_').replace('/', '_').replace('\\', '_');
// make it file system ready
q = new CachingQueue(host, 100);
servers.put(host, q);
// assert((q != null) && (q instanceof FetcherTaskQueue));
// NOTE(review): no visible statement adds t to q or increments size --
// confirm against the full source.
* the size of the queue. make sure that insert() and size() calls are synchronized
* if the exact number matters.
* @return the current value of the size counter
public int size()
return size;
* the number of different hosts queued at the moment
public int getNumHosts()
return servers.size();
* get the next task. warning: not synchronized
* @return the removed FetcherTask, or null if no host is queued
public Object remove()
FetcherTask t = null;
if (servers.size() > 0)
// presumably advances the circular list to the next host -- TODO confirm
Queue q = (Queue) servers.next();
// assert(q != null && q.size() > 0)
t = (FetcherTask)q.remove();
// NOTE(review): only "q = null" is visible for an emptied sub-queue; confirm
// that the host entry is also removed from servers and size is decremented.
if (q.size() == 0)
q = null;
return t;
* tests
* @param args not used by the visible statements
public static void main(String args[])
FetcherTaskQueue q = new FetcherTaskQueue();
// Test 1: hosts are expected to alternate until only one host is left
System.out.println("Test 1. put in 4 yahoos and 3 lmus. pull out LMU/Yahoo/LMU/Yahoo/LMU/Yahoo/Yahoo");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false)));
catch (Throwable t)
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println(((FetcherTask) q.remove()).getInfo());
// Test 2: interleaves inserts and removes; expected values are in brackets
System.out.println("Test 2. new Queue");
q = new FetcherTaskQueue();
System.out.println("size [0]:");
System.out.println("put 3 lmus.");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/1"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/2"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.lmu.de/3"), null, false)));
System.out.print("pull out 1st element [lmu/1]: ");
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [2]: " + q.size());
System.out.print("pull out 2nd element [lmu/2]: ");
System.out.println(((FetcherTask) q.remove()).getInfo());
System.out.println("size now [1]: " + q.size());
System.out.println("put in 3 yahoos");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/1"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/2"), null, false)));
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/3"), null, false)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [3]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [2]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size());
System.out.println("put in another Yahoo");
q.insert(new FetcherTask(new URLMessage(new URL("http://www.yahoo.de/4"), null, false)));
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [1]: " + q.size());
System.out.println("remove [?]: " + ((FetcherTask) q.remove()).getInfo());
System.out.println("Size now [0]: " + q.size());
catch (Throwable t)
@ -0,0 +1,91 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.ServerThread;
import de.lanlab.larm.util.State;
* a server thread for the thread pool that records the number
* of bytes read and the number of tasks run
* mainly for statistical purposes and to keep most of the information a task needs
* static
// Pool thread that accumulates statistics over the FetcherTasks it runs and
// holds per-thread resources (a HostManager reference and a document buffer).
public class FetcherThread extends ServerThread
// bytes read by tasks that have already finished on this thread
long totalBytesRead = 0;
// NOTE(review): no visible statement increments this counter -- confirm
long totalTasksRun = 0;
HostManager hostManager;
// allocated once per thread, sized by Constants.FETCHERTASK_READSIZE
byte[] documentBuffer = new byte[Constants.FETCHERTASK_READSIZE];
// Returns the HostManager passed to the constructor.
public HostManager getHostManager()
return hostManager;
// threadNumber is used both as the thread's number and in its name
public FetcherThread(int threadNumber, ThreadGroup threadGroup, HostManager hostManager)
super(threadNumber,"FetcherThread " + threadNumber, threadGroup);
this.hostManager = hostManager;
public static String STATE_IDLE = "Idle";
State idleState = new State(STATE_IDLE); // only set if task is finished
// Adds the bytes read by the current task to the running total.
// Presumably invoked by ServerThread when a task has finished -- TODO confirm.
protected void taskReady()
totalBytesRead += ((FetcherTask)task).getBytesRead();
// Returns the accumulated total plus the bytes of the task currently assigned,
// if there is one.
public long getTotalBytesRead()
if(task != null)
return totalBytesRead + ((FetcherTask)task).getBytesRead();
return totalBytesRead;
public long getTotalTasksRun()
return totalTasksRun;
// Returns the buffer itself, not a copy.
public byte[] getDocumentBuffer()
return documentBuffer;
// Returns the state of the current task, or a clone of the idle state when no
// task is assigned.
public State getTaskState()
if(task != null)
// task could be null here
return ((FetcherTask)task).getTaskState();
return idleState.cloneState();
@ -0,0 +1,38 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.*;
* this factory simply creates fetcher threads. It's passed
* to the ThreadPool because the pool is creating the threads on its own
// Factory that creates FetcherThread instances. All threads it creates share
// one ThreadGroup ("FetcherThreads") and one HostManager.
public class FetcherThreadFactory extends ThreadFactory
//static int count = 0;
ThreadGroup threadGroup = new ThreadGroup("FetcherThreads");
HostManager hostManager;
// hostManager is handed to every thread this factory creates
public FetcherThreadFactory(HostManager hostManager)
this.hostManager = hostManager;
// count is passed on as the new thread's number
public ServerThread createServerThread(int count)
ServerThread newThread = new FetcherThread(count, threadGroup, hostManager);
return newThread;
@ -0,0 +1,29 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
* base class of all filter classes
// Base class of the filter classes; it only holds the "filtered" counter.
public abstract class Filter
* number of items filtered. incremented directly by
* the inheriting classes
protected int filtered = 0;
// Returns the current value of the filtered counter.
public int getFiltered()
return filtered;
@ -0,0 +1,56 @@
package de.lanlab.larm.fetcher;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.io.*;
import java.util.zip.*;
import java.net.*;
* Description of the Class
* @author Administrator
* @created 28. Januar 2002
// Small stand-alone experiment: sets up a GZIP-compressing writer over an
// in-memory buffer for one hard-coded URL and prints the original and
// compressed lengths.
public class GZipTest
* Constructor for the GZipTest object
public GZipTest() { }
* The main program for the GZipTest class
* @param args The command line arguments (not used by the visible statements)
public static void main(String[] args)
String url = "http://speechdat.phonetik.uni-muenchen.de/speechdt//speechDB/FIXED1SL/BLOCK00/SES0006/A10006O5.aif";
// chain: writer (ISO-8859-1) -> gzip stream -> byte array buffer
ByteArrayOutputStream a = new ByteArrayOutputStream(url.length());
GZIPOutputStream g = new GZIPOutputStream(a);
OutputStreamWriter o = new OutputStreamWriter(g,"ISO-8859-1");
// NOTE(review): no visible statement writes the URL or closes the streams
// before the buffer is read -- confirm against the full source.
byte[] array = a.toByteArray();
System.out.println("URL: " + url + " \n Length: " + url.length() + "\n zipped: " + array.length
catch (Exception e)
{ e.printStackTrace();
@ -0,0 +1,121 @@
package de.lanlab.larm.fetcher;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author Clemens Marschner
* @version 1.0
import java.util.HashMap;
import java.net.*;
import de.lanlab.larm.util.CachingQueue;
import de.lanlab.larm.util.Queue;
* contains information about a host. If a host doesn't respond too often, it's
* excluded from the crawl.
* This class is used by the HostManager
* @author Clemens Marschner
* @created 16. Februar 2002
// Per-host record: reachability/health flags, the robots.txt state (checked,
// loading, disallow list) and a queue for requests held back for this host.
public class HostInfo
// shared empty array used whenever no disallow list is available
static final String[] emptyKeepOutDirectories = new String[0];
int id;
int healthyCount = 5; // five strikes, and you're out
boolean isReachable = true;
boolean robotTxtChecked = false;
String[] disallows; // robot exclusion
boolean isLoadingRobotsTxt = false;
Queue queuedRequests = null; // robot exclusion
String hostName;
// Starts with an empty disallow list.
public HostInfo(String hostName, int id)
this.id = id;
this.disallows = HostInfo.emptyKeepOutDirectories;
this.hostName = hostName;
* is this host reachable and responding?
public boolean isHealthy()
return (healthyCount > 0) && isReachable;
* signals that the host returned with a bad request of whatever type
// NOTE(review): the body is not visible; presumably it decrements healthyCount.
public void badRequest()
public void setReachable(boolean reachable)
isReachable = reachable;
public boolean isReachable()
return isReachable;
public boolean isRobotTxtChecked()
return robotTxtChecked;
* must be synchronized externally
public boolean isLoadingRobotsTxt()
return this.isLoadingRobotsTxt;
// Sets the loading flag and creates the queue for requests held back for this
// host; the queue name is derived from the host id.
// NOTE(review): as visible, the queue is created on every call, whatever the
// value of isLoading -- confirm whether a guard is missing.
public void setLoadingRobotsTxt(boolean isLoading)
this.isLoadingRobotsTxt = isLoading;
this.queuedRequests = new CachingQueue("HostInfo_" + id + "_QueuedRequests", 100);
// Records the result of a robots.txt check. A null disallow list is meant to be
// replaced by the shared empty array so that disallows is never null.
public void setRobotsChecked(boolean isChecked, String[] disallows)
this.robotTxtChecked = isChecked;
if(disallows != null)
this.disallows = disallows;
this.disallows = emptyKeepOutDirectories;
// Returns false when a disallow entry rules the path out, true otherwise.
// NOTE(review): the per-entry comparison is not visible; presumably a prefix
// match of path against each disallow entry -- TODO confirm.
public synchronized boolean isAllowed(String path)
// assume keepOutDirectories is pretty short
// assert disallows != null
int length = disallows.length;
for(int i=0; i<length; i++)
return false;
return true;
@ -0,0 +1,86 @@
package de.lanlab.larm.fetcher;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.util.HashMap;
* Description of the Class
* @author Administrator
* @created 16. Februar 2002
// Registry of HostInfo records keyed by host name. New hosts get the next
// value of a counter as their id.
public class HostManager
HashMap hosts;
// static, i.e. shared by all HostManager instances
static int hostCount = 0;
* Constructor for the HostManager object
* @param initialCapacity initial capacity of the underlying HashMap
public HostManager(int initialCapacity)
hosts = new HashMap(initialCapacity);
* Returns the HostInfo registered for the host name, creating and
* registering a new one first if the name is unknown
* @param hostName the host name used as the map key
* @return the HostInfo for that host
// NOTE(review): only the id allocation is inside synchronized(this); the
// containsKey check and hosts.put are outside it, and hostCount is static but
// guarded by an instance lock. Confirm whether concurrent callers are possible.
public HostInfo put(String hostName)
if (!hosts.containsKey(hostName))
int hostID;
synchronized (this)
hostID = hostCount++;
HostInfo hi = new HostInfo(hostName,hostID);
hosts.put(hostName, hi);
return hi;
return (HostInfo)hosts.get(hostName);
hostID = hosts.get()
// assert hostID != -1;
return hostID;*/
* Gets the HostInfo for a host name, registering the host via put() if it
* is not known yet
* @param hostName the host name used as the map key
* @return the HostInfo for that host
public HostInfo getHostInfo(String hostName)
HostInfo hi = (HostInfo)hosts.get(hostName);
if(hi == null)
return put(hostName);
return hi;
// Returns the number of registered hosts.
public int getSize()
return hosts.size();
@ -0,0 +1,111 @@
package de.lanlab.larm.fetcher;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @created 17. Februar 2002
* @version 1.0
import java.net.*;
* this can be considered a hack
* @TODO implement this as a fast way to filter out different URL endings or beginnings
// Message filter that drops URL messages by comparing the URL's file part and
// host against fixed lists: a message is consumed (null is returned) when the
// file starts with one of pathsToFilter or ends with one of filesToFilter.
public class KnownPathsFilter extends Filter implements MessageListener
// the handler this filter was registered with; set in notifyAddedToMessageHandler
MessageHandler messageHandler;
String[] pathsToFilter =
String[] hostFilter =
String[] filesToFilter =
// exclude Apache directory files
// array lengths, cached by the constructor for the loops in handleRequest
int pathLength;
int fileLength;
int hostLength;
* Constructor for the KnownPathsFilter object
public KnownPathsFilter()
pathLength = pathsToFilter.length;
fileLength = filesToFilter.length;
hostLength = hostFilter.length;
* Filters one message
* @param message the message to check; must be a URLMessage
* @return the message itself, or null if it was filtered out
public Message handleRequest(Message message)
URL url = ((URLMessage)message).getUrl();
String file = url.getFile();
String host = url.getHost();
int i;
// filter by path prefix
for (i = 0; i < pathLength; i++)
if (file.startsWith(pathsToFilter[i]))
return null;
// filter by file name suffix
for (i = 0; i < fileLength; i++)
if (file.endsWith(filesToFilter[i]))
return null;
// filter by host
// NOTE(review): the host comparison itself is not visible here -- confirm
for (i = 0; i<hostLength; i++)
return null;
return message;
* will be called as soon as the Listener is added to the Message Queue
* @param handler the Message Handler
public void notifyAddedToMessageHandler(MessageHandler handler)
// store the handler that was passed in; the former code assigned the field to
// itself, so the field always stayed null
this.messageHandler = handler;
@ -0,0 +1,11 @@
package de.lanlab.larm.fetcher;
import java.io.*;
* Marker interface.
* represents a simple message.
// Marker type for the objects passed through the MessageHandler and its listeners.
public interface Message
@ -0,0 +1,248 @@
package de.lanlab.larm.fetcher;
import java.util.*;
import de.lanlab.larm.util.SimpleObservable;
import de.lanlab.larm.util.CachingQueue;
import de.lanlab.larm.util.UnderflowException;
* this is a message handler that runs in its own thread.
* Messages can be put via <code>putMessage</code> or <code>putMessages</code>
* (use the latter whenever possible).<br>
* The messages are passed to the filters in the order in which the filters were
* added to the handler.<br>
* They can consume the message by returning null. Otherwise, they return a Message
* object, usually the one they got.<br>
* The filters will run synchronously within the message handler thread<br>
* This implements a chain of responsibility-style message handling
// Message pump: messages are queued via putMessage()/putMessages() and handed,
// one at a time, to the registered listeners in registration order until one of
// them returns null. Observers are told about queue growth (+1) and shrinkage (-1).
public class MessageHandler implements Runnable
* the queue where messages are put in.
* Holds max. 2 x 5000 = 10.000 messages in RAM
private CachingQueue messageQueue = new CachingQueue("fetcherURLMessageQueue", 5000);
* list of listeners (filters), in registration order
private LinkedList listeners = new LinkedList();
* true as long as the thread is running
private boolean running = true;
* the message handler thread
private Thread t;
* flag for thread communication
boolean messagesWaiting = false;
* true when a message is processed by the filters
boolean workingOnMessage = false;
// NOTE(review): presumably the monitor the handler thread waits on -- the
// wait/notify statements are not visible here.
Object queueMonitor = new Object();
SimpleObservable messageQueueObservable = new SimpleObservable();
SimpleObservable messageProcessorObservable = new SimpleObservable();
public boolean isWorkingOnMessage()
return workingOnMessage;
* create and start the message handler thread
t = new Thread(this,"MessageHandler Thread");
// NOTE(review): 5 equals Thread.NORM_PRIORITY, so this is not above the default
t.setPriority(5); // higher priority to prevent starving when a lot of fetcher threads are used
* join the message handler thread
public void finalize()
if(t != null)
t = null;
// interruption while waiting for the thread is deliberately ignored
catch(InterruptedException e) {}
* registers a filter to the message handler
* @param m the listener to register
public void addListener(MessageListener m)
* registers a MessageQueueObserver
* It will be notified whenever a message is put into the Queue (Parameter is Int(1)) or
* removed (Parameter is Int(-1))
* @param o the Observer
public void addMessageQueueObserver(Observer o)
* adds a message processor observer
* It will be notified when a message is consumed. In this case the parameter
* is the filter that consumed the message
* @param o the Observer
public void addMessageProcessorObserver(Observer o)
* put one message into the queue
public void putMessage(Message msg)
// tell queue observers that one message was added
messageQueueObservable.notifyObservers(new Integer(1));
messagesWaiting = true;
* add a collection of events to the message queue
public void putMessages(Collection msgs)
for(Iterator i = msgs.iterator(); i.hasNext();)
Message msg = (Message)i.next();
// observers are notified once per message
messageQueueObservable.notifyObservers(new Integer(1));
messagesWaiting = true;
* the main loop of the message handler thread.
public void run()
//System.out.println("MessageHandler-Thread started");
// wait for new messages
catch(InterruptedException e)
System.out.println("MessageHandler: Caught InterruptedException");
//messagesWaiting = false;
Message m;
m = (Message)messageQueue.remove();
// queue drained: clear the flag
if(messageQueue.size() == 0)
messagesWaiting = false;
//System.out.println("MessageHandler:run: Entferne erstes Element");
messageQueueObservable.notifyObservers(new Integer(-1)); // Message processed
// ...and distribute it. The listeners receive the message in the order in
// which they were registered and may also modify the message
Iterator i = listeners.iterator();
MessageListener listener = (MessageListener)i.next();
// each listener gets what the previous one returned
m = (Message)listener.handleRequest(m);
if (m == null)
break; // the listener has consumed the message
catch(ClassCastException e)
System.out.println("MessageHandler:run: ClassCastException(2): " + e.getMessage());
catch (ClassCastException e)
System.out.println("MessageHandler:run: ClassCastException: " + e.getMessage());
// remove() on an empty queue: just clear the flag
catch (UnderflowException e)
messagesWaiting = false;
// System.out.println("MessageHandler: messagesWaiting = true although nothing queued!");
// @FIXME: here is still a multi threading issue. I don't get it why this happens.
// does someone want to draw a petri net of this?
catch (Exception e)
System.out.println("MessageHandler: " + e.getClass() + " " + e.getMessage());
// Returns the number of messages currently in the queue.
public int getQueued()
return messageQueue.size();
@ -0,0 +1,36 @@
* LARM - LANLab Retrieval Machine
* $history: $
package de.lanlab.larm.fetcher;
* A Message Listener works on messages in a message queue Usually it returns
* the message back into the queue. But it can also change the message or create
* a new object. If it returns null, the message handler stops
* @author Administrator
* @created 24. November 2001
// Contract for the filters that a MessageHandler runs over each message.
public interface MessageListener
* handles one message
* @param message the message to be handled
* @return Message usually the original message
* null: the message was consumed
public Message handleRequest(Message message);
* will be called as soon as the Listener is added to the Message Queue
* @param handler the Message Handler
public void notifyAddedToMessageHandler(MessageHandler handler);
@ -0,0 +1,429 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author Clemens Marschner
* @version 1.0
package de.lanlab.larm.fetcher;
import de.lanlab.larm.util.SimpleObservable;
import de.lanlab.larm.util.State;
import java.util.*;
import java.net.*;
import java.io.*;
import org.apache.oro.text.perl.Perl5Util;
import de.lanlab.larm.util.*;
import de.lanlab.larm.threads.*;
import HTTPClient.*;
* this factory creates the ServerThreads used by the RobotExclusionFilter. It is
* passed to the ThreadPool because the pool creates the threads on its own
* @author Administrator
* @created 17. Februar 2002
// Factory for plain ServerThreads named "REF-<count>", all placed in the
// "RobotExclusionFilter" thread group.
class REFThreadFactory extends ThreadFactory
ThreadGroup threadGroup = new ThreadGroup("RobotExclusionFilter");
* Creates one server thread
* @param count number of the thread; also used in its name
* @return the new ServerThread
public ServerThread createServerThread(int count)
ServerThread newThread = new ServerThread(count, "REF-" + count, threadGroup);
return newThread;
* the RE filter obeys the robot exclusion standard. If a new host name is supposed
* to be accessed, it first loads a "/robots.txt" on the given server and records the
* disallows stated in that file.
* The REFilter has a thread pool on its own to prevent the message handler from being
* clogged up if the server doesn't respond. Incoming messages are queued while the
* robots.txt is loaded.
* The information is stored in HostInfo records of the host manager class
* @author Clemens Marschner
* @created 17. Februar 2002
// Message filter implementing robot exclusion. For each URL message it looks up
// the HostInfo of the URL's host; when robots.txt has neither been checked nor
// is being loaded, a RobotExclusionTask is handed to the filter's own thread
// pool. Messages arriving while the file is loading are held back (null is
// returned), otherwise the URL path is checked with HostInfo.isAllowed().
public class RobotExclusionFilter extends Filter implements MessageListener
protected HostManager hostManager;
protected SimpleLogger log;
* Constructor for the RobotExclusionFilter object
* @param hm the host manager that supplies the HostInfo records
public RobotExclusionFilter(HostManager hm)
log = new SimpleLogger("RobotExclusionFilter");
hostManager = hm;
// two pool threads load robots.txt files
rePool = new ThreadPool(2, new REFThreadFactory());
log.log("refilter: initialized");
* called by the message handler
public void notifyAddedToMessageHandler(MessageHandler handler)
this.messageHandler = handler;
MessageHandler messageHandler = null;
ThreadPool rePool;
* method that handles each URL request<p>
* This method will get the robots.txt file the first time a server is
* requested. See the description above.
* @param message
* the (URL)Message
* @return
* the original message or NULL if this host had a disallow on that URL
* @see <a href="http://info.webcrawler.com/mak/projects/robots/norobots.html">robot exclusion standard</a>
public Message handleRequest(Message message)
//log.logThreadSafe("handleRequest: got message: " + message);
// assert message instanceof URLMessage;
URLMessage urlMsg = ((URLMessage) message);
URL url = urlMsg.getUrl();
//assert url != null;
HostInfo h = hostManager.getHostInfo(url.getHost());
// first contact with this host: schedule loading of its robots.txt
if (!h.isRobotTxtChecked() && !h.isLoadingRobotsTxt())
log.logThreadSafe("handleRequest: starting to get robots.txt");
// probably this results in Race Conditions here
rePool.doTask(new RobotExclusionTask(h), new Integer(h.id));
synchronized (h)
// isLoading...() and queuedRequest.insert() must be atomic
if (h.isLoadingRobotsTxt())
//log.logThreadSafe("handleRequest: other thread is loading");
// assert h.queuedRequests != null
// not thread safe
log.logThreadSafe("handleRequest: queued file " + url);
// consumed for now; the task puts queued messages back later
return null;
//log.logThreadSafe("handleRequest: no thread is loading; robots.txt loaded");
//log.logThreadSafe("handleRequest: checking if allowed");
String path = url.getPath();
// an empty path is treated as the root
if (path == null || path.equals(""))
path = "/";
if (h.isAllowed(path))
// log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " ok");
return message;
log.logThreadSafe("handleRequest: file " + urlMsg.getURLString() + " filtered");
catch (Exception e)
return null;
// request headers sent with every robots.txt request (User-Agent only)
private static volatile NVPair headers[] = new NVPair[1];
headers[0] = new HTTPClient.NVPair("User-Agent", Constants.CRAWLER_AGENT);
* the task that actually loads and parses the robots.txt files
* @author Clemens Marschner
* @created 17. Februar 2002
class RobotExclusionTask implements InterruptableTask
HostInfo hostInfo;
* Constructor for the RobotExclusionTask object
* @param hostInfo the record of the host whose robots.txt is to be loaded
public RobotExclusionTask(HostInfo hostInfo)
this.hostInfo = hostInfo;
* dummy
* @return always the empty string
public String getInfo()
return "";
* not used
public void interrupt() { }
* gets a robots.txt file and adds the information to the hostInfo
* structure
* @param thread the server thread (passed by the thread pool)
public void run(ServerThread thread)
// assert hostInfo != null;
String threadName = Thread.currentThread().getName();
log.logThreadSafe("task " + threadName + ": starting to load " + hostInfo.hostName);
String[] disallows = null;
boolean errorOccured = false;
log.logThreadSafe("task " + threadName + ": getting connection");
HTTPConnection conn = new HTTPConnection(hostInfo.hostName);
// wait at most 20 secs
HTTPResponse res = conn.Get("/robots.txt", (String) null, headers);
log.logThreadSafe("task " + threadName + ": got connection.");
// anything but 200 is treated as "no usable robots.txt"
if (res.getStatusCode() != 200)
errorOccured = true;
log.logThreadSafe("task " + threadName + ": reading");
byte[] file = res.getData(40000);
// max. 40 kb
log.logThreadSafe("task " + threadName + ": reading done. parsing");
disallows = parse(new BufferedReader(new InputStreamReader(new ByteArrayInputStream(file))));
log.logThreadSafe("task " + threadName + ": parsing done. found " + disallows.length + " disallows");
// assert disallows != null
// HostInfo hostInfo = hostManager.getHostInfo(this.hostName);
// assert hostInfo != null
log.logThreadSafe("task " + threadName + ": setting disallows");
catch (java.net.UnknownHostException e)
log.logThreadSafe("task " + threadName + ": unknown host. setting to unreachable");
errorOccured = true;
catch (java.net.NoRouteToHostException e)
log.logThreadSafe("task " + threadName + ": no route to. setting to unreachable");
errorOccured = true;
catch (java.net.ConnectException e)
log.logThreadSafe("task " + threadName + ": connect exception. setting to unreachable");
errorOccured = true;
catch (java.io.InterruptedIOException e)
// time out. fatal in this case
log.logThreadSafe("task " + threadName + ": time out. setting to unreachable");
errorOccured = true;
catch (Throwable e)
errorOccured = true;
log.log("task " + threadName + ": unknown exception: " + e.getClass().getName() + ": " + e.getMessage() + ". continuing");
// on error the host is marked as checked with no disallows
if (errorOccured)
synchronized (hostInfo)
hostInfo.setRobotsChecked(true, null);
// crawl everything
log.logThreadSafe("task " + threadName + ": error occured");
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
hostInfo.isLoadingRobotsTxt = false;
// success: store the parsed disallow list
synchronized (hostInfo)
hostInfo.setRobotsChecked(true, disallows);
log.logThreadSafe("task " + threadName + ": done");
log.logThreadSafe("task " + threadName + ": now put " + hostInfo.queuedRequests.size() + " queueud requests back");
hostInfo.isLoadingRobotsTxt = false;
* put back queued URLs
private void putBackURLs()
// re-inject every held-back message into the message handler
while (hostInfo.queuedRequests.size() > 0)
messageHandler.putMessage((Message) hostInfo.queuedRequests.remove());
log.logThreadSafe("task " + Thread.currentThread().getName() + ": finished");
hostInfo.queuedRequests = null;
* this parses the robots.txt file. It was taken from the PERL implementation
* Since this is only rarely called, it's not optimized for speed
* @param r the robots.txt file
* @return the disallows
* @exception IOException any IOException
public String[] parse(BufferedReader r)
throws IOException
// taken from Perl
Perl5Util p = new Perl5Util();
String line;
// isMe: the current record names this crawler; isAnon: it names "*"
boolean isMe = false;
boolean isAnon = false;
ArrayList disallowed = new ArrayList();
String ua = null;
while ((line = r.readLine()) != null)
if (p.match("/^#.*/", line))
// a comment
// strip a trailing comment: optional whitespace, '#', and everything after it.
// The former pattern had a stray space before the closing delimiter, so it
// only matched comments containing a space and left the text after the last
// space glued to the line.
line = p.substitute("s/\\s*\\#.*//", line);
// blank line
if (p.match("/^\\s*$/", line))
if (isMe)
else if (p.match("/^User-Agent:\\s*(.*)/i", line))
ua = p.group(1);
// drop trailing whitespace from the agent name
ua = p.substitute("s/\\s+$//", ua);
if (isMe)
else if (ua.equals("*"))
isAnon = true;
else if (Constants.CRAWLER_AGENT.startsWith(ua))
isMe = true;
else if (p.match("/^Disallow:\\s*(.*)/i", line))
// Disallow without a preceding User-Agent line: treat as "*"
if (ua == null)
isAnon = true;
// warn...
String disallow = p.group(1);
// NOTE(review): as visible, an empty Disallow value appears to end up as "/" --
// confirm; in the robots exclusion standard an empty value allows everything.
if (disallow != null && disallow.length() > 0)
// assume we have a relative path
disallow = "/";
if (isMe || isAnon)
// warn: unexpected line
String[] disalloweds = new String[disallowed.size()];
return disalloweds;
@ -0,0 +1,545 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.*;
import java.util.*;
import java.text.*;
import java.io.*;
import de.lanlab.larm.util.State;
import de.lanlab.larm.util.SimpleLoggerManager;
* this monitor takes a sample of every thread every x milliseconds,
* and logs a lot of information. In the near past it has evolved into the multi
* purpose monitoring and maintenance facility.
* At the moment it prints status information
* to log files and to the console
* @TODO this can be done better. Probably with an agent where different services
* can be registered to be called every X seconds
public class ThreadMonitor extends Observable implements Runnable
* a reference to the thread pool that's gonna be observed
private ThreadPool threadPool;
/**
 * One measurement point: the byte and document counters handed in by the
 * caller together with the time stamp of the measurement.
 */
class Sample
{
    /** byte counter value of this sample */
    long bytesRead;

    /** document counter value of this sample */
    long docsRead;

    /** time stamp of this sample */
    long time;

    /**
     * Stores the three values unchanged.
     *
     * @param bytesRead byte counter value
     * @param docsRead  document counter value
     * @param time      time stamp of the measurement
     */
    public Sample(long bytesRead, long docsRead, long time)
    {
        this.time = time;
        this.docsRead = docsRead;
        this.bytesRead = bytesRead;
    }
}
ArrayList bytesReadPerPeriod;
* time between two samples, in milliseconds
int sampleDelta;
* the thread where this monitor runs in. Will run with high priority
Thread thread;
URLVisitedFilter urlVisitedFilter;
URLScopeFilter urlScopeFilter;
// DNSResolver dnsResolver;
RobotExclusionFilter reFilter;
MessageHandler messageHandler;
URLLengthFilter urlLengthFilter;
HostManager hostManager;
public final static double KBYTE = 1024;
public final static double MBYTE = 1024 * KBYTE;
public final static double ONEGBYTE = 1024 * MBYTE;
String formatBytes(long lbytes)
double bytes = (double)lbytes;
if(bytes >= ONEGBYTE)
return fractionFormat.format((bytes/ONEGBYTE)) + " GB";
else if(bytes >= MBYTE)
return fractionFormat.format(bytes/MBYTE) + " MB";
else if(bytes >= KBYTE)
return fractionFormat.format(bytes/KBYTE) + " KB";
return fractionFormat.format(bytes) + " Bytes";
* a logfile where status information is posted
* FIXME: put that in a separate class (duplicated code in FetcherTask)
PrintWriter logWriter;
private SimpleDateFormat formatter
= new SimpleDateFormat ("hh:mm:ss:SSSS");
private DecimalFormat fractionFormat = new DecimalFormat("0.00");
long startTime = System.currentTimeMillis();
private void log(String text)
logWriter.println(formatter.format(new Date()) + ";" + (System.currentTimeMillis()-startTime) + ";" + text);
catch(Exception e)
System.out.println("Couldn't write to logfile");
* construct the monitor gets a reference to all monitored filters
* @param threadPool the pool to be observed
* @param sampleDelta time in ms between samples
/**
 * Constructs the monitor. It keeps a reference to every monitored component,
 * creates (but does not start) the monitor thread and opens
 * "logs/ThreadMonitor.log" for writing.
 *
 * @param urlLengthFilter  the length filter whose counter is reported
 * @param urlVisitedFilter the visited filter whose counters are reported
 * @param urlScopeFilter   the scope filter whose counter is reported
 * @param reFilter         the robot exclusion filter whose counter is reported
 * @param messageHandler   the message handler whose queue is reported
 * @param threadPool       the pool to be observed
 * @param hostManager      the host manager whose size is reported
 * @param sampleDelta      time in ms between samples
 */
public ThreadMonitor(URLLengthFilter urlLengthFilter,
                     URLVisitedFilter urlVisitedFilter,
                     URLScopeFilter urlScopeFilter,
                     /*DNSResolver dnsResolver,*/
                     RobotExclusionFilter reFilter,
                     MessageHandler messageHandler,
                     ThreadPool threadPool,
                     HostManager hostManager,
                     int sampleDelta)
{
    this.urlLengthFilter = urlLengthFilter;
    this.urlVisitedFilter = urlVisitedFilter;
    this.urlScopeFilter = urlScopeFilter;
    /* this.dnsResolver = dnsResolver;*/
    this.hostManager = hostManager;
    this.reFilter = reFilter;
    this.messageHandler = messageHandler;

    this.threadPool = threadPool;
    bytesReadPerPeriod = new ArrayList();
    this.sampleDelta = sampleDelta;
    this.thread = new Thread(this, "ThreadMonitor");

    try
    {
        // make sure the directory exists before a file is opened inside it;
        // mkdir() simply returns false if it is already there
        File logDir = new File("logs");
        logDir.mkdir();
        logWriter = new PrintWriter(new BufferedWriter(new FileWriter("logs/ThreadMonitor.log")));
    }
    catch(IOException e)
    {
        // the monitor keeps working without a log file; log() reports the failure on every call
        System.out.println("Couldn't create logfile (ThreadMonitor)");
    }
}
* java.lang.Threads run method. To be invoked via start()
* the monitor's main thread takes the samples every sampleDelta ms
* Since Java is not real time, it remembers
public void run()
int nothingReadCount = 0;
long lastPeriodBytesRead = -1;
long monitorRunCount = 0;
long startTime = System.currentTimeMillis();
catch(InterruptedException e)
Iterator threadIterator = threadPool.getThreadIterator();
int i=0;
StringBuffer bytesReadString = new StringBuffer(200);
StringBuffer rawBytesReadString = new StringBuffer(200);
StringBuffer tasksRunString = new StringBuffer(200);
long overallBytesRead = 0;
long overallTasksRun = 0;
long now = System.currentTimeMillis();
boolean finished = false;
boolean restart = false;*/
boolean allThreadsIdle = true;
StringBuffer sb = new StringBuffer(500);
FetcherThread thread = (FetcherThread)threadIterator.next();
long totalBytesRead = thread.getTotalBytesRead();
overallBytesRead += totalBytesRead;
bytesReadString.append(formatBytes(totalBytesRead)).append( "; ");
rawBytesReadString.append(totalBytesRead).append("; ");
long tasksRun = thread.getTotalTasksRun();
overallTasksRun += tasksRun;
tasksRunString.append(tasksRun).append("; ");
// check task status
State state = thread.getTaskState();
//StringBuffer sb = new StringBuffer(200);
System.out.println(sb + "[" + thread.getThreadNumber() + "] " + state.getState() + " for " +
(now - state.getStateSince() ) + " ms " +
(state.getInfo() != null ? "(" + state.getInfo() +")" : "")
//if(allThreadsIdle) System.out.println("(not all threads are idle, '"+state.getState()+"' != '"+FetcherThread.STATE_IDLE+"')");
allThreadsIdle = false;
if (((state.equals(FetcherTask.FT_CONNECTING)) || (state.equals(FetcherTask.FT_GETTING)) || (state.equals(FetcherTask.FT_READING)) || (state.equals(FetcherTask.FT_CLOSING)))
&& ((now - state.getStateSince()) > 160000))
System.out.println("****Restarting Thread " + thread.getThreadNumber());
break; // Iterator is invalid
finished = true;
if(overallBytesRead == lastPeriodBytesRead)
disabled kickout feature - cm
nothingReadCount ++;
System.out.println("Anomaly: nothing read during the last period(s). " + (20-nothingReadCount+1) + " periods to exit");
if(nothingReadCount > 20) // nothing happens anymore
System.out.println("End at " + new Date().toString());
// print some information
nothingReadCount = 0;
lastPeriodBytesRead = overallBytesRead;
//State reState = new State("hhh"); //reFilter.getState();
//System.out.println(sb + "Robot-Excl.Filter State: " + reState.getState() + " since " + (now-reState.getStateSince()) + " ms " + (reState.getInfo() != null ? " at " + reState.getInfo() : ""));
addSample(new Sample(overallBytesRead, overallTasksRun, System.currentTimeMillis()));
int nrHosts = ((FetcherTaskQueue)threadPool.getTaskQueue()).getNumHosts();
int visitedSize = urlVisitedFilter.size();
int visitedStringSize = urlVisitedFilter.getStringSize();
double bytesPerSecond = getAverageBytesRead();
double docsPerSecond = getAverageDocsRead();
System.out.println(sb + "\nBytes total: " + formatBytes(overallBytesRead) + " (" + formatBytes((long)(((double)overallBytesRead)*1000/(System.currentTimeMillis()-startTime))) + " per second since start)" +
"\nBytes per Second: " + formatBytes((int)bytesPerSecond) + " (50 secs)" +
"\nDocs per Second: " + docsPerSecond +
"\nBytes per Thread: " + bytesReadString);
double docsPerSecondTotal = ((double)overallTasksRun)*1000/(System.currentTimeMillis()-startTime);
System.out.println(sb + "Docs read total: " + overallTasksRun + " Docs/s: " + fractionFormat.format(docsPerSecondTotal) +
"\nDocs p.thread: " + tasksRunString);
long memUsed = Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory();
long memFree = Runtime.getRuntime().freeMemory();
long totalMem = Runtime.getRuntime().totalMemory();
System.out.println(sb + "Mem used: " + formatBytes(memUsed) + ", free: " + formatBytes(memFree) + " total VM: " + totalMem);
int urlsQueued = messageHandler.getQueued();
int urlsWaiting = threadPool.getQueueSize();
boolean isWorkingOnMessage = messageHandler.isWorkingOnMessage();
int urlsScopeFiltered = urlScopeFilter.getFiltered();
int urlsVisitedFiltered = urlVisitedFilter.getFiltered();
int urlsREFiltered = reFilter.getFiltered();
int urlLengthFiltered = urlLengthFilter.getFiltered();
System.out.println(sb + "URLs queued: " + urlsQueued + " waiting: " + urlsWaiting);
System.out.println(sb + "Message is being processed: " + isWorkingOnMessage);
System.out.println(sb + "URLs Filtered: length: " + urlLengthFiltered + " scope: " + urlsScopeFiltered + " visited: " + urlsVisitedFiltered + " robot.txt: " + urlsREFiltered);
System.out.println(sb + "Visited size: " + visitedSize + "; String Size in VisitedFilter: " + visitedStringSize + "; Number of Hosts: " + nrHosts + "; hosts in Host Manager: " + hostManager.getSize() + "\n");
log(sb + "" + now + ";" + overallBytesRead + ";" + overallTasksRun + ";" + urlsQueued + ";" + urlsWaiting + ";" + isWorkingOnMessage + ";" + urlsScopeFiltered + ";" + urlsVisitedFiltered + ";" + urlsREFiltered + ";" + memUsed + ";" + memFree + ";" + totalMem + ";" + nrHosts + ";" + visitedSize + ";" + visitedStringSize + ";" + rawBytesReadString + ";" + urlLengthFiltered);
if(!isWorkingOnMessage && (urlsQueued == 0) && (urlsWaiting == 0) && allThreadsIdle)
if(nothingReadCount > 3)
nothingReadCount = 0;
// Request Garbage Collection
if(monitorRunCount % 6 == 0)
if(monitorRunCount % 2 == 0)
catch(Exception e)
System.out.println("Monitor: Exception: " + e.getClass().getName());
* start the thread
public void start()
* interrupt the monitor thread
public void interrupt()
public synchronized void clear()
/*for(int i=0; i < timeSamples.length; i++)
/* public synchronized double getAverageReadCount(int maxPeriods)
int lastPeriod = bytesReadPerPeriod.size()-1;
int periods = Math.min(lastPeriod, maxPeriods);
if(periods < 2)
return 0.0;
long bytesLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).bytesRead;
long bytesBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).bytesRead;
long bytesRead = bytesLastPeriod - bytesBeforePeriod;
long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue();
long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1 - periods)).longValue();
long duration = endTime - startTime;
System.out.println("bytes read: " + bytesRead + " duration in s: " + duration/1000.0 + " = " + ((double)bytesRead) / (duration/1000.0) + " per second");
return ((double)bytesRead) / (duration/1000.0);
/*public synchronized double getDocsPerSecond(int maxPeriods)
int lastPeriod = bytesReadPerPeriod.size()-1;
int periods = Math.min(lastPeriod, maxPeriods);
if(periods < 2)
return 0.0;
long docsLastPeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod)).docsRead;
long docsBeforePeriod = ((Sample)bytesReadPerPeriod.get(lastPeriod - periods)).docsRead;
long docsRead = docsLastPeriod - docsBeforePeriod;
long endTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-1)).longValue();
long startTime = ((Long)sampleTimeStamps.get(sampleTimeStamps.size() - periods)).longValue();
long duration = endTime - startTime;
System.out.println("docs read: " + docsRead + " duration in s: " + duration/1000.0 + " = " + ((double)docsRead) / (duration/1000.0) + " per second");
return ((double)docsRead) / (duration/1000.0);
* retrieves the number of threads whose byteCount is below the threshold
* @param maxPeriods the number of periods to look back
* @param threshold the number of bytes per second that acts as the threshold for a stalled thread
/*public synchronized int getStalledThreadCount(int maxPeriods, double threshold)
int periods = Math.min(sampleTimeStamps.size(), maxPeriods);
int stalledThreads = 0;
int j=0, i=0;
if(periods > 1)
for(j=0; j<timeSamples.length; j++)
long threadByteCount = 0;
ArrayList actArrayList = timeSamples[j];
double bytesPerSecond = 0;
for(i=0; i<periods; i++)
Sample actSample = (Sample)(actArrayList.get(i));
threadByteCount += actSample.bytesRead;
catch(Exception e)
System.out.println("getAverageReadCount: " + e.getClass().getName() + ": " + e.getMessage() + "(" + i + ";" + j + ")");
bytesPerSecond = ((double)threadByteCount) /
- ((Long)sampleTimeStamps.get(sampleTimeStamps.size()-periods)).longValue()) * 1000.0;
if(bytesPerSecond < threshold)
return stalledThreads;
int samples=0;
public void addSample(Sample s)
if(samples < 10)
bytesReadPerPeriod.set(samples % 10, s);
public double getAverageBytesRead()
Iterator i = bytesReadPerPeriod.iterator();
Sample oldest = null;
Sample newest = null;
Sample s = (Sample)i.next();
if(oldest == null)
oldest = newest = s;
if(s.time < oldest.time)
oldest = s;
else if(s.time > newest.time)
newest = s;
return ((newest.bytesRead - oldest.bytesRead)/((newest.time - oldest.time)/1000.0));
public double getAverageDocsRead()
Iterator i = bytesReadPerPeriod.iterator();
Sample oldest = null;
Sample newest = null;
Sample s = (Sample)i.next();
if(oldest == null)
oldest = newest = s;
if(s.time < oldest.time)
oldest = s;
else if(s.time > newest.time)
newest = s;
return ((newest.docsRead - oldest.docsRead)/((newest.time - oldest.time)/1000.0));
@ -0,0 +1,69 @@
package de.lanlab.larm.fetcher;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @created 28. Januar 2002
* @version 1.0
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* kills URLs longer than X characters. Used to prevent endless loops where
* the page contains the current URL + some extension
* @author Clemens Marschner
* @created 28. Januar 2002
public class URLLengthFilter extends Filter implements MessageListener
* called by the message handler
* @param handler the handler
public void notifyAddedToMessageHandler(MessageHandler handler)
this.messageHandler = handler;
MessageHandler messageHandler;
int maxLength;
* Constructor for the URLLengthFilter object
* @param maxLength max length of the _total_ URL (protocol+host+port+path)
public URLLengthFilter(int maxLength)
this.maxLength = maxLength;
* handles the message
* @param message Description of the Parameter
* @return the original message or NULL if the URL was too long
public Message handleRequest(Message message)
URLMessage m = (URLMessage) message;
String file = m.getUrl().getFile();
if (file != null && file.length() > maxLength) // path + query
return null;
return message;
@ -0,0 +1,87 @@
package de.lanlab.larm.fetcher;
import java.net.*;
import java.io.*;
import de.lanlab.larm.util.URLUtils;
* represents a URL which is passed around in the messageHandler
public class URLMessage implements Message, Serializable
* the URL
protected URL url;
protected String urlString;
protected URL referer;
protected String refererString;
boolean isFrame;
public URLMessage(URL url, URL referer, boolean isFrame)
this.url = url;
this.urlString = url != null ? URLUtils.toExternalFormNoRef(url) : null;
this.referer = referer;
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
this.isFrame = isFrame;
//System.out.println("" + refererString + " -> " + urlString);
public URL getUrl()
return this.url;
public URL getReferer()
return this.referer;
public String toString()
return urlString;
public String getURLString()
return urlString;
public String getRefererString()
return refererString;
public int hashCode()
return url.hashCode();
private void writeObject(java.io.ObjectOutputStream out) throws IOException
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException
url = (URL)in.readObject();
referer = (URL)in.readObject();
urlString = url.toExternalForm();
refererString = referer.toExternalForm();
isFrame = in.readBoolean();
public String getInfo()
return (referer != null ? refererString : "<start>") + "\t" + urlString + "\t" + (isFrame ? "1" : "0");
@ -0,0 +1,75 @@
package de.lanlab.larm.fetcher;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Pattern;
* Filter class; checks an incoming message against a
* regular expression. If the URL does not match this expression,
* it is discarded
* @author Clemens Marschner
class URLScopeFilter extends Filter implements MessageListener
public void notifyAddedToMessageHandler(MessageHandler handler)
this.messageHandler = handler;
MessageHandler messageHandler;
* the regular expression which describes a valid URL
private Pattern pattern;
private Perl5Matcher matcher;
private Perl5Compiler compiler;
public URLScopeFilter()
matcher = new Perl5Matcher();
compiler = new Perl5Compiler();
public String getRexString()
return pattern.toString();
* set the regular expression
* @param rexString the expression
public void setRexString(String rexString) throws org.apache.oro.text.regex.MalformedPatternException
this.pattern = compiler.compile(rexString, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.SINGLELINE_MASK);
//System.out.println("pattern set to: " + pattern);
* this method will be called by the message handler. Tests the URL
* and throws it out if it's not in the scope
public Message handleRequest(Message message)
if(message instanceof URLMessage)
String urlString = ((URLMessage)message).toString();
int length = urlString.length();
char buffer[] = new char[length];
//System.out.println("using pattern: " + pattern);
boolean match = matcher.matches(buffer, pattern);
//System.out.println("not in Scope: " + urlString);
return null;
return message;
@ -0,0 +1,114 @@
package de.lanlab.larm.fetcher;
import java.net.URL;
import java.util.*;
import de.lanlab.larm.util.SimpleLogger;
* contains a HashMap of all URLs already passed. Adds each URL to that list, or
* consumes it if it is already present
* @todo find ways to reduce memory consumption here. the approach is somewhat naive
* @author Clemens Marschner
* @created 3. Januar 2002
class URLVisitedFilter extends Filter implements MessageListener
* Description of the Method
* @param handler Description of the Parameter
public void notifyAddedToMessageHandler(MessageHandler handler)
this.messageHandler = handler;
MessageHandler messageHandler;
SimpleLogger log;
HashSet urlHash;
static Boolean dummy = new Boolean(true);
* Constructor for the URLVisitedFilter object
* @param initialHashCapacity Description of the Parameter
public URLVisitedFilter(int initialHashCapacity, SimpleLogger log)
urlHash = new HashSet(initialHashCapacity);
this.log = log;
//urlVector = new Vector(initialHashCapacity);
* clears everything
public void clearHashtable()
// urlVector.clear();
* @param message Description of the Parameter
* @return Description of the Return Value
public Message handleRequest(Message message)
if (message instanceof URLMessage)
URLMessage urlMessage = ((URLMessage) message);
URL url = urlMessage.getUrl();
String urlString = urlMessage.getURLString();
if (urlHash.contains(urlString))
//System.out.println("URLVisitedFilter: " + urlString + " already present.");
if(log != null)
return null;
// System.out.println("URLVisitedFilter: " + urlString + " not present yet.");
stringSize += urlString.length(); // see below
return message;
private int stringSize = 0;
* just a method to get a rough number of characters contained in the array
* with that you see that the total memory is mostly used by this class
public int getStringSize()
return stringSize;
public int size()
return urlHash.size();
@ -0,0 +1,875 @@
package de.lanlab.larm.graph;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.io.*;
import java.util.*;
* Description of the Class
* @author Administrator
* @created 30. Januar 2002
class Node implements Comparable
LinkedList incoming;
// 16 + 4 per entry
//HashSet incomingNodes; // 16 + 16 per entry, 11 x 16 default size = 192
LinkedList outgoing;
// 16 + 4 per entry
//Object o;
//HashSet outgoingNodes; // 16 + 16 per entry, 11 x 16 default size = 192
//LinkedList shortestIncoming;
int id;
// 4
float distance;
// 8
String name;
// 4 + String object
String title;
// 4 + String object
float nodeRank[] = new float[2];
// 16
// 470 bytes + 2 string objects
* Description of the Field
public static int sortType = 0;
* Description of the Method
* @param n Description of the Parameter
* @return Description of the Return Value
public int compareTo(Object n)
if (sortType < 2)
double diff = ((Node) n).nodeRank[sortType] - nodeRank[sortType];
return diff < 0 ? -1 : diff > 0 ? 1 : 0;
return (((Node) n).incoming.size() - incoming.size());
* Constructor for the Node object
* @param id Description of the Parameter
* @param name Description of the Parameter
* @param title Description of the Parameter
public Node(int id, String name, String title)
this.id = id;
this.name = name;
this.title = title;
this.incoming = new LinkedList();
this.outgoing = new LinkedList();
//this.incomingNodes = new HashSet();
//this.outgoingNodes = new HashSet();
this.distance = Float.MAX_VALUE;
this.nodeRank[0] = this.nodeRank[1] = 1;
* Adds a feature to the Incoming attribute of the Node object
* @param incomingT The feature to be added to the Incoming attribute
* @return Description of the Return Value
public boolean addIncoming(Transition incomingT)
Integer id = new Integer(incomingT.getFrom().id);
if (!incoming.contains(id))
// attn: doesn't scale well, but also saves memory
return true;
return false;
* Adds a feature to the Outgoing attribute of the Node object
* @param outgoingT The feature to be added to the Outgoing attribute
* @return Description of the Return Value
public boolean addOutgoing(Transition outgoingT)
Integer id = new Integer(outgoingT.getTo().id);
if (!outgoing.contains(id))
return true;
return false;
* Gets the incoming attribute of the Node object
* @return The incoming value
public LinkedList getIncoming()
return incoming;
* Gets the outgoing attribute of the Node object
* @return The outgoing value
public LinkedList getOutgoing()
return outgoing;
* Sets the distance attribute of the Node object
* @param distance The new distance value
public void setDistance(float distance)
this.distance = distance;
* Gets the distance attribute of the Node object
* @return The distance value
public float getDistance()
return distance;
* Gets the name attribute of the Node object
* @return The name value
public String getName()
return name;
* Sets the title attribute of the Node object
* @param title The new title value
public void setTitle(String title)
this.title = title;
* Gets the title attribute of the Node object
* @return The title value
public String getTitle()
return title;
* Gets the nodeRank attribute of the Node object
* @param idx Description of the Parameter
* @return The nodeRank value
public float getNodeRank(int idx)
return nodeRank[idx];
* Sets the nodeRank attribute of the Node object
* @param nodeRank The new nodeRank value
* @param idx The new nodeRank value
public void setNodeRank(float nodeRank, int idx)
this.nodeRank[idx] = nodeRank;
* Description of the Class
* @author Administrator
* @created 30. Januar 2002
class Transition
Node from;
Node to;
float distance;
float linkRank[] = new float[2];
boolean isFrame;
* Constructor for the Transition object
* @param from Description of the Parameter
* @param to Description of the Parameter
* @param isFrame Description of the Parameter
public Transition(Node from, Node to, boolean isFrame)
LinkedList l = from.getOutgoing();
Iterator i = l.iterator();
Transition t = (Transition)i.next();
if(t.getTo() == to)
return; // schon enthalten
this.from = from;
this.to = to;
this.distance = Integer.MAX_VALUE;
this.isFrame = isFrame;
this.linkRank[0] = this.linkRank[1] = 1;
* Gets the to attribute of the Transition object
* @return The to value
public Node getTo()
return to;
* Gets the from attribute of the Transition object
* @return The from value
public Node getFrom()
return from;
* Gets the distance attribute of the Transition object
* @return The distance value
public float getDistance()
return distance;
* Sets the distance attribute of the Transition object
* @param distance The new distance value
public void setDistance(float distance)
this.distance = distance;
* Gets the frame attribute of the Transition object
* @return The frame value
public boolean isFrame()
return isFrame;
* Gets the linkRank attribute of the Transition object
* @param idx Description of the Parameter
* @return The linkRank value
public float getLinkRank(int idx)
return linkRank[idx];
* Sets the linkRank attribute of the Transition object
* @param linkRank The new linkRank value
* @param idx The new linkRank value
public void setLinkRank(float linkRank, int idx)
this.linkRank[idx] = linkRank;
* Description of the Class
* @author Administrator
* @created 30. Januar 2002
public class DistanceCount
HashMap nodes = new HashMap(100000);
LinkedList nodesToDo = new LinkedList();
static int id = 0;
* Gets the orCreateNode attribute of the DistanceCount object
* @param name Description of the Parameter
* @param title Description of the Parameter
* @return The orCreateNode value
Node getOrCreateNode(String name, String title)
Node node = (Node) nodes.get(name);
if (node != null)
if (title != null)
return node;
node = new Node(id++, name, title);
nodes.put(name, node);
return node;
* Constructor for the DistanceCount object
* @param filename Description of the Parameter
* @exception IOException Description of the Exception
public DistanceCount(String filename)
throws IOException
System.out.println("reading file...");
long t1 = System.currentTimeMillis();
BufferedReader b = new BufferedReader(new FileReader(filename));
String line;
boolean firstNotFound = true;
Node firstNode = null;
int lines = 0;
while ((line = b.readLine()) != null)
String title = null;
//StringTokenizer st = new StringTokenizer(line, " ");
StringTokenizer st = new StringTokenizer(line, "\t");
String from = st.nextToken();
if (from.endsWith("/"))
from = from.substring(0, from.length() - 1);
from = from.toLowerCase();
String to = st.nextToken();
if (to.endsWith("/"))
to = to.substring(0, to.length() - 1);
to = to.toLowerCase();
boolean isFrame = (Integer.parseInt(st.nextToken()) == 1);
if (st.countTokens() > 3)
title = "<untitled>";
//StringBuffer sb = new StringBuffer();
// result
// Mime Type
// Size
* while(st.hasMoreTokens())
* {
* sb.append(st.nextToken()).append(" ");
* }
title = st.nextToken();
if (title.length() > 2)
title = title.substring(1, title.length() - 1);
int indexOfPara = title.indexOf("\"");
if (indexOfPara > -1)
title = title.substring(0, indexOfPara);
Node fromNode = getOrCreateNode(from, null);
Node toNode = getOrCreateNode(to, title);
Transition t = new Transition(fromNode, toNode, isFrame);
* if(firstNotFound && to.equals(""))
* {
* firstNode = toNode;
* firstNotFound = false;
* }
if (lines % 10000 == 0)
System.out.println("" + lines + " Lines; " + nodes.size() + " nodes");
catch (NoSuchElementException e)
System.out.println("Malformed line " + lines + ": field number doesn't match");
catch (NumberFormatException e)
System.out.println("Malformed line " + lines + ": NumberFormat wrong");
System.out.println("finished; b" + lines + " Lines; " + nodes.size() + " nodes");
long t2 = System.currentTimeMillis();
System.out.println("" + (t2 - t1) + " ms");
* if(firstNotFound)
* {
* System.out.println("Couldn't find start page");
* System.exit(-1);
* }
* Description of the Method
* @param firstNode Description of the Parameter
public void calculateShortestDistance(Node firstNode)
int calculations = 0;
while (!nodesToDo.isEmpty())
if (calculations % 100000 == 0)
System.out.println("Calculations: " + calculations + "; nodes to go: " + nodesToDo.size() + " total Mem: " + Runtime.getRuntime().totalMemory() + "; free mem: " + Runtime.getRuntime().freeMemory());
Node act = (Node) nodesToDo.removeFirst();
LinkedList outTrans = act.getOutgoing();
float distance = act.getDistance();
Iterator i = outTrans.iterator();
while (i.hasNext())
Transition t = (Transition) i.next();
float transDistance = t.getDistance();
/*if (t.isFrame())
System.out.println("Frame from " + t.from.getName() + " to " + t.to.getName());
float newDistance = distance + (t.isFrame() ? 0.25f : 1f);
if (transDistance > newDistance)
Node to = t.getTo();
if (to.distance > distance)
* if(looksGood)
* {
* System.out.println("Node " + act.id + " looks good");
* }
System.out.println("Calculations: " + calculations );
public void clearDistances()
System.out.println("Clearing distance data...");
Iterator it = nodes.values().iterator();
int nr = 0;
while (it.hasNext())
Node n = (Node) it.next();
System.out.println("cleared " + nr + " nodes. done");
* Description of the Method
* @param nodeFrom Description of the Parameter
* @param nodeTo Description of the Parameter
public void printDistance(String nodeFrom, String nodeTo)
Node firstNode = (Node) nodes.get(nodeFrom);
if (firstNode == null)
System.out.println("FROM node not found");
Node toNode = (Node) nodes.get(nodeTo);
if (toNode == null)
System.out.println("TO node not found");
//System.out.println("resetting node distance...");
//t1 = System.currentTimeMillis();
//System.out.println("" + (t1-t2) + " ms");
* Collection nodeCollection = nodes.values();
* Object[] nodeArray = nodeCollection.toArray();
* Arrays.sort(nodeArray);
* t2 = System.currentTimeMillis();
* System.out.println("" + (t2-t1) + " ms");
* int from = 0;
* int to = 1;
* /calculate page Rank
* for(int i = 0; i< 1; i++)
* {
* from = i%2;
* to = (i+1) % 2;
* for(int j = 0; j<nodeArray.length; j++)
* {
* Node act = (Node)nodeArray[j];
* LinkedList inc = act.getIncoming();
* float pageRank = 0;
* Iterator it = inc.iterator();
* while(it.hasNext())
* {
* Transition t = (Transition)it.next();
* pageRank += t.getLinkRank(from);
* }
* act.setNodeRank(pageRank, to);
* LinkedList out = act.getOutgoing();
* int size = out.size();
* if(size > 0)
* {
* float linkRank = pageRank / size;
* it = out.iterator();
* while(it.hasNext())
* {
* Transition t = (Transition)it.next();
* t.setLinkRank(linkRank, to);
* }
* }
* }
* }
* System.out.println("\nLink Count:");
* for(int i=0; i<10; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* for(int i=nodeArray.length/2; i<nodeArray.length/2+10; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* for(int i=nodeArray.length-10; i<nodeArray.length; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* Node.sortType = to;
* Arrays.sort(nodeArray);
* System.out.println("\nPageRank Count:");
* for(int i=0; i<10; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* for(int i=nodeArray.length/2; i<nodeArray.length/2+10; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* for(int i=nodeArray.length-10; i<nodeArray.length; i++)
* {
* Node n = ((Node)nodeArray[i]);
* System.out.println("Node " + n.name + ": " + n.getIncoming().size() + "; pageRank: " + n.getNodeRank(to));
* }
* System.out.println("\nStats...");
* float distanceAccumulated=0;
* float distanceMax = 0;
* int notCounted = 0;
* for(int j = 0; j<nodeArray.length; j++)
* {
* Node n = (Node)nodeArray[j];
* if(n.distance != Integer.MAX_VALUE)
* {
* distanceAccumulated += n.distance;
* distanceMax = Math.max(distanceMax, n.distance);
* }
* else
* {
* notCounted++;
* }
* }
* System.out.println("Mean Distance: " + ((double)distanceAccumulated)/nodeArray.length);
* System.out.println("Max Distance: " + (distanceMax));
* System.out.println("Not reachable nodes(?): " + notCounted);
* System.out.println("Referer Median: " + ((Node)(nodeArray[Math.round(nodeArray.length/2)])).incoming.size());
* System.out.println("\nSamples:");
printShortestRoute(toNode, 0,0);
* Description of the Method
public void printRandomRoute()
Random r = new java.util.Random(System.currentTimeMillis());
Collection nodeColl = nodes.values();
Object[] nodeArray = (Object[])nodeColl.toArray();
int rnd = (int) (r.nextDouble() * nodeArray.length);
Node from = (Node) nodeArray[rnd];
rnd = (int) (r.nextDouble() * nodeArray.length);
Node to = (Node) nodeArray[rnd];
System.out.println("Calculating distance...");
printShortestRoute(to, 0,0);
* Description of the Method
* @param n Description of the Parameter
* @param indent Description of the Parameter
// Prints node n (name, up to 25 characters of its title, distance, number of incoming links,
// accumulated link count) and then recurses into the source node of incoming transitions
// whose distance is <= n.distance.
// NOTE(review): block braces and possibly else/return lines are not visible in this copy,
// so the exact nesting of the statements below has to be checked against the original file.
public void printShortestRoute(Node n, int indent, int linkCount)
// NOTE(review): the padding literal appears as a single space here; substring(0, indent)
// needs it to be at least `indent` characters long -- presumably whitespace was collapsed
String spaces = " ".substring(0, indent);
// a node without incoming transitions is reported as the start of the route
if (n.getIncoming().isEmpty())
System.out.println(spaces + "<start>");
System.out.print(spaces + "+- " + n.name + " (" + (n.getTitle() != null ? n.getTitle().substring(0,Math.min(n.getTitle().length(),25)) : "") + "\") D:" + n.distance + "; L:" + n.getIncoming().size() + "; C:" + linkCount);
Iterator it = n.getIncoming().iterator();
float dist = n.distance;
// very large distances are reported as "no link"
// NOTE(review): presumably these are nodes still at their initial "unreached" value -- confirm
if (dist > 10000000)
System.out.println(spaces + "\n--no link--");
while (it.hasNext())
Transition t = (Transition) it.next();
// only follow transitions that do not lead further away than n itself
if (t.distance <= dist)
// frame links are marked with **F** in the output
if (t.isFrame())
System.out.println(" **F** ->");
System.out.println(" -> ");
// one level deeper; the link count grows by the number of links pointing at n
printShortestRoute(t.getFrom(), indent + 1, linkCount + n.getIncoming().size());
* This class reads in store.log, constructs a graph of the crawled web, and is able
* to perform a breadth-first search for the shortest distance between two nodes.<br>
* Note: this is experimental code; read the source to see how it works.
* @param args args[0] must point to the store.log file
// Interactive console front end: builds a DistanceCount from the log file named in args[0]
// and then reads one command per line from standard input until "q" is entered.
// NOTE(review): the try/else keywords and all braces are not visible in this copy of the
// file, so the nesting of the catch clauses below must be checked against the original.
public static void main(String[] args)
// Syntax: DistanceCount <store.log>
DistanceCount dc = new DistanceCount(args[0]);
boolean running = true;
BufferedReader in = new BufferedReader(new InputStreamReader(System.in),400);
while (running)
System.out.print("\n\nCommand (? for help) > ");
String newL;
String input = "";
//while((newL = in.readLine()) != null)
// NOTE(review): readLine() returns null at end of input; the tokenizer below would then
// throw, and whether the loop terminates in that case is not visible here -- confirm
input = in.readLine();
StringTokenizer st = new StringTokenizer(input," ");
String command;
boolean printHelp = false;
// an empty line is treated like the help command
if (!st.hasMoreTokens())
printHelp = true;
command = "?";
command = st.nextToken();
if ("?".equals(command))
printHelp = true;
// "d <from> <to>": print the distance between two pages
else if ("d".equals(command))
String from = st.nextToken();
String to = st.nextToken();
dc.printDistance(from ,to);
else if ("q".equals(command))
running = false;
// NOTE(review): the action for "r" (random walk, per the help text) is not visible here
else if ("r".equals(command))
System.out.println("unknown command '" + command + "'");
// raised by nextToken() when a command is given too few arguments
catch (java.util.NoSuchElementException e)
System.out.println("Syntax error");
printHelp = true;
// NOTE(review): no handler statement is visible for this catch -- confirm it is not swallowed
catch(Exception e)
if (printHelp)
System.out.println("\nSyntax\n" +
"? print this help message\n" +
"d <page1> <page2> print shortest route from page1 to page2\n" +
"r print random walk\n" +
"q quit");
catch (IOException e)
// args[0] missing: print the command-line usage
catch (ArrayIndexOutOfBoundsException e)
System.out.println("Syntax: java ... store.log");
@ -0,0 +1,154 @@
package de.lanlab.larm.gui;
A basic extension of the java.awt.Dialog class
import java.awt.*;
// "About" dialog of the LARM GUI: an AWT Dialog with two text labels and an OK button.
// The class follows the Visual Cafe code-generator layout (init block in the constructor,
// SymAction/SymWindow listener classes, *_Interaction1 handler methods).
// NOTE(review): many single-statement lines (braces, add()/setBounds() calls, listener
// registration, handler bodies) are not visible in this copy of the file.
public class AboutDialog extends Dialog {
// Creates the dialog for the given parent frame and sets up its labels and title.
public AboutDialog(Frame parent, boolean modal)
super(parent, modal);
// This code is automatically generated by Visual Cafe when you add
// components to the visual environment. It instantiates and initializes
// the components. To modify the code, only use code syntax that matches
// what Visual Cafe can generate, or Visual Cafe may be unable to back
// parse your Java file into its visual environment.
label1.setText("LARM - LANLab Retrieval Machine");
label2.setText("(C) 2000 Clemens Marschner");
setTitle("AWT-Anwendung - Info");
// listener objects; NOTE(review): their registration calls are not visible here
SymWindow aSymWindow = new SymWindow();
SymAction lSymAction = new SymAction();
// Convenience constructor; NOTE(review): no visible line uses `title` -- confirm
public AboutDialog(Frame parent, String title, boolean modal)
this(parent, modal);
// Grows the dialog by its insets and shifts the child components accordingly, once.
public void addNotify()
// Record the size of the window prior to calling parents addNotify.
Dimension d = getSize();
// Only do this once.
if (fComponentsAdjusted)
// Adjust components according to the insets
Insets insets = getInsets();
setSize(insets.left + insets.right + d.width, insets.top + insets.bottom + d.height);
Component components[] = getComponents();
for (int i = 0; i < components.length; i++)
// getLocation() hands back a Point; NOTE(review): the call that writes the translated
// point back to the component is not visible here
Point p = components[i].getLocation();
p.translate(insets.left, insets.top);
// Used for addNotify check.
fComponentsAdjusted = true;
// When shown, centres the dialog over the bounds of its parent.
public void setVisible(boolean b)
if (b)
Rectangle bounds = getParent().getBounds();
Rectangle abounds = getBounds();
setLocation(bounds.x + (bounds.width - abounds.width)/ 2,
bounds.y + (bounds.height - abounds.height)/2);
// dialog controls
java.awt.Label label1 = new java.awt.Label();
java.awt.Button okButton = new java.awt.Button();
java.awt.Label label2 = new java.awt.Label();
// Used for addNotify check.
boolean fComponentsAdjusted = false;
// Routes action events by source; only okButton is checked.
class SymAction implements java.awt.event.ActionListener
public void actionPerformed(java.awt.event.ActionEvent event)
Object object = event.getSource();
if (object == okButton)
void okButton_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
// NOTE(review): the statement inside this try block is not visible here
void okButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
} catch (Exception e) {
// Routes window-closing events that originate from this dialog.
class SymWindow extends java.awt.event.WindowAdapter
public void windowClosing(java.awt.event.WindowEvent event)
Object object = event.getSource();
if (object == AboutDialog.this)
void AboutDialog_WindowClosing(java.awt.event.WindowEvent event)
// to do: code goes here.
// NOTE(review): the statement inside this try block is not visible here
void AboutDialog_WindowClosing_Interaction1(java.awt.event.WindowEvent event)
try {
} catch (Exception e) {
@ -0,0 +1,485 @@
package de.lanlab.larm.gui;
This simple extension of the java.awt.Frame class
contains all the elements necessary to act as the
main window of an application.
import java.awt.*;
import java.awt.event.ActionListener;
//import com.sun.java.swing.*;
// Main AWT window of the LARM fetcher: start-URL / restriction inputs, a log list, and four
// list panels (URL queue, URL threads, doc queue, doc threads), plus a menu bar.
// The class follows the Visual Cafe code-generator layout.
// NOTE(review): many single-statement lines (braces, add()/setBounds()/setLabel() calls,
// listener registration and most method bodies) are not visible in this copy of the file.
public class FetcherFrame extends Frame
// Builds the component tree and the menu shortcuts.
public FetcherFrame()
// This code is automatically generated by Visual Cafe when you add
// components to the visual environment. It instantiates and initializes
// the components. To modify the code, only use code syntax that matches
// what Visual Cafe can generate, or Visual Cafe may be unable to back
// parse your Java file into its visual environment.
setLayout(new BorderLayout(0,0));
//$$ openFileDialog1.move(24,312);
// the four *Border panels surround the central mainPanel
mainPanelWithBorders.setLayout(new BorderLayout(0,0));
add("Center", mainPanelWithBorders);
mainPanelWithBorders.add("North", northBorder);
mainPanelWithBorders.add("South", southBorder);
mainPanelWithBorders.add("West", westBorder);
mainPanelWithBorders.add("East", eastBorder);
mainPanel.setLayout(new BorderLayout(0,3));
mainPanelWithBorders.add("Center", mainPanel);
// upper area of the main panel
upperPanel.setLayout(new GridLayout(1,2,0,0));
mainPanel.add("North", upperPanel);
startButton.setFont(new Font("Dialog", Font.BOLD, 12));
restrictToLabel.setText("Restrict host to");
logPanel.setLayout(new BorderLayout(0,0));
logPanel.add("Center", logList);
// lower area: each list panel has its label on top and its list in the centre
lowerPanel.setLayout(new GridLayout(1,3,3,3));
mainPanel.add("Center", lowerPanel);
urlQueuePanel.setLayout(new BorderLayout(0,0));
urlQueuePanel.add("North", urlQueueLabel);
urlQueuePanel.add("Center", urlQueueList);
urlThreadPanel.setLayout(new BorderLayout(0,0));
urlThreadPanel.add("North", urlThreadLabel);
urlThreadPanel.add("Center", urlThreadList);
docQueuePanel.setLayout(new BorderLayout(0,0));
docQueuePanel.add("North", docQueueLabel);
docQueuePanel.add("Center", docQueueList);
docThreadPanel.setLayout(new BorderLayout(0,0));
docThreadPanel.add("North", docThreadLabel);
docThreadPanel.add("Center", docThreadList);
setTitle("LARM - Fetcher");
// keyboard shortcuts for the file and edit menu items
newMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_N,false));
openMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_O,false));
saveMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_S,false));
saveAsMenuItem.setLabel("Speichern unter...");
cutMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_X,false));
copyMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_C,false));
pasteMenuItem.setShortcut(new MenuShortcut(java.awt.event.KeyEvent.VK_V,false));
//$$ mainMenuBar.move(0,312);
// listener objects; NOTE(review): their registration calls are not visible here
SymWindow aSymWindow = new SymWindow();
SymAction lSymAction = new SymAction();
// NOTE(review): the body of this constructor is not visible here
public FetcherFrame(String title)
* Shows or hides the component depending on the boolean flag b.
* @param b if true, show the component; otherwise, hide the component.
* @see java.awt.Component#isVisible
public void setVisible(boolean b)
// NOTE(review): no guarding condition is visible before this call
setLocation(50, 50);
// Stand-alone entry point: opens an empty fetcher window.
static public void main(String args[])
//Create a new instance of our application's frame, and make it visible.
(new FetcherFrame()).setVisible(true);
catch (Throwable t)
//Ensure the application exits with an error condition.
// Grows the frame by its insets and shifts the child components accordingly, once.
public void addNotify()
// Record the size of the window prior to calling parents addNotify.
Dimension d = getSize();
if (fComponentsAdjusted)
// Adjust components according to the insets
setSize(getInsets().left + getInsets().right + d.width, getInsets().top + getInsets().bottom + d.height);
Component components[] = getComponents();
for (int i = 0; i < components.length; i++)
// NOTE(review): the call that writes the translated point back is not visible here
Point p = components[i].getLocation();
p.translate(getInsets().left, getInsets().top);
fComponentsAdjusted = true;
// Used for addNotify check.
boolean fComponentsAdjusted = false;
// window components
java.awt.FileDialog openFileDialog1 = new java.awt.FileDialog(this);
java.awt.Panel mainPanelWithBorders = new java.awt.Panel();
java.awt.Panel northBorder = new java.awt.Panel();
java.awt.Panel southBorder = new java.awt.Panel();
java.awt.Panel westBorder = new java.awt.Panel();
java.awt.Panel eastBorder = new java.awt.Panel();
java.awt.Panel mainPanel = new java.awt.Panel();
java.awt.Panel upperPanel = new java.awt.Panel();
java.awt.Panel preferencesPanel = new java.awt.Panel();
java.awt.Label startURLlabel = new java.awt.Label();
java.awt.TextField startURL = new java.awt.TextField(30);
java.awt.Button startButton = new java.awt.Button();
java.awt.Label restrictToLabel = new java.awt.Label();
java.awt.TextField restrictTo = new java.awt.TextField();
java.awt.Panel logPanel = new java.awt.Panel();
java.awt.List logList = new java.awt.List(8);
java.awt.Panel lowerPanel = new java.awt.Panel();
java.awt.Panel urlQueuePanel = new java.awt.Panel();
java.awt.Label urlQueueLabel = new java.awt.Label();
java.awt.List urlQueueList = new java.awt.List(5);
java.awt.Panel urlThreadPanel = new java.awt.Panel();
java.awt.Label urlThreadLabel = new java.awt.Label();
java.awt.List urlThreadList = new java.awt.List(4);
java.awt.Panel docQueuePanel = new java.awt.Panel();
java.awt.Label docQueueLabel = new java.awt.Label();
java.awt.List docQueueList = new java.awt.List(4);
java.awt.Panel docThreadPanel = new java.awt.Panel();
java.awt.Label docThreadLabel = new java.awt.Label();
java.awt.List docThreadList = new java.awt.List(4);
// menu bar and menu items
java.awt.MenuBar mainMenuBar = new java.awt.MenuBar();
java.awt.Menu menu1 = new java.awt.Menu();
java.awt.MenuItem newMenuItem = new java.awt.MenuItem();
java.awt.MenuItem openMenuItem = new java.awt.MenuItem();
java.awt.MenuItem saveMenuItem = new java.awt.MenuItem();
java.awt.MenuItem saveAsMenuItem = new java.awt.MenuItem();
java.awt.MenuItem separatorMenuItem = new java.awt.MenuItem();
java.awt.MenuItem exitMenuItem = new java.awt.MenuItem();
java.awt.Menu menu2 = new java.awt.Menu();
java.awt.MenuItem cutMenuItem = new java.awt.MenuItem();
java.awt.MenuItem copyMenuItem = new java.awt.MenuItem();
java.awt.MenuItem pasteMenuItem = new java.awt.MenuItem();
java.awt.Menu menu3 = new java.awt.Menu();
java.awt.MenuItem aboutMenuItem = new java.awt.MenuItem();
// Routes window-closing events that originate from this frame.
class SymWindow extends java.awt.event.WindowAdapter
public void windowClosing(java.awt.event.WindowEvent event)
Object object = event.getSource();
if (object == FetcherFrame.this)
void FetcherFrame_WindowClosing(java.awt.event.WindowEvent event)
// to do: code goes here.
// Closing the window asks for confirmation through a modal QuitDialog.
void FetcherFrame_WindowClosing_Interaction1(java.awt.event.WindowEvent event)
try {
// QuitDialog Create and show as modal
(new QuitDialog(this, true)).setVisible(true);
} catch (Exception e) {
// Routes action events by source (open / about / exit menu items, start button).
// NOTE(review): the handler call of each branch is not visible here
class SymAction implements java.awt.event.ActionListener
public void actionPerformed(java.awt.event.ActionEvent event)
Object object = event.getSource();
if (object == openMenuItem)
else if (object == aboutMenuItem)
else if (object == exitMenuItem)
else if (object == startButton)
void openMenuItem_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
// Re-creates the file dialog with the mode and title of the previous one.
// NOTE(review): defDirectory and defFile are read but not used on any visible line
void openMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
// OpenFileDialog Create and show as modal
int defMode = openFileDialog1.getMode();
String defTitle = openFileDialog1.getTitle();
String defDirectory = openFileDialog1.getDirectory();
String defFile = openFileDialog1.getFile();
openFileDialog1 = new java.awt.FileDialog(this, defTitle, defMode);
} catch (Exception e) {
void aboutMenuItem_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
void aboutMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
// AboutDialog Create and show as modal
(new AboutDialog(this, true)).setVisible(true);
} catch (Exception e) {
void exitMenuItem_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
void exitMenuItem_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
// QuitDialog Create and show as modal
(new QuitDialog(this, true)).setVisible(true);
} catch (Exception e) {
public void startButton_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
// List and field accessors used by the crawler to update the window.
// NOTE(review): for the methods below without a visible statement, the body is not
// visible in this copy of the file.
public void addUrlQueueItem(String item)
public void removeUrlQueueItem(String item)
public void addDocQueueItem(String item)
public void removeDocQueueItem(String item)
// returns the item count of the URL thread list
public synchronized int addUrlThreadItem(String item)
return urlThreadList.getItemCount();
public synchronized int addUrlThreadItem(String item, int pos)
return urlThreadList.getItemCount();
public void replaceUrlThreadItem(String item, int index)
// returns the item count of the doc thread list
public synchronized int addDocThreadItem(String item)
return docThreadList.getItemCount();
public void replaceDocThreadItem(String item, int index)
public void addLogEntry(String entry)
public void clearLog()
public void addStartButtonListener(ActionListener a)
// current text of the restriction input field
public String getRestrictTo()
return restrictTo.getText();
public void setRestrictTo(String restrictTo)
// current text of the start URL input field
public String getStartURL()
return startURL.getText();
public void setStartURL(String startURL)
//public void setInfoText(String text)
// thi
@ -0,0 +1,332 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c) <p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.gui;
import javax.swing.*;
import java.awt.*;
import java.awt.event.*;
public class FetcherSummaryFrame extends JFrame
JPanel lowerPanel = new JPanel();
JPanel progressPanel = new JPanel();
JPanel middlePanel = new JPanel();
JPanel rightPanel = new JPanel();
BorderLayout borderLayout1 = new BorderLayout();
JPanel propertyPanel = new JPanel();
JLabel hostLabel = new JLabel();
JLabel urlRestrictionFrame = new JLabel();
JTextField startURL = new JTextField();
JTextField restrictTo = new JTextField();
JButton startButton = new JButton();
GridLayout gridLayout1 = new GridLayout();
JProgressBar urlQueuedProgress = new JProgressBar(0,100);
JLabel urlQueuedLabel = new JLabel();
JLabel scopeFilteredLabel = new JLabel();
JProgressBar scopeFilteredProgress = new JProgressBar(0,100);
JLabel visitedFilteredLabel = new JLabel();
JProgressBar visitedFilteredProgress = new JProgressBar(0,100);
JLabel workingThreadsLabel = new JLabel();
JProgressBar workingThreadsProgress = new JProgressBar(0,100);
JLabel idleThreadsLabel = new JLabel();
JProgressBar idleThreadsProgress = new JProgressBar(0,100);
JLabel busyThreadsLabel = new JLabel();
JProgressBar busyThreadsProgress = new JProgressBar(0,100);
JLabel requestQueueLabel = new JLabel();
JProgressBar requestQueueProgress = new JProgressBar();
JLabel stalledThreadsLabel = new JLabel();
JProgressBar stalledThreadsProgress = new JProgressBar();
JLabel dnsLabel = new JLabel();
JProgressBar dnsProgress = new JProgressBar(0,100);
JLabel freeMemLabel = new JLabel();
JLabel freeMemText = new JLabel();
JLabel totalMemLabel = new JLabel();
JLabel totalMemText = new JLabel();
JLabel bpsLabel = new JLabel();
JLabel bpsText = new JLabel();
JLabel docsLabel = new JLabel();
JLabel docsText = new JLabel();
JLabel docsReadLabel = new JLabel();
JLabel docsReadText = new JLabel();
JProgressBar urlsCaughtProgress = new JProgressBar(0,100);
JLabel urlsCaughtText = new JLabel();
JLabel robotsTxtsText = new JLabel();
JProgressBar robotsTxtsProgress = new JProgressBar(0,100);
// Creates the summary window with its title and an initial size of 640x350.
// NOTE(review): the try keyword, the statement(s) it guards (presumably the jbInit() call)
// and the catch handler body are not visible in this copy of the file.
public FetcherSummaryFrame()
this.setTitle("LARM - LANLab Retrieval Machine");
this.setSize(new Dimension(640,350));
catch(Exception e)
// Configures all labels, text fields and progress bars of the summary window and adds them
// to their panels: progressPanel, middlePanel and rightPanel inside lowerPanel (centre of the
// content pane), and propertyPanel at the top.
// NOTE(review): layout-manager calls and several setText() calls are not visible in this
// copy of the file.
private void jbInit() throws Exception
// --- property panel: start URL / restriction inputs and the start button ---
propertyPanel.setMinimumSize(new Dimension(10, 70));
propertyPanel.setPreferredSize(new Dimension(10, 80));
hostLabel.setBounds(new Rectangle(18, 15, 76, 17));
urlRestrictionFrame.setText("URL-Restriction (regul. Ausdruck)");
urlRestrictionFrame.setBounds(new Rectangle(18, 37, 208, 17));
startURL.setBounds(new Rectangle(224, 14, 281, 21));
restrictTo.setBounds(new Rectangle(224, 38, 281, 21));
startButton.setBounds(new Rectangle(528, 14, 79, 47));
// --- label texts ---
urlQueuedLabel.setText("URLs queued");
visitedFilteredLabel.setText("Visited gefiltert");
workingThreadsLabel.setText("Number of Working Threads");
idleThreadsLabel.setText("Idle Threads");
busyThreadsLabel.setText("Busy Threads");
requestQueueLabel.setText("requests queued");
stalledThreadsLabel.setText("stalled Threads");
// --- every progress bar gets the same preferred size ---
stalledThreadsProgress.setPreferredSize(new Dimension(190, 25));
requestQueueProgress.setPreferredSize(new Dimension(190, 25));
busyThreadsProgress.setPreferredSize(new Dimension(190, 25));
idleThreadsProgress.setPreferredSize(new Dimension(190, 25));
workingThreadsProgress.setPreferredSize(new Dimension(190, 25));
urlQueuedProgress.setPreferredSize(new Dimension(190, 25));
scopeFilteredProgress.setPreferredSize(new Dimension(190, 25));
visitedFilteredProgress.setPreferredSize(new Dimension(190, 25));
dnsLabel.setText("DNS Hosts cached");
dnsProgress.setPreferredSize(new Dimension(190, 25));
// --- textual statistics: a 60px-wide caption label next to a 120px-wide value label ---
freeMemLabel.setText("Free Mem");
freeMemLabel.setPreferredSize(new Dimension(60, 17));
freeMemText.setPreferredSize(new Dimension(120, 17));
freeMemText.setMinimumSize(new Dimension(100, 17));
totalMemLabel.setText("total Mem");
totalMemLabel.setPreferredSize(new Dimension(60, 17));
totalMemText.setPreferredSize(new Dimension(120, 17));
totalMemText.setMinimumSize(new Dimension(100, 17));
bpsLabel.setPreferredSize(new Dimension(60, 17));
bpsText.setMinimumSize(new Dimension(100, 17));
bpsText.setPreferredSize(new Dimension(120, 17));
docsLabel.setPreferredSize(new Dimension(60, 17));
docsText.setPreferredSize(new Dimension(120, 17));
docsText.setMinimumSize(new Dimension(100, 17));
docsReadLabel.setText("Docs read");
docsReadLabel.setPreferredSize(new Dimension(60, 17));
docsReadText.setPreferredSize(new Dimension(120, 17));
docsReadText.setMinimumSize(new Dimension(100, 17));
urlsCaughtProgress.setPreferredSize(new Dimension(190, 25));
urlsCaughtText.setText("URLs caught by Robots.txt");
robotsTxtsText.setText("Robots.txts found");
robotsTxtsProgress.setPreferredSize(new Dimension(190, 25));
// --- assemble: lowerPanel in the centre holds the three statistics panels ---
this.getContentPane().add(lowerPanel, BorderLayout.CENTER);
// URL / filter counters, each label followed by its bar
lowerPanel.add(progressPanel, null);
progressPanel.add(urlQueuedLabel, null);
progressPanel.add(urlQueuedProgress, null);
progressPanel.add(scopeFilteredLabel, null);
progressPanel.add(scopeFilteredProgress, null);
progressPanel.add(visitedFilteredLabel, null);
progressPanel.add(visitedFilteredProgress, null);
progressPanel.add(dnsLabel, null);
progressPanel.add(dnsProgress, null);
progressPanel.add(robotsTxtsText, null);
progressPanel.add(robotsTxtsProgress, null);
progressPanel.add(urlsCaughtText, null);
progressPanel.add(urlsCaughtProgress, null);
// thread and request counters
lowerPanel.add(middlePanel, null);
middlePanel.add(workingThreadsLabel, null);
middlePanel.add(workingThreadsProgress, null);
middlePanel.add(idleThreadsLabel, null);
middlePanel.add(idleThreadsProgress, null);
middlePanel.add(busyThreadsLabel, null);
middlePanel.add(busyThreadsProgress, null);
middlePanel.add(requestQueueLabel, null);
middlePanel.add(requestQueueProgress, null);
middlePanel.add(stalledThreadsLabel, null);
middlePanel.add(stalledThreadsProgress, null);
// textual statistics, each caption followed by its value label
lowerPanel.add(rightPanel, null);
rightPanel.add(docsLabel, null);
rightPanel.add(docsText, null);
rightPanel.add(docsReadLabel, null);
rightPanel.add(docsReadText, null);
rightPanel.add(bpsLabel, null);
rightPanel.add(bpsText, null);
rightPanel.add(totalMemLabel, null);
rightPanel.add(totalMemText, null);
rightPanel.add(freeMemLabel, null);
rightPanel.add(freeMemText, null);
// property panel goes to the top of the window
this.getContentPane().add(propertyPanel, BorderLayout.NORTH);
propertyPanel.add(urlRestrictionFrame, null);
propertyPanel.add(restrictTo, null);
propertyPanel.add(hostLabel, null);
propertyPanel.add(startButton, null);
propertyPanel.add(startURL, null);
// Shows an open-ended counter on a progress bar by rescaling the bar's maximum:
// the maximum is doubled when the value exceeds it, and halved when the value falls below
// half of it while the previous value was still at or above half. The value is also shown
// as the bar's string.
// NOTE(review): no p.setValue(...) call is visible in this copy of the file -- presumably
// it was on a line that got lost; confirm against the original.
// NOTE(review): the maximum is doubled only once per call, so a value above 2 * oldMax (or
// any value while the maximum is 0) still exceeds the new maximum -- confirm this is acceptable.
public void setCounterProgressBar(JProgressBar p, int value)
int oldMax = p.getMaximum();
int oldValue = p.getValue();
if(value > oldMax)
p.setMaximum(oldMax * 2);
// shrink only on the call where the value crosses below the half-way mark
else if (value < oldMax / 2 && oldValue >= oldMax / 2)
p.setMaximum(oldMax / 2);
p.setString("" + value);
public void setURLsQueued(int queued)
setCounterProgressBar(this.urlQueuedProgress, queued);
public void setScopeFiltered(int filtered)
setCounterProgressBar(this.scopeFilteredProgress, filtered);
public void setVisitedFiltered(int filtered)
setCounterProgressBar(this.visitedFilteredProgress, filtered);
public void setWorkingThreadsCount(int threads)
setCounterProgressBar(this.workingThreadsProgress, threads);
public void setIdleThreadsCount(int threads)
setCounterProgressBar(this.idleThreadsProgress, threads);
public void setBusyThreadsCount(int threads)
setCounterProgressBar(this.busyThreadsProgress, threads);
public void setRequestQueueCount(int requests)
setCounterProgressBar(this.requestQueueProgress, requests);
public void setDNSCount(int count)
setCounterProgressBar(this.dnsProgress, count);
public void setURLsCaughtCount(int count)
setCounterProgressBar(this.urlQueuedProgress, count);
public void addStartButtonListener(ActionListener a)
public String getRestrictTo()
return restrictTo.getText();
public void setRestrictTo(String restrictTo)
public String getStartURL()
return startURL.getText();
public void setStartURL(String startURL)
public void setStalledThreads(int stalled)
public void setBytesPerSecond(double bps)
bpsText.setText("" + bps);
public void setDocsPerSecond(double docs)
bpsText.setText("" + docs);
public void setFreeMem(long freeMem)
freeMemText.setText("" + freeMem);
public void setTotalMem(long totalMem)
totalMemText.setText("" + totalMem);
public void setRobotsTxtCount(int robotsTxtCount)
setCounterProgressBar(robotsTxtsProgress, robotsTxtCount);
public void setDocsRead(int docs)
bpsText.setText("" + docs);
@ -0,0 +1,184 @@
package de.lanlab.larm.gui;
A basic extension of the java.awt.Dialog class
import java.awt.*;
import java.awt.event.*;
// Modal "quit?" confirmation dialog with a yes and a no button. Choosing yes hides and
// disposes the invoking frame, disposes the dialog and exits the JVM.
// The class follows the Visual Cafe code-generator layout.
// NOTE(review): many single-statement lines (braces, add()/setBounds() calls, listener
// registration, handler bodies) are not visible in this copy of the file.
public class QuitDialog extends Dialog
// Creates the dialog for the given parent frame and sets up buttons, label and title.
public QuitDialog(Frame parent, boolean modal)
super(parent, modal);
//Keep a local reference to the invoking frame
frame = parent;
// This code is automatically generated by Visual Cafe when you add
// components to the visual environment. It instantiates and initializes
// the components. To modify the code, only use code syntax that matches
// what Visual Cafe can generate, or Visual Cafe may be unable to back
// parse your Java file into its visual environment.
yesButton.setLabel(" Ja ");
yesButton.setFont(new Font("Dialog", Font.BOLD, 12));
noButton.setLabel(" Nein ");
noButton.setFont(new Font("Dialog", Font.BOLD, 12));
label1.setText("Möchten Sie LARM beenden?");
setTitle("LARM - Beenden");
// listener objects; NOTE(review): their registration calls are not visible here
SymWindow aSymWindow = new SymWindow();
SymAction lSymAction = new SymAction();
// Grows the dialog by its insets and shifts the child components accordingly, once.
public void addNotify()
// Record the size of the window prior to calling parents addNotify.
Dimension d = getSize();
if (fComponentsAdjusted)
// Adjust components according to the insets
setSize(getInsets().left + getInsets().right + d.width, getInsets().top + getInsets().bottom + d.height);
Component components[] = getComponents();
for (int i = 0; i < components.length; i++)
// NOTE(review): the call that writes the translated point back is not visible here
Point p = components[i].getLocation();
p.translate(getInsets().left, getInsets().top);
fComponentsAdjusted = true;
// Convenience constructor; NOTE(review): no visible line uses `title` -- confirm
public QuitDialog(Frame parent, String title, boolean modal)
this(parent, modal);
* Shows or hides the component depending on the boolean flag b.
* @param b if true, show the component; otherwise, hide the component.
* @see java.awt.Component#isVisible
public void setVisible(boolean b)
// centres the dialog over the bounds of its parent
// NOTE(review): no guarding `if (b)` is visible before these lines
Rectangle bounds = getParent().getBounds();
Rectangle abounds = getBounds();
setLocation(bounds.x + (bounds.width - abounds.width)/ 2,
bounds.y + (bounds.height - abounds.height)/2);
// Used for addNotify check.
boolean fComponentsAdjusted = false;
// Invoking frame
Frame frame = null;
// dialog controls
java.awt.Button yesButton = new java.awt.Button();
java.awt.Button noButton = new java.awt.Button();
java.awt.Label label1 = new java.awt.Label();
// Routes action events by source (yes / no button).
// NOTE(review): the handler call of each branch is not visible here
class SymAction implements java.awt.event.ActionListener
public void actionPerformed(java.awt.event.ActionEvent event)
Object object = event.getSource();
if (object == yesButton)
else if (object == noButton)
void yesButton_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
// "Yes": tear down the invoking frame and this dialog, then terminate the JVM.
void yesButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
frame.setVisible(false); // Hide the invoking frame
frame.dispose(); // Free system resources
this.dispose(); // Free system resources
System.exit(0); // close the application
} catch (Exception e) {
void noButton_ActionPerformed(java.awt.event.ActionEvent event)
// to do: code goes here.
// NOTE(review): the statement inside this try block is not visible here
void noButton_ActionPerformed_Interaction1(java.awt.event.ActionEvent event)
try {
} catch (Exception e) {
// Routes window-closing events that originate from this dialog.
class SymWindow extends java.awt.event.WindowAdapter
public void windowClosing(java.awt.event.WindowEvent event)
Object object = event.getSource();
if (object == QuitDialog.this)
void QuitDialog_WindowClosing(java.awt.event.WindowEvent event)
// to do: code goes here.
// NOTE(review): the statement inside this try block is not visible here
void QuitDialog_WindowClosing_Interaction1(java.awt.event.WindowEvent event)
try {
} catch (Exception e) {
@ -0,0 +1,136 @@
package de.lanlab.larm.net;
// whatever package you want
import sun.net.www.http.HttpClient;
import sun.net.www.MessageHeader;
import sun.net.ProgressEntry;
import java.net.*;
import java.io.*;
* Description of the Class
*@author cmarschn
*@created 2. Mai 2001
public class HttpClientTimeout extends HttpClient {
private int timeout = -1;
* Constructor for the HttpClientTimeout object
*@param url Description of Parameter
*@param proxy Description of Parameter
*@param proxyPort Description of Parameter
*@exception IOException Description of Exception
// Creates a client for the given URL through the given proxy; all three arguments are
// passed unchanged to the superclass constructor.
public HttpClientTimeout(URL url, String proxy, int proxyPort) throws IOException {
super(url, proxy, proxyPort);
* Constructor for the HttpClientTimeout object
*@param url Description of Parameter
*@exception IOException Description of Exception
// Creates a client for the given URL, passing null / -1 as proxy host and port to the
// superclass constructor.
public HttpClientTimeout(URL url) throws IOException {
super(url, null, -1);
* Sets the Timeout attribute of the HttpClientTimeout object
*@param i The new Timeout value
*@exception SocketException Description of Exception
// Resets the stored timeout field to -1 (the value parseHTTP treats as "no timeout stored").
// NOTE(review): the parameter `i` is not used on any visible line; the declared
// SocketException suggests a socket call (presumably serverSocket.setSoTimeout(i)) on a
// line that is not visible in this copy of the file -- confirm against the original.
public void setTimeout(int i) throws SocketException {
this.timeout = -1;
* Gets the Socket attribute of the HttpClientTimeout object
*@return The Socket value
// Exposes the inherited serverSocket field to callers.
public Socket getSocket() {
return serverSocket;
* Description of the Method
*@param header Description of Parameter
*@param entry Description of Parameter
*@return Description of the Returned Value
*@exception java.io.IOException Description of Exception
// Delegates response parsing to the superclass. When a timeout other than -1 is stored,
// a socket-timeout step runs first; a SocketException from that step is reported to the
// caller as an IOException.
// NOTE(review): the statement inside the try block is not visible in this copy of the file.
public boolean parseHTTP(MessageHeader header, ProgressEntry entry) throws java.io.IOException {
if (this.timeout != -1) {
try {
catch (SocketException e) {
// NOTE(review): the original SocketException is not attached as the cause
throw new java.io.IOException("unable to set socket timeout!");
return super.parseHTTP(header, entry);
* Description of the Method
*@exception IOException Description of Exception
public void close() throws IOException {
* public void SetTimeout(int i) throws SocketException {
* serverSocket.setSoTimeout(i);
* }
* This class has no public constructor for HTTP. This method is used to
* get an HttpClient to the specified URL. If there's currently an
* active HttpClient to that server/port, you'll get that one.
* No longer synchronized -- it slows things down too much;
* synchronize at a higher level.
* Gets the New attribute of the HttpClientTimeout class
*@param url Description of Parameter
*@return The New value
*@exception IOException Description of Exception
// Returns a client for the URL: a cached one from `kac` when the lookup succeeds (its url
// field is then re-pointed to the requested URL), otherwise a newly constructed one.
// NOTE(review): the cast assumes every entry stored in `kac` is an HttpClientTimeout --
// confirm, since `kac` is declared outside this file.
public static HttpClientTimeout getNew(URL url) throws IOException {
* see if one's already around
HttpClientTimeout ret = (HttpClientTimeout) kac.get(url);
if (ret == null) {
ret = new HttpClientTimeout(url);
// CTOR called openServer()
else {
ret.url = url;
// don't know if we're keeping alive until we parse the headers
// for now, keepingAlive is false
return ret;
@ -0,0 +1,50 @@
package de.lanlab.larm.net;
import java.net.*;
* Description of the Class
*@author cmarschn
*@created 2. Mai 2001
public class HttpTimeoutFactory implements URLStreamHandlerFactory {
int fiTimeoutVal;
* Constructor for the HttpTimeoutFactory object
*@param iT Description of Parameter
public HttpTimeoutFactory(int iT) {
fiTimeoutVal = iT;
* Description of the Method
*@param str Description of Parameter
*@return Description of the Returned Value
public URLStreamHandler createURLStreamHandler(String str) {
return new HttpTimeoutHandler(fiTimeoutVal);
static HttpTimeoutFactory instance = null;
* gets an instance. only the first call will create it. In subsequent calls the iT
* parameter doesn't have a meaning.
public static HttpTimeoutFactory getInstance(int iT)
if(instance == null)
instance = new HttpTimeoutFactory(iT);
return instance;
@ -0,0 +1,80 @@
package de.lanlab.larm.net;
import java.net.*;
import java.io.IOException;
* Description of the Class
*@author cmarschn
*@created 2. Mai 2001
// HTTP protocol handler that creates HttpURLConnectionTimeout connections carrying a fixed
// timeout value, and remembers the most recently opened one.
// NOTE(review): braces and some statement lines are not visible in this copy of the file.
public class HttpTimeoutHandler extends sun.net.www.protocol.http.Handler {
// timeout value handed to every connection opened through this handler
int timeoutVal;
// connection returned by the latest openConnection() call (null before the first call)
HttpURLConnectionTimeout fHUCT;
* Constructor for the HttpTimeoutHandler object
*@param iT Description of Parameter
public HttpTimeoutHandler(int iT) {
timeoutVal = iT;
* Gets the Socket attribute of the HttpTimeoutHandler object
*@return The Socket value
// NOTE(review): dereferences fHUCT, which is only assigned in openConnection()
public Socket getSocket() {
return fHUCT.getSocket();
* Description of the Method
*@exception Exception Description of Exception
// NOTE(review): the body of close() is not visible here
public void close() throws Exception {
* Description of the Method
*@param u Description of Parameter
*@return Description of the Returned Value
*@exception IOException Description of Exception
// creates the connection and also stores it in fHUCT; the field is overwritten on each call
// NOTE(review): one handler shared between threads would mix up fHUCT -- confirm usage
protected java.net.URLConnection openConnection(URL u) throws IOException {
return fHUCT = new HttpURLConnectionTimeout(u, this, timeoutVal);
* Gets the Proxy attribute of the HttpTimeoutHandler object
*@return The Proxy value
String getProxy() {
return proxy;
// breaking encapsulation
* Gets the ProxyPort attribute of the HttpTimeoutHandler object
*@return The ProxyPort value
int getProxyPort() {
return proxyPort;
// breaking encapsulation
@ -0,0 +1,226 @@
package de.lanlab.larm.net;
import java.net.*;
import java.io.*;
import sun.net.www.http.HttpClient;
* Description of the Class
*@author cmarschn
*@created 2. Mai 2001
public class HttpURLConnectionTimeout extends sun.net.www.protocol.http.HttpURLConnection {
int fiTimeoutVal;
HttpTimeoutHandler fHandler;
HttpClientTimeout fClient;
* Constructor for the HttpURLConnectionTimeout object
*@param u Description of Parameter
*@param handler Description of Parameter
*@param iTimeout Description of Parameter
*@exception IOException Description of Exception
// Creates a connection for URL u and remembers the creating handler and the timeout value
// for later use in connect().
public HttpURLConnectionTimeout(URL u, HttpTimeoutHandler handler, int iTimeout) throws IOException {
super(u, handler);
fHandler = handler;
fiTimeoutVal = iTimeout;
* Constructor for the HttpURLConnectionTimeout object
*@param u Description of Parameter
*@param host Description of Parameter
*@param port Description of Parameter
*@exception IOException Description of Exception
// Creates a connection through the superclass (URL, host, port) constructor.
// NOTE(review): fHandler and fiTimeoutVal are not assigned on any visible line of this
// constructor, while connect() reads both -- confirm how instances built this way are used.
public HttpURLConnectionTimeout(URL u, String host, int port) throws IOException {
super(u, host, port);
* Description of the Method
*@exception IOException Description of Exception
// Opens the underlying HTTP client. For "http" URLs the client comes from
// HttpClientTimeout.getNew(url), is remembered in fClient and is given the configured
// timeout; otherwise a new HttpClientTimeout is built with the handler's proxy settings.
// Afterwards the client's output stream is stored in ps and the connection is marked connected.
// NOTE(review): braces and the statement after `if (connected) {` (presumably an early
// return) are not visible in this copy of the file.
public void connect() throws IOException {
if (connected) {
try {
if ("http".equals(url.getProtocol())
* && !failedOnce <- PRIVATE
) {
// for safety's sake, as reported by KLGroup
synchronized (url) {
http = HttpClientTimeout.getNew(url);
fClient = (HttpClientTimeout) http;
((HttpClientTimeout) http).setTimeout(fiTimeoutVal);
else {
// make sure to construct new connection if first
// attempt failed
// NOTE(review): fClient is not assigned on this path and no setTimeout call is visible
http = new HttpClientTimeout(url, fHandler.getProxy(), fHandler.getProxyPort());
ps = (PrintStream) http.getOutputStream();
// the exception is rethrown unchanged
catch (IOException e) {
throw e;
// this was missing from the original version
connected = true;
* Create a new HttpClient object, bypassing the cache of HTTP client
* objects/connections.
*@param url the URL being accessed
*@return The NewClient value
*@exception IOException Description of Exception
protected HttpClient getNewClient(URL url)
throws IOException {
HttpClientTimeout client = new HttpClientTimeout(url, (String) null, -1);
try {
catch (Exception e) {
System.out.println("Unable to set timeout value");
return (HttpClient) client;
* Gets the Socket attribute of the HttpURLConnectionTimeout object
*@return The Socket value
Socket getSocket() {
return fClient.getSocket();
* Description of the Method
*@exception Exception Description of Exception
void close() throws Exception {
* opens a stream allowing redirects only to the same host.
*@param c Description of Parameter
*@return Description of the Returned Value
*@exception IOException Description of Exception
public static InputStream openConnectionCheckRedirects(URLConnection c)
throws IOException {
boolean redir;
int redirects = 0;
InputStream in = null;
do {
if (c instanceof HttpURLConnectionTimeout) {
((HttpURLConnectionTimeout) c).setInstanceFollowRedirects(false);
// We want to open the input stream before
// getting headers, because getHeaderField()
// et al swallow IOExceptions.
in = c.getInputStream();
redir = false;
if (c instanceof HttpURLConnectionTimeout) {
HttpURLConnectionTimeout http = (HttpURLConnectionTimeout) c;
int stat = http.getResponseCode();
if (stat >= 300 && stat <= 305 &&
stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
URL base = http.getURL();
String loc = http.getHeaderField("Location");
URL target = null;
if (loc != null) {
target = new URL(base, loc);
if (target == null
|| !base.getProtocol().equals(target.getProtocol())
|| base.getPort() != target.getPort()
|| !HostsEquals(base, target)
|| redirects >= 5) {
throw new SecurityException("illegal URL redirect");
redir = true;
c = target.openConnection();
} while (redir);
return in;
// Same as java.net.URL.hostsEqual
* Description of the Method
*@param u1 Description of Parameter
*@param u2 Description of Parameter
*@return Description of the Returned Value
static boolean HostsEquals(URL u1, URL u2) {
final String h1 = u1.getHost();
final String h2 = u2.getHost();
if (h1 == null) {
return h2 == null;
else if (h2 == null) {
return false;
else if (h1.equalsIgnoreCase(h2)) {
return true;
// Have to resolve addresses before comparing, otherwise
// names like tachyon and tachyon.eng would compare different
final boolean result[] = {false};
new java.security.PrivilegedAction() {
* Main processing method for the HttpURLConnectionTimeout object
*@return Description of the Returned Value
public Object run() {
try {
InetAddress a1 = InetAddress.getByName(h1);
InetAddress a2 = InetAddress.getByName(h2);
result[0] = a1.equals(a2);
catch (UnknownHostException e) {
catch (SecurityException e) {
return null;
return result[0];
@ -0,0 +1,17 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.parser;
/**
 * Callback interface of the parser package: implementors receive the link, base and
 * title values that are reported to them.
 */
public interface LinkHandler
{
    /**
     * Reports a link.
     *
     * @param value   the link value as reported by the caller
     * @param isFrame flag supplied by the caller; presumably true when the link came from
     *                a frame element (TODO confirm against the parser)
     */
    public void handleLink(String value, boolean isFrame);

    /**
     * Reports a base value.
     *
     * @param value the base value as reported by the caller
     */
    public void handleBase(String value);

    /**
     * Reports a title.
     *
     * @param value the title as reported by the caller
     */
    public void handleTitle(String value);
}
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,37 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.storage;
import de.lanlab.larm.util.*;
* This interface stores documents provided by a fetcher task
* @author Clemens Marschner
public interface DocumentStorage
* called once when the storage is supposed to be initialized
public void open();
* called to store a web document
* @param doc the document
public void store(WebDocument doc);
@ -0,0 +1,165 @@
package de.lanlab.larm.storage;
import de.lanlab.larm.util.WebDocument;
import de.lanlab.larm.util.SimpleLogger;
import java.io.*;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @created 11. Januar 2002
* @version 1.0
* this class saves the documents into page files of 50 MB and keeps a record of all
* the positions into a Logger. the log file contains URL, page file number, and
* index within the page file.
// DocumentStorage that writes document bytes into numbered page files
// (filePrefix + "_" + n + ".pfl") and builds a per-document info string containing the
// page file number and the byte offset.
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
public class LogStorage implements DocumentStorage
// logger used for error messages; replaceable through setLogger()
SimpleLogger log;
File pageFile;
// stream of the page file opened last by openPageFile()
FileOutputStream out;
// number of the current page file; pre-incremented in openPageFile()
int pageFileCount;
String filePrefix;
// running byte offset inside the current page file; reset to 0 in openPageFile()
int offset;
// set to true once a page file was opened successfully, false after an IOException there
boolean isValid = false;
* Description of the Field
// threshold checked in getOutputStream() (offset > MAXLENGTH)
public final static int MAXLENGTH = 50000000;
boolean logContents = false;
// name of the current page file, also used in error messages
String fileName;
* Constructor for the LogStorage object
* @param log the logger where index information is saved to
* @param logContents whether all docs are to be stored in page files or not
* @param filePrefix the file name where the page file number is appended
public LogStorage(SimpleLogger log, boolean logContents, String filePrefix)
this.log = log;
pageFileCount = 0;
this.filePrefix = filePrefix;
this.logContents = logContents;
// NOTE(review): the body of this condition is not visible; presumably the first page file
// is opened here.
if (logContents)
* Description of the Method
// intentionally empty
public void open() { }
* Description of the Method
// Starts the next page file: bumps the counter, derives the file name, resets the offset
// and opens the output stream. An IOException is logged and marks the storage invalid.
public void openPageFile()
int id = ++pageFileCount;
fileName = filePrefix + "_" + id + ".pfl";
this.offset = 0;
out = new FileOutputStream(fileName);
isValid = true;
catch (IOException io)
log.logThreadSafe("**ERROR: IOException while opening pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
isValid = false;
* Gets the outputStream attribute of the LogStorage object
* @return The outputStream value
// Returns the current stream. When the offset has passed MAXLENGTH some rollover work is
// done first. NOTE(review): those statements are not visible; the handler below shows that
// closing the page file is part of it.
public OutputStream getOutputStream()
if (offset > MAXLENGTH)
catch (IOException io)
log.logThreadSafe("**ERROR: IOException while closing pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
return out;
* Description of the Method
* @param bytes Description of the Parameter
* @return Description of the Return Value
// Returns the offset the bytes start at (the offset before it is advanced by
// bytes.length), or -1 after an IOException, which is logged.
// NOTE(review): the actual write call is not visible in this listing.
public synchronized int writeToPageFile(byte[] bytes)
OutputStream out = getOutputStream();
int oldOffset = this.offset;
this.offset += bytes.length;
return oldOffset;
catch (IOException io)
log.logThreadSafe("**ERROR: IOException while writing " + bytes.length + " bytes to pageFile " + fileName + ": " + io.getClass().getName() + "; " + io.getMessage());
return -1;
* Sets the logger attribute of the LogStorage object
* @param log The new logger value
public void setLogger(SimpleLogger log)
this.log = log;
* stores the document if storing is enabled
* @param doc Description of the Parameter
// Builds the info line for the document; when content logging is on, the storage is valid
// and the document has bytes, the bytes go to the page file and the page file number and
// offset are appended (tab separated).
// NOTE(review): what happens to docInfo afterwards is not visible in this listing.
public void store(WebDocument doc)
String docInfo = doc.getInfo();
if (logContents && isValid && doc.getDocumentBytes() != null)
// local variable, shadows the field of the same name; -1 signals a failed write
int offset = writeToPageFile(doc.getDocumentBytes());
// pageFileCount is read here without holding the lock that writeToPageFile() uses
docInfo = docInfo + "\t" + pageFileCount + "\t" + offset;
@ -0,0 +1,26 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.storage;
import de.lanlab.larm.util.*;
* doesn't do a lot
public class NullStorage implements DocumentStorage
public NullStorage()
public void open() {}
public void store(WebDocument doc) {}
@ -0,0 +1,176 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.storage;
import java.sql.*;
import de.lanlab.larm.util.*;
import java.util.*;
* saves the document into an sql table. At this time only in MS SQL (and probably Sybase)
* a table "Document" with the columns DO_URL(varchar), DO_MimeType(varchar) and
* DO_Data2(BLOB) is created after start<br>
* notes: experimental; slow
// DocumentStorage that inserts documents into a "Document" table through JDBC, using a
// fixed number of connections and prepared statements created up front.
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
public class SQLServerStorage implements DocumentStorage
// pools; how elements move between the free and busy vectors is not visible here
private Vector freeCons;
private Vector busyCons;
private Vector freeStatements;
private Vector busyStatements;
// not used by the visible statements; store() declares a local with the same name
private PreparedStatement addDoc;
// Creates the four pool vectors and, once per requested connection, opens a connection
// and prepares the INSERT statement. Failures are only printed, not rethrown.
// NOTE(review): no use of the driver parameter is visible in this listing.
public SQLServerStorage(String driver, String connectionString, String account, String password, int nrConnections)
freeCons = new Vector(nrConnections);
busyCons = new Vector(nrConnections);
freeStatements = new Vector(nrConnections);
busyStatements = new Vector(nrConnections);
Connection sqlConn;
PreparedStatement statement;
for(int i=0; i<nrConnections; i++)
sqlConn = DriverManager.getConnection(connectionString, account, password);
statement = sqlConn.prepareStatement("INSERT INTO Document (DO_URL, DO_MimeType, DO_Data2) VALUES (?,?,?)");
catch(SQLException e)
System.out.println(/*"Task " + taskNr + ": */ "SQLException: " + e.getMessage());
System.err.println(" SQLState: " + e.getSQLState());
System.err.println(" VendorError: " + e.getErrorCode());
catch(Exception e)
System.out.println("SQLServerStorage: " + e.getClass().getName() + ": " + e.getMessage());
// Hands out the first free connection.
// NOTE(review): Vector.firstElement() throws NoSuchElementException on an empty vector, so
// the null check below does not cover an exhausted pool - confirm intended behaviour.
public Connection getConnection()
Connection actual = (Connection)freeCons.firstElement();
if(actual == null)
return null;
return actual;
// NOTE(review): no statements of this method are visible in this listing.
public void releaseConnection(Connection con)
// Hands out the first free prepared statement; the same firstElement() caveat as in
// getConnection() applies.
public PreparedStatement getStatement()
PreparedStatement actual = (PreparedStatement)freeStatements.firstElement();
if(actual == null)
return null;
return actual;
// NOTE(review): no statements of this method are visible in this listing.
public void releaseStatement(PreparedStatement statement)
// Drops the Document table if it exists and creates it again. SQL errors are only printed.
public void open()
Connection conn = null;
conn = getConnection();
Statement delDoc = conn.createStatement();
// delete the existing data by recreating the table (this is faster)
delDoc.executeUpdate("if exists (select * from sysobjects where id = object_id(N'[dbo].[Document]') and OBJECTPROPERTY(id, N'IsUserTable') = 1)drop table [dbo].[Document]");
delDoc.executeUpdate("CREATE TABLE [dbo].[Document] ([DO_ID] [int] IDENTITY (1, 1) NOT NULL , [DA_CrawlPass] [int] NULL , [DO_URL] [varchar] (255) NULL , [DO_ContentType] [varchar] (50) NULL , [DO_Data] [text] NULL , [DO_Hashcode] [int] NULL , [DO_ContentLength] [int] NULL , [DO_ContentEncoding] [varchar] (20) NULL , [DO_Data2] [image] NULL, [DO_MimeType] [varchar] (255) NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]"); // delete
catch(SQLException e)
System.out.println(/*"Task " + taskNr + ": */"SQLException: " + e.getMessage());
System.err.println(" SQLState: " + e.getSQLState());
System.err.println(" VendorError: " + e.getErrorCode());
// NOTE(review): the body of this check is not visible; presumably the connection is released.
if(conn != null)
// Binds URL, MIME type and document bytes to parameters 1-3 of a pooled INSERT statement.
// SQL errors are only printed.
// NOTE(review): the call that executes the statement is not visible in this listing.
public void store(WebDocument document)
PreparedStatement addDoc = null;
addDoc = getStatement();
addDoc.setString(1, document.getURLString());
addDoc.setString(2, document.getMimeType());
addDoc.setBytes(3, document.getDocumentBytes());
catch(SQLException e)
System.out.println(/* "Task " + taskNr + ": */ "SQLException: " + e.getMessage());
System.err.println(" SQLState: " + e.getSQLState());
System.err.println(" VendorError: " + e.getErrorCode());
// NOTE(review): the body of this check is not visible; presumably the statement is released.
if(addDoc != null)
@ -0,0 +1,9 @@
package de.lanlab.larm.threads;
public interface InterruptableTask
public void run(ServerThread thread);
public void interrupt();
public String getInfo();
@ -0,0 +1,173 @@
package de.lanlab.larm.threads;
import java.util.Vector;
import java.util.Iterator;
import java.io.*;
import java.util.*;
import de.lanlab.larm.util.*;
* This thread class acts like a server. It's running idle within
* a thread pool until "runTask" is called. The given task will then
* be executed asynchronously
// Thread that holds at most one InterruptableTask at a time and reports to registered
// listeners when the task is done. Each instance owns two loggers named after its number.
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
public class ServerThread extends Thread
* the task that is to be executed. null in idle-mode
protected InterruptableTask task = null;
// true from runTask() until taskReady()
private boolean busy = false;
// listeners iterated in taskReady()
private ArrayList listeners = new ArrayList();
// set by interrupt(); no reader is visible in this listing
private boolean isInterrupted = false;
private int threadNumber;
SimpleLogger log;
SimpleLogger errorLog;
public ServerThread(int threadNumber, String name, ThreadGroup threadGroup)
super(threadGroup, name);
// NOTE(review): no statements of this constructor are visible in this listing.
public ServerThread(int threadNumber, String name)
// Stores the thread number and creates the loggers "thread<n>" and "thread<n>_errors".
// NOTE(review): logDir is created as a File object only; no use of it is visible here.
void init(int threadNumber)
this.threadNumber = threadNumber;
File logDir = new File("logs");
log = new SimpleLogger("thread" + threadNumber);
errorLog = new SimpleLogger("thread" + threadNumber + "_errors");
* constructor
* @param threadNumber assigns an arbitrary number to this thread
* used by ServerThreadFactory
// NOTE(review): no statements of this constructor are visible in this listing.
public ServerThread(int threadNumber)
* the run method runs asynchronously. It waits until runTask() is
* called
// Loops while no task is set; an InterruptedException is reported on stdout and in the log.
// NOTE(review): the waiting call and the task invocation are not visible in this listing.
public void run()
while(task == null)
catch(InterruptedException e)
System.out.println("ServerThread " + threadNumber + " interrupted");
log.log("** Thread Interrupted **");
* this is the main method that will invoke a task to run.
// Marks the thread busy and stores the task.
// NOTE(review): how the waiting run() loop is woken up is not visible in this listing.
public synchronized void runTask(InterruptableTask t)
busy = true;
task = t;
* it should be possible to interrupt a task with this function.
* therefore, the task has to check its interrupted()-state
// NOTE(review): the body of this check is not visible; presumably it calls task.interrupt().
public void interruptTask()
if(task != null)
* the server thread can either be in idle or busy mode
public boolean isBusy()
return busy;
// NOTE(review): no statements of the two listener methods are visible in this listing.
public void addTaskReadyListener(TaskReadyListener l)
public void removeTaskReadyListener(TaskReadyListener l)
// Overrides Thread.interrupt() and sets the private flag.
// NOTE(review): no call to super.interrupt() is visible in this listing - confirm.
public void interrupt()
isInterrupted = true;
public int getThreadNumber()
return this.threadNumber;
// returns null while the thread is idle
public InterruptableTask getTask()
return task;
* this method will be called when the task ends. It notifies all
* of its observers about its changed state
// The task reference is cleared and busy reset BEFORE the listeners are iterated, so a
// listener that calls getTask() at that point receives null.
protected void taskReady()
task = null;
busy = false;
Iterator Ie = listeners.iterator();
public SimpleLogger getLog()
return log;
public SimpleLogger getErrorLog()
return errorLog;
@ -0,0 +1,80 @@
package de.lanlab.larm.threads;
import de.lanlab.larm.util.Queue;
import java.util.Collection;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.util.LinkedList;
import java.util.Iterator;
public class TaskQueue implements Queue
LinkedList queue = new LinkedList();
public TaskQueue()
public void insertMultiple(Collection c)
throw new UnsupportedOperationException();
* push a task to the start of the queue
* @param i the task
public void insert(Object i)
* get the last element out of the queue
* The element will be removed from the queue
* @return the task
public Object remove()
return queue.isEmpty() ? null : (InterruptableTask)queue.removeLast();
public Iterator iterator()
return queue.iterator();
public void clear()
public boolean isEmpty()
return queue.isEmpty();
public int size()
return queue.size();
@ -0,0 +1,9 @@
package de.lanlab.larm.threads;
import de.lanlab.larm.util.Observer;
public interface TaskReadyListener extends Observer
public void taskReady(ServerThread s);
@ -0,0 +1,20 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.threads;
public class ThreadFactory
// static int count = 0;
public ServerThread createServerThread(int count)
return new ServerThread(count);
@ -0,0 +1,380 @@
package de.lanlab.larm.threads;
//import java.util.Vector;
import java.util.*;
* if you have many tasks to accomplish, you can do this with one of the
* following strategies:
* <uL>
* <li> do it one after another (single threaded). this may often be
* inefficient because most programs often wait for external resources
* <li> assign a new thread for each task (thread on demand). This will clog
* up the system if many tasks have to be accomplished synchronously
* <li> hold a number of tasks, and queue the requests if there are more
* tasks than threads (ThreadPool).
* </ul>
* This thread pool is based on an article in Java-Magazin 06/2000.
* synchronizations were removed unless necessary
// Pool of ServerThreads: tasks go to an idle thread if one exists, otherwise into a
// TaskQueue; registered ThreadPoolObservers are told about every state change.
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
public class ThreadPool implements ThreadingStrategy, TaskReadyListener {
private int maxThreads = MAX_THREADS;
* references to all threads are stored here
// keyed by Integer thread number
private HashMap allThreads = new HashMap();
* this vector takes all idle threads
// also used as the monitor object in taskReady() and waitForFinish()
private Vector idleThreads = new Vector();
* this vector takes all threads that are in operation (busy)
private Vector busyThreads = new Vector();
* if there are no idleThreads, tasks will go here
private TaskQueue queue = new TaskQueue();
* thread pool observers will be notified of status changes
private Vector threadPoolObservers = new Vector();
// set by stop(); checked first in taskReady()
private boolean isStopped = false;
* default maximum number of threads, if not given by the user
public final static int MAX_THREADS = 5;
* thread was created
public final static String THREAD_CREATE = "T_CREATE";
* thread was created
// sent by doTask()/taskReady() right before THREAD_RUNNING when a task is handed to a thread
public final static String THREAD_START = "T_START";
* thread is running
public final static String THREAD_RUNNING = "T_RUNNING";
* thread was stopped
public final static String THREAD_STOP = "T_STOP";
* thread was destroyed
public final static String THREAD_END = "T_END";
* thread is idle
public final static String THREAD_IDLE = "T_IDLE";
* a task was added to the queue, because all threads were busy
public final static String THREADQUEUE_ADD = "TQ_ADD";
* a task was removed from the queue, because a thread had finished and was
* ready
public final static String THREADQUEUE_REMOVE = "TQ_REMOVE";
* this factory will create the tasks
// used in createThread() to create the ServerThreads
ThreadFactory factory;
* this constructor will create the pool with MAX_THREADS threads and the
* default factory
public ThreadPool() {
this(MAX_THREADS, new ThreadFactory());
* this constructor will create the pool with the default Factory
*@param max the maximum number of threads
public ThreadPool(int max) {
this(max, new ThreadFactory());
* constructor
*@param max maximum number of threads
*@param factory the thread factory with which the threads will be created
// only stores the arguments; no thread is created until init() is called
public ThreadPool(int max, ThreadFactory factory) {
maxThreads = max;
this.factory = factory;
* this init method will create the tasks. It must be called by hand
// NOTE(review): the loop body is not visible; presumably createThread(i) for each index.
public void init() {
for (int i = 0; i < maxThreads; i++) {
* Description of the Method
*@param i Description of the Parameter
// Creates thread number i through the factory, registers it in allThreads and announces
// THREAD_CREATE followed by THREAD_IDLE.
// NOTE(review): adding to idleThreads, listener registration and start are not visible here.
public void createThread(int i) {
ServerThread s = factory.createServerThread(i);
allThreads.put(new Integer(i), s);
sendMessage(i, THREAD_CREATE, "");
sendMessage(i, THREAD_IDLE, "");
// FIXME: buggy with respect to synchronization
* Description of the Method
*@param i Description of the Parameter
// Announces THREAD_STOP for thread i, removes it from allThreads and drops the local reference.
// NOTE(review): stopping the old thread and creating its replacement are not visible here.
public void restartThread(int i) {
sendMessage(i, THREAD_STOP, "");
ServerThread t = (ServerThread) allThreads.get(new Integer(i));
allThreads.remove(new Integer(i));
// deprecated, I know, but the only way to overcome SUN's bugs
t = null;
* Description of the Method
*@param t Description of the Parameter
*@param key Description of the Parameter
// Gives the task to the first idle thread (announcing THREAD_START and THREAD_RUNNING) or,
// if none is idle, announces THREADQUEUE_ADD with thread number -1.
// NOTE(review): the key parameter is not used by any visible statement; moving the thread
// between the vectors, starting the task and queueing it are not visible either.
public synchronized void doTask(InterruptableTask t, Object key) {
if (!idleThreads.isEmpty()) {
ServerThread s = (ServerThread) idleThreads.firstElement();
sendMessage(s.getThreadNumber(), THREAD_START, t.getInfo());
sendMessage(s.getThreadNumber(), THREAD_RUNNING, t.getInfo());
} else {
sendMessage(-1, THREADQUEUE_ADD, t.getInfo());
* this will interrupt all threads. Therefore the InterruptableTasks must
* attend on the interrupted-flag
// Announces THREADQUEUE_REMOVE for every queued task, then calls interruptTask() on every
// busy thread. This method is not synchronized.
public void interrupt() {
Iterator tasks = queue.iterator();
while (tasks.hasNext()) {
InterruptableTask t = (InterruptableTask) tasks.next();
sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo());
// hoping that everything works out...
Iterator threads = busyThreads.iterator();
while (threads.hasNext()) {
((ServerThread) threads.next()).interruptTask();
* this will interrupt the tasks and end all threads
// Sets isStopped; the visible loop calls interruptTask() on the IDLE threads.
// NOTE(review): any handling of busy threads is not visible in this listing.
public void stop() {
isStopped = true;
Iterator threads = idleThreads.iterator();
while (threads.hasNext()) {
((ServerThread) threads.next()).interruptTask();
* wird von einem ServerThread aufgerufen, wenn dieser fertig ist
*@param s Description of the Parameter
*@param: ServerThread s - der aufrufende Thread
// (English) called by a ServerThread when it has finished; s is the calling thread.
// Stopped pool: announce THREAD_STOP. Queue not empty: take the next task and announce
// THREADQUEUE_REMOVE, THREAD_START, THREAD_RUNNING. Otherwise announce THREAD_IDLE.
// NOTE(review): ServerThread.taskReady() sets its task to null before it iterates its
// listeners; if this method is reached that way, s.getTask() is null in the isStopped
// branch and getInfo() would fail - confirm.
public synchronized void taskReady(ServerThread s) {
if (isStopped) {
sendMessage(s.getThreadNumber(), THREAD_STOP, s.getTask().getInfo());
} else if (!queue.isEmpty()) {
InterruptableTask t = (InterruptableTask) queue.remove();
sendMessage(-1, THREADQUEUE_REMOVE, t.getInfo());
sendMessage(s.getThreadNumber(), THREAD_START, "");
sendMessage(s.getThreadNumber(), THREAD_RUNNING, s.getTask().getInfo());
} else {
sendMessage(s.getThreadNumber(), THREAD_IDLE, "");
// NOTE(review): the statements inside this synchronized block are not visible here.
synchronized (idleThreads) {
* Description of the Method
// Loops, holding the idleThreads monitor, until busyThreads is empty.
// NOTE(review): the waiting call inside the try is not visible in this listing.
public void waitForFinish() {
synchronized (idleThreads) {
while (busyThreads.size() != 0) {
//System.out.println("busyThreads: " + busyThreads.size());
try {
} catch (InterruptedException e) {
System.out.println("Interrupted: " + e.getMessage());
//System.out.println("busyThreads: " + busyThreads.size());
* Adds a feature to the ThreadPoolObserver attribute of the ThreadPool
* object
*@param o The feature to be added to the ThreadPoolObserver attribute
// NOTE(review): no statements of this method are visible in this listing.
public void addThreadPoolObserver(ThreadPoolObserver o) {
* Description of the Method
*@param threadNr Description of the Parameter
*@param action Description of the Parameter
*@param info Description of the Parameter
// Forwards a status change to every observer: a thread number other than -1 goes to
// threadUpdate(threadNr, action, info), -1 goes to queueUpdate(info, action).
protected void sendMessage(int threadNr, String action, String info) {
Iterator Ie = threadPoolObservers.iterator();
//System.out.println("ThreadPool: Sende " + action + " message an " + threadPoolObservers.size() + " Observers");
if (threadNr != -1) {
while (Ie.hasNext()) {
((ThreadPoolObserver) Ie.next()).threadUpdate(threadNr, action, info);
} else {
while (Ie.hasNext()) {
((ThreadPoolObserver) Ie.next()).queueUpdate(info, action);
* Gets the queueSize attribute of the ThreadPool object
*@return The queueSize value
public synchronized int getQueueSize() {
return this.queue.size();
* Gets the idleThreadsCount attribute of the ThreadPool object
*@return The idleThreadsCount value
public synchronized int getIdleThreadsCount() {
return this.idleThreads.size();
* Gets the busyThreadsCount attribute of the ThreadPool object
*@return The busyThreadsCount value
public synchronized int getBusyThreadsCount() {
return this.busyThreads.size();
* Gets the threadCount attribute of the ThreadPool object
*@return The threadCount value
// sum of idle and busy threads, not the size of allThreads
public synchronized int getThreadCount() {
return this.idleThreads.size() + this.busyThreads.size();
* Gets the threadIterator attribute of the ThreadPool object
*@return The threadIterator value
// iterates the live values view of allThreads; not synchronized
public Iterator getThreadIterator() {
return allThreads.values().iterator();
// return allThreads.iterator();
* Description of the Method
*@param queue Description of the Parameter
// replaces the task queue; tasks in the previous queue are not carried over by this call
public void setQueue(TaskQueue queue) {
this.queue = queue;
public TaskQueue getTaskQueue()
return queue;
@ -0,0 +1,12 @@
package de.lanlab.larm.threads;
import de.lanlab.larm.util.Observer;
* an observer that observes the thread pool...
public interface ThreadPoolObserver extends Observer
public void queueUpdate(String info, String action);
public void threadUpdate(int threadNr, String action, String info);
@ -0,0 +1,8 @@
package de.lanlab.larm.threads;
public interface ThreadingStrategy
public void doTask(InterruptableTask t, Object key);
public void interrupt();
public void stop();
@ -0,0 +1,721 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c)<p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.util;
import java.io.*;
import java.util.*;
/**
 * Unchecked wrapper for an exception raised while queue data is written to or read from
 * disk. The wrapped exception is kept in {@link #origException} and is also registered as
 * the {@linkplain Throwable#getCause() cause}, so it shows up in chained stack traces.
 */
class StoreException extends RuntimeException
{
    /** The wrapped exception; null if none was supplied. */
    Exception origException;

    /**
     * Constructor for the StoreException object
     *
     * @param e the exception to wrap; may be null
     */
    public StoreException(Exception e)
    {
        // register the cause so getCause() and "Caused by:" output work
        super(e);
        origException = e;
    }

    /**
     * Gets the message attribute of the StoreException object
     *
     * @return the message of the wrapped exception, or null if there is none
     */
    public String getMessage()
    {
        return origException == null ? null : origException.getMessage();
    }

    /**
     * Prints a one-line reason to System.err, followed by the stack trace of this
     * exception including its cause.
     */
    public void printStackTrace()
    {
        System.err.println("StoreException occured with reason: " + getMessage());
        // Throwable.printStackTrace() delegates to the PrintStream overload, so this
        // does not recurse into this override
        super.printStackTrace();
    }
}
* internal class that represents one block within a queue
* @author Clemens Marschner
* @created 3. Januar 2002
// One block of a CachingQueue: a LinkedList of elements that is either held in memory or
// serialized to the file "cachingqueue/<name>.cqb".
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
class QueueBlock
* the elements section will be set to null if it is on disk Vector elements
* must be Serializable
LinkedList elements;
* Anzahl Elemente im Block. Kopie von elements.size()
// (English) number of elements in the block; a copy of elements.size()
int size;
* maximale Blockgröße
// (English) maximum block size
int maxSize;
* if set, elements is null and block was written to file
boolean onDisk;
* Blockname
// (English) block name; part of the file name, see getFileName()
String name;
* initialisiert den Block
* @param name Der Blockname (muss eindeutig sein, sonst Kollision auf
* Dateiebene)
* @param maxSize maximale Blockgröße. Über- und Unterläufe werden durch
* Exceptions behandelt
// (English) initializes the block. name must be unique, otherwise block files collide;
// maxSize is the maximum block size, overflow and underflow are signalled by exceptions.
public QueueBlock(String name, int maxSize)
this.name = name;
this.onDisk = false;
this.elements = new LinkedList();
this.maxSize = maxSize;
* serialisiert und speichert den Block auf Platte
* @exception StoreException Description of the Exception
// (English) serializes the block and stores it on disk.
// Opens an ObjectOutputStream on the block file, then drops the in-memory list and marks
// the block as on disk. An IOException is reported and wrapped in a StoreException.
// NOTE(review): the write and close calls are not visible in this listing; if one of them
// throws, whether the stream still gets closed cannot be determined from here.
public void store()
throws StoreException
ObjectOutputStream o = new ObjectOutputStream(new FileOutputStream(getFileName()));
elements = null;
onDisk = true;
//System.out.println("CachingQueue.store: Block stored");
catch (IOException e)
System.err.println("CachingQueue.store: IOException");
throw new StoreException(e);
* @return the filename of the block
String getFileName()
// package protected!
return "cachingqueue/" + name + ".cqb";
* load the block from disk
* @exception StoreException Description of the Exception
// Reads the list back from the block file, marks the block as in memory, refreshes size
// and deletes the file (a failed delete is only reported). Any Exception - including a
// failed cast or class lookup - is reported and wrapped in a StoreException.
// NOTE(review): no close call for the input stream is visible in this listing.
public void load()
throws StoreException
ObjectInputStream i = new ObjectInputStream(new FileInputStream(getFileName()));
elements = (LinkedList) i.readObject();
onDisk = false;
size = elements.size();
if (!(new File(getFileName()).delete()))
System.err.println("CachingQueue.load: file could not be deleted");
//System.out.println("CachingQueue.load: Block loaded");
catch (Exception e)
System.err.println("CachingQueue.load: Exception " + e.getClass().getName() + " occured");
throw new StoreException(e);
* inserts an object at the start of the queue must be synchronized by
* calling class to be thread safe
* @param o Description of the Parameter
* @exception StoreException Description of the Exception
// Throws OverflowException when the block already holds maxSize elements.
// NOTE(review): the body of the onDisk check (presumably a reload) and the statements that
// add the element are not visible in this listing.
public void insert(Object o)
throws StoreException
if (onDisk)
if (size >= maxSize)
throw new OverflowException();
* gibt das letzte Element aus der Queue zurück und löscht dieses must be
* made synchronized by calling class to be thread safe
* @return Description of the Return Value
* @exception UnderflowException Description of the Exception
* @exception StoreException Description of the Exception
// (English) returns the last element of the queue and removes it.
// Throws UnderflowException when size is not positive.
// NOTE(review): the body of the onDisk check and any update of size are not visible here.
public Object remove()
throws UnderflowException, StoreException
if (onDisk)
if (size <= 0)
throw new UnderflowException();
return elements.removeLast();
* @return the number of elements in the block
// returns the cached counter, which stays usable while elements is null (block on disk)
public int size()
return size;
* destructor. Assures that all files are deleted, even if the queue was not
* empty at the time when the program ended
// Deletes the block file if the block is still on disk; a failed delete is ignored.
public void finalize()
// System.err.println("finalize von " + name + " called");
if (onDisk)
// delete the temp file. Happens e.g. when an exception has occurred
// System.err.println("CachingQueue.finalize von Block " + name + ": lösche Datei");
if (!(new File(getFileName()).delete()))
// file errors are possible because of an exception: ignore
// System.err.println("CachingQueue.finalize: file could not be deleted although onDisk was true");
* this class holds a queue whose data is kept on disk whenever possible.
* It's a single ended queue, meaning data can only be added at the front and
* taken from the back. the queue itself is divided into blocks. Only the first
* and last blocks are kept in main memory, the rest is stored on disk. Only a
* LinkedList entry is kept in memory then.
* Blocks are swapped if an overflow (in case of insertions) or underflow (in case
* of removals) occur.<br>
* <pre>
* +---+---+---+---+-+
* put -> | M | S | S | S |M| -> remove
* +---+---+---+---+-+
* </pre>
* the maximum number of entries can be specified with the blockSize parameter. Thus,
* the queue actually holds a maximum number of 2 x blockSize objects in main memory,
* plus a few bytes for each block.<br>
* The objects contained in the blocks are stored with the standard Java
* serialization mechanism
* The files are named "cachingqueue\\Queuename_BlockNumber.cqb"
* note that the class is not synchronized
* @author Clemens Marschner
* @created 3. Januar 2002
// Queue made of QueueBlocks; elements are inserted into the first block and removed from
// the last one. insert() and remove() are declared synchronized.
// NOTE(review): brace-only and other single-token lines appear to be absent from this
// listing; the comments below describe only the statements that are visible.
public class CachingQueue implements Queue
* the Blocks
LinkedList queueBlocks;
* fast access to the first block
// block that receives insertions
QueueBlock first = null;
* fast access to the last block
// block that removals are served from
QueueBlock last = null;
* maximum block size
int blockSize;
* "primary key" identity count for each block
// post-incremented in newBlock(); becomes part of the block name
int blockCount = 0;
* active blocks
int numBlocks = 0;
* queue name
String name;
* total number of objects
int size;
* init
* @param name the name of the queue, used in files names
* @param blockSize maximum number of objects stored in one block
// NOTE(review): a File object for the "cachingqueue" directory is created; the call that
// would create the directory on disk is not visible in this listing.
public CachingQueue(String name, int blockSize)
queueBlocks = new LinkedList();
this.name = name;
this.blockSize = blockSize;
File cq = new File("cachingqueue");
* inserts an object to the front of the queue
* @param o the object to be inserted. must implement Serializable
* @exception StoreException encapsulates Exceptions that occur when writing to hard disk
// Creates the first block on demand and starts a new first block once the current one
// holds blockSize elements.
// NOTE(review): block-list updates, the store of the previous first block, the actual
// element insertion and the size update are not visible in this listing.
public synchronized void insert(Object o)
throws StoreException
if (last == null && first == null)
first = last = newBlock();
// inconsistent state: a first block without a last block is only reported
if (last == null && first != null)
// assert((last==null && first==null) || (last!= null && first!=null));
System.err.println("Error in CachingQueue: last!=first==null");
if (first.size() >= blockSize)
// save block and create a new one
QueueBlock newBlock = newBlock();
// NOTE(review): the body of this check is not visible; presumably the old first block is
// written to disk unless it is also the last block.
if (last != first)
first = newBlock;
* returns the last object from the queue
* @return the object returned
* @exception StoreException Description of the Exception
* @exception UnderflowException if the queue was empty
// Throws UnderflowException when there is no block at all. When the last block is empty,
// the next block is selected: with one block left it is the first block, with none left
// both references are cleared and UnderflowException is thrown, otherwise the last entry
// of queueBlocks is used.
// NOTE(review): removal of the exhausted block from queueBlocks and the updates of
// numBlocks and size are not visible in this listing.
public synchronized Object remove()
throws StoreException, UnderflowException
if (last == null)
throw new UnderflowException();
if (last.size() <= 0)
if (numBlocks == 1)
last = first;
else if (numBlocks == 0)
first = last = null;
throw new UnderflowException();
else if (numBlocks < 0)
// assert(numBlocks >= 0)
System.err.println("CachingQueue.remove: numBlocks<0!");
throw new UnderflowException();
last = (QueueBlock) queueBlocks.getLast();
return last.remove();
* not supported
* @param c Description of the Parameter
public void insertMultiple(java.util.Collection c)
throw new UnsupportedOperationException();
* creates a new block
* @return Description of the Return Value
// block names are "<queue name>_<running number>", which also determines the file name
private QueueBlock newBlock()
return new QueueBlock(name + "_" + blockCount++, blockSize);
* total number of objects contained in the queue
* @return Description of the Return Value
// not synchronized, unlike insert() and remove()
public int size()
return size;
* testing
* @param args The command line arguments
// runs the seven CachingQueueTester checks and prints each boolean result
public static void main(String[] args)
System.out.println("Test1: " + CachingQueueTester.testUnderflow());
System.out.println("Test2: " + CachingQueueTester.testInsert());
System.out.println("Test3: " + CachingQueueTester.testBufReadWrite());
System.out.println("Test4: " + CachingQueueTester.testBufReadWrite2());
System.out.println("Test5: " + CachingQueueTester.testUnderflow2());
System.out.println("Test6: " + CachingQueueTester.testBufReadWrite3());
System.out.println("Test7: " + CachingQueueTester.testExceptions());
/**
 * Test helper: thrown by {@code CachingQueueTester.assert(boolean)} when the checked
 * expression is false. TODO: move the tests out of this file and handle them with JUnit.
 *
 * @author Administrator
 * @created 3. Januar 2002
 */
class AssertionFailedException extends RuntimeException
{
}
* Testklasse. Enthält einige Tests für die Funktionalität der CachingQueue
* @author Administrator
* @created 3. Januar 2002
class CachingQueueTester
* A unit test for JUnit
* @return Description of the Return Value
// Passes (returns true) when an UnderflowException is caught for a freshly created
// queue; any other Exception makes it return false.
// NOTE(review): the statement that triggers the exception is not visible in this listing.
public static boolean testUnderflow()
CachingQueue cq = new CachingQueue("testQueue1", 10);
catch (UnderflowException e)
return true;
catch (Exception e)
return false;
* A unit test for JUnit
* @return Description of the Return Value
// Checks the size before and after an insertion and that remove() hands back the very
// same object (reference comparison).
// NOTE(review): the insert call between the two size checks is not visible in this listing.
public static boolean testInsert()
CachingQueue cq = new CachingQueue("testQueue2", 10);
String test = "Test1";
assert(cq.size() == 0);
assert(cq.size() == 1);
return (cq.remove() == test);
* A unit test for JUnit
* @return Description of the Return Value
// Uses a block size of 2 with three values, so more than one block is needed; expects
// size 3, later size 1, and test3 as the object returned last (reference comparison).
// NOTE(review): the insert and intermediate remove calls are not visible in this listing.
public static boolean testBufReadWrite()
CachingQueue cq = new CachingQueue("testQueue3", 2);
String test1 = "Test1";
String test2 = "Test2";
String test3 = "Test3";
assert(cq.size() == 3);
assert(cq.size() == 1);
return (cq.remove() == test3);
* A unit test for JUnit
* @return Description of the Return Value
public static boolean testBufReadWrite2()
CachingQueue cq = new CachingQueue("testQueue4", 2);
String test1 = "Test1";
String test2 = "Test2";
String test3 = "Test3";
String test4 = "Test4";
String test5 = "Test5";
assert(cq.size() == 5);
String t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
assert(cq.size() == 0);
return (t.equals(test5));
* Description of the Method
* @param expr Description of the Parameter
public static void assert(boolean expr)
if (!expr)
throw new AssertionFailedException();
* A unit test for JUnit
* @return Description of the Return Value
public static boolean testUnderflow2()
CachingQueue cq = new CachingQueue("testQueue5", 2);
String test1 = "Test1";
String test2 = "Test2";
String test3 = "Test3";
String test4 = "Test4";
String test5 = "Test5";
catch (UnderflowException e)
return true;
return false;
* A unit test for JUnit
* @return Description of the Return Value
public static boolean testBufReadWrite3()
CachingQueue cq = new CachingQueue("testQueue4", 1);
String test1 = "Test1";
String test2 = "Test2";
String test3 = "Test3";
String test4 = "Test4";
String test5 = "Test5";
String t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
return (t.equals(test5));
* A unit test for JUnit
* @return Description of the Return Value
public static boolean testExceptions()
CachingQueue cq = new CachingQueue("testQueue5", 1);
String test1 = "Test1";
String test2 = "Test2";
String test3 = "Test3";
String test4 = "Test4";
String test5 = "Test5";
if (!(new File("testQueue5_1.cqb").delete()))
System.err.println("CachingQueueTester.textExceptions: Store 1 nicht vorhanden. Filename geändert?");
if (!(new File("testQueue5_2.cqb").delete()))
System.err.println("CachingQueueTester.textExceptions: Store 2 nicht vorhanden. Filename geändert?");
String t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
t = (String) cq.remove();
catch (StoreException e)
return true;
cq = null;
// finalizer müssten aufgerufen werden
return false;
@ -0,0 +1,273 @@
package de.lanlab.larm.util;
import java.lang.reflect.*;
import java.io.*;
import java.util.*;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
* prints class information with the reflection api
* for debugging only
public class ClassInfo
// No-argument constructor; every other member of this class visible here is static.
public ClassInfo()
* Usage: java ClassInfo PackageName.MyNewClassName PackageName.DerivedClassName
// Loads the class named by args[0] via reflection and writes Java source text for a
// class named args[1] that extends (or, for an interface, implements) it: one stub
// per non-private, non-final method not declared by java.lang.Object, each throwing
// UnsupportedOperationException.
// NOTE(review): the string comparisons below use != (reference comparison) against
// literals such as "java.lang.Object", "" and "void". They only behave as intended
// when both sides are the same interned instance; equals() would be the safe form.
// NOTE(review): parts of this method (loop headers, the try, the final output
// statements) are not visible in this listing - confirm against the full source.
public static void main(String[] args)
String name = args[0];
String derivedName = args[1];
LinkedList l = new LinkedList();
ListIterator itry = l.listIterator();
Class cls = Class.forName(name);
name = cls.getName();
String pkg = getPackageName(name);
String clss = getClassName(name);
// imports and class body are buffered in separate writers
StringWriter importsWriter = new StringWriter();
PrintWriter imports = new PrintWriter(importsWriter);
StringWriter outWriter = new StringWriter();
PrintWriter out = new PrintWriter(outWriter);
// sorted set of class names
TreeSet importClasses = new TreeSet();
out.println("/**\n * (class description here)\n */\npublic class " + derivedName + " " + (cls.isInterface() ? "implements " : "extends ") + clss + "\n{");
Method[] m = cls.getMethods();
for(int i= 0; i< m.length; i++)
Method thism = m[i];
if((thism.getModifiers() & Modifier.PRIVATE) == 0 && ((thism.getModifiers() & Modifier.FINAL) == 0)
&& (thism.getDeclaringClass().getName() != "java.lang.Object"))
out.println(" /**");
out.println(" * (method description here)");
out.println(" * defined in " + thism.getDeclaringClass().getName());
Class[] parameters = thism.getParameterTypes();
for(int j = 0; j < parameters.length; j ++)
if(getPackageName(parameters[j].getName()) != "")
out.println(" * @param p" + j + " (parameter description here)");
if(thism.getReturnType().getName() != "void")
String returnPackage = getPackageName(thism.getReturnType().getName());
if(returnPackage != "")
out.println(" * @return (return value description here)");
out.println(" */");
// method signature: modifiers, simple return type name, method name, parameters p0..pn
out.print(" " + getModifierString(thism.getModifiers()) + getClassName(thism.getReturnType().getName()) + " ");
out.print(thism.getName() + "(");
for(int j = 0; j < parameters.length; j ++)
out.print(", ");
out.print(getClassName(parameters[j].getName()) + " p" + j);
Class[] exceptions = thism.getExceptionTypes();
if (exceptions.length > 0)
out.print(" throws ");
for(int k = 0; k < exceptions.length; k++)
if(k > 0)
out.print(", ");
String exCompleteName = exceptions[k].getName();
String exName = getClassName(exCompleteName);
// generated stub body
out.print("\n" +
" {\n" +
" /**@todo: Implement this " + thism.getName() + "() method */\n" +
" throw new UnsupportedOperationException(\"Method " + thism.getName() + "() not yet implemented.\");\n" +
" }\n\n");
Iterator importIterator = importClasses.iterator();
String importName = (String)importIterator.next();
imports.println("import " + importName + ";");
if(getPackageName(derivedName) != "")
System.out.println("package " + getPackageName(derivedName) + ";\n");
System.out.println( "/**\n" +
" * Title: \n" +
" * Description:\n" +
" * Copyright: Copyright (c)\n" +
" * Company:\n" +
" * @author\n" +
" * @version 1.0\n" +
" */\n");
catch(Throwable t)
public static String getPackageName(String className)
if(className.charAt(0) == '[')
case 'L':
return getPackageName(className.substring(2,className.length()-1));
return "";
String name = className.lastIndexOf(".") != -1 ? className.substring(0, className.lastIndexOf(".")) : "";
//System.out.println("Package: " + name);
return name;
public static String getClassName(String className)
if(className.charAt(0) == '[')
case 'L':
return getClassName(className.substring(2,className.length()-1)) + "[]";
case 'C':
return "char[]";
case 'I':
return "int[]";
case 'B':
return "byte[]";
// rest is missing here
String name = (className.lastIndexOf(".") > -1) ? className.substring(className.lastIndexOf(".")+1) : className;
//System.out.println("Class: " + name);
return name;
// Builds the fully qualified name to import for className: package name, a dot and
// the simple class name. When the simple name contains "[]", its last two characters
// are cut off before joining.
// NOTE(review): for a class without a package this yields a name starting with ".".
static String getImportStatement(String className)
String pack = getPackageName(className);
String clss = getClassName(className);
if(clss.indexOf("[]") > -1)
return pack + "." + clss.substring(0,clss.length() - 2);
return pack + "." + clss;
// Converts a java.lang.reflect.Modifier bit set into source keywords. Each set flag
// appends its keyword followed by one space, in the fixed order tested below
// (abstract, final, interface, native, private, protected, public, static, strictfp,
// synchronized, transient, volatile). Returns "" when no tested flag is set.
public static String getModifierString(int modifiers)
StringBuffer mods = new StringBuffer();
if((modifiers & Modifier.ABSTRACT) != 0)
mods.append("abstract ");
if((modifiers & Modifier.FINAL) != 0)
mods.append("final ");
if((modifiers & Modifier.INTERFACE) != 0)
mods.append("interface ");
if((modifiers & Modifier.NATIVE) != 0)
mods.append("native ");
if((modifiers & Modifier.PRIVATE) != 0)
mods.append("private ");
if((modifiers & Modifier.PROTECTED) != 0)
mods.append("protected ");
if((modifiers & Modifier.PUBLIC) != 0)
mods.append("public ");
if((modifiers & Modifier.STATIC) != 0)
mods.append("static ");
if((modifiers & Modifier.STRICT) != 0)
mods.append("strictfp ");
if((modifiers & Modifier.SYNCHRONIZED) != 0)
mods.append("synchronized ");
if((modifiers & Modifier.TRANSIENT) != 0)
mods.append("transient ");
if((modifiers & Modifier.VOLATILE) != 0)
mods.append("volatile ");
return mods.toString();
@ -0,0 +1,319 @@
package de.lanlab.larm.util;
* Title:
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.util.*;
* simple hashed linked list. It allows for inserting and removing elements like
* in a hash table (in fact, it uses a HashMap), while still being able to easily
* traverse the collection like a list. In addition, the iterator is circular. It
* always returns a next element as long as there are elements in the list. In
* contrast to the iterator of Sun's collection classes, this class can cope with
* inserts and removals while traversing the list.<p>
* Elements are always added to the end of the list, that is, always at the same place<br>
* All operations should work in near constant time as the list grows. Only the
* trade-off costs of a hash (memory versus speed) have to be considered.
* The List doesn't accept null elements
* @todo put the traversal function into an Iterator
* @todo implement the class as a derivate from a Hash
// Doubly linked list with a sentinel header entry, combined with a HashMap from key
// to list entry. next() walks the list circularly, skipping the header.
// NOTE(review): several statements (e.g. the updates of size and of the keys map in
// put/removeByKey/clear, and the body of removeCurrent) are not visible in this
// listing - confirm against the complete source.
public class HashedCircularLinkedList
* Entry class.
// One list node: the stored element, its key, and links to both neighbours.
private static class Entry
Object key;
Object element;
Entry next;
Entry previous;
Entry(Object element, Entry next, Entry previous, Object key)
this.element = element;
this.next = next;
this.previous = previous;
this.key = key;
* the list. contains objects
// sentinel entry; it carries no element and is linked to itself when the list is empty
private transient Entry header = new Entry(null, null, null, null);
* the hash. maps keys to entries, which by themselves map to objects
HashMap keys;
private transient int size = 0;
/** the current entry in the traversal */
Entry current = null;
* Constructs an empty list.
// initialCapacity and loadFactor are passed straight to the HashMap constructor
public HashedCircularLinkedList(int initialCapacity, float loadFactor)
header.next = header.previous = header;
keys = new HashMap(initialCapacity, loadFactor);
* Returns the number of elements in this list.
* @return the number of elements in this list.
public int size()
return size;
* Removes the first occurrence of the specified element in this list. If
* the list does not contain the element, it is unchanged. More formally,
* removes the element with the lowest index <tt>i</tt> such that
* <tt>(o==null ? get(i)==null : o.equals(get(i)))</tt> (if such an
* element exists).
* @param o element to be removed from this list, if present.
* @return <tt>true</tt> if the list contained the specified element.
// Looks the entry up by key o in the keys map. If the entry is the traversal's
// current one, current is moved to the previous entry (or reset to null).
// Returns true when an entry was found for the key, false otherwise.
// NOTE(review): the description above talks about elements and indices, but the
// lookup is by key.
public boolean removeByKey(Object o)
// assert(o != null)
Entry e = (Entry)keys.get(o);
if(e != null)
if(e == current)
if(size > 1)
current = previousEntry(current);
current = null;
return true;
return false;
* Removes all of the elements from this list.
public void clear()
// list
header.next = header.previous = header;
// hash
size = 0;
current = null;
// Links a new entry (key, o) directly in front of e and returns it.
private Entry addEntryBefore(Object key, Object o, Entry e)
Entry newEntry = new Entry(o, e, e.previous, key);
newEntry.previous.next = newEntry;
newEntry.next.previous = newEntry;
return newEntry;
// Unlinks e from its neighbours; the header entry must never be removed.
private void removeEntryFromList(Entry e)
if(e != null)
if (e == header)
throw new NoSuchElementException();
e.previous.next = e.next;
e.next.previous = e.previous;
* (method description here)
* defined in java.util.Map
* @param p0 (parameter description here)
* @param p1 (parameter description here)
* @return (return value description here)
// Appends value under key when key is non-null and not yet present; returns true
// in that case and false otherwise (the existing mapping is left untouched).
public boolean put(Object key, Object value)
if(key != null && !keys.containsKey(key))
Entry e = addEntryBefore(key, value, header); // add it as the last element
keys.put(key, e); // link key to entry
return true;
return false;
// true as long as the list holds at least one element (the traversal is circular)
public boolean hasNext()
return (size > 0);
// Successor of e, wrapping around and skipping the header; a null e starts at the
// header. Returns null when the list is empty.
private Entry nextEntry(Entry e)
// assert(e != null)
if(size > 1)
if(e == null)
e = header;
Entry next = e.next;
if(next == header)
next = next.next;
return next;
else if(size == 1)
return header.next;
return null;
// Mirror image of nextEntry: predecessor of e, wrapping around and skipping the header.
private Entry previousEntry(Entry e)
// assert(e != null)
if(size > 1)
if(e == null)
e = header;
Entry previous = e.previous;
if(previous == header)
previous = previous.previous;
return previous;
else if(size == 1)
return header.previous;
return null;
// Advances the traversal and returns the new current element, or null when
// nextEntry yields no entry.
public Object next()
current = nextEntry(current);
if(current != null)
return current.element;
return null;
public void removeCurrent()
// Returns the element stored under key, or null when the key is unknown.
public Object get(Object key)
Entry e = ((Entry)keys.get(key));
if(e != null)
return e.element;
return null;
* testing
// Manual smoke test; the expected value of each output is given in square brackets.
public static void main(String[] args)
HashedCircularLinkedList h = new HashedCircularLinkedList(20, 0.75f);
h.put("1", "a");
h.put("2", "b");
h.put("3", "c");
String t;
System.out.println("size [3]: " + h.size());
t = (String)h.next();
System.out.println("2nd element via get [b]: " + h.get("2"));
System.out.println("next element [a]: " + t);
t = (String)h.next();
System.out.println("next element [b]: " + t);
t = (String)h.next();
System.out.println("next element [c]: " + t);
t = (String)h.next();
System.out.println("1st element after circular traversal [a]: " + t);
System.out.println("1st element after remove [null]: " + h.get("1"));
System.out.println("size after removal [2]: " + h.size());
t = (String)h.next();
System.out.println("next element [b]: " + t);
t = (String)h.next();
System.out.println("next element [c]: " + t);
t = (String)h.next();
System.out.println("next element [b]: " + t);
t = (String)h.next();
System.out.println("next element after 1 removal [c]: " + t);
t = (String)h.next();
System.out.println("next element: [c]: " + t);
System.out.println("size after 3 removals [0]: " + h.size());
t = (String)h.next();
System.out.println("next element [null]: " + t);
@ -0,0 +1,18 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c) <p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.util;
// Callback interface through which an ObservableInputStream reports its activity.
// In every callback, "in" is the reporting stream and timeElapsed is the number of
// milliseconds since that stream recorded its start time.
public interface InputStreamObserver
// invoked from the ObservableInputStream constructor
public void notifyOpened(ObservableInputStream in, long timeElapsed);
// invoked from ObservableInputStream.close()
public void notifyClosed(ObservableInputStream in, long timeElapsed);
// nrRead: size of the latest read; totalRead: running total for the stream
public void notifyRead(ObservableInputStream in, long timeElapsed, int nrRead, int totalRead);
// totalRead: running total for the stream
public void notifyFinished(ObservableInputStream in, long timeElapsed, int totalRead);
@ -0,0 +1,19 @@
package de.lanlab.larm.util;
import java.io.*;
// Logger skeleton; only the stream field and the constructor signature are visible here.
public class Logger
// presumably the stream opened on fileName - TODO confirm, the constructor body is not visible
private FileOutputStream out;
public Logger(String fileName)
@ -0,0 +1,101 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c) <p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.util;
import java.io.*;
// FilterInputStream that reports open, read and close events to an
// InputStreamObserver, passing the milliseconds elapsed since construction.
// NOTE(review): some statements (e.g. the super constructor call and the calls to
// notifyObserver in the array read methods) are not visible in this listing.
public class ObservableInputStream extends FilterInputStream
private boolean reporting = true;
// time of construction, System.currentTimeMillis()
private long startTime;
// running total of bytes read
private int totalRead = 0;
// distance in bytes between two read notifications
private int step = 1;
// totalRead threshold that triggers the next read notification
private int nextStep = 0;
InputStreamObserver observer;
// Records the start time, stores the observer, reports the opening and
// initialises both step and nextStep to reportingStep.
public ObservableInputStream(InputStream in, InputStreamObserver iso, int reportingStep)
startTime = System.currentTimeMillis();
observer = iso;
observer.notifyOpened(this, System.currentTimeMillis() - startTime);
nextStep = step = reportingStep;
public void close() throws IOException
observer.notifyClosed(this, System.currentTimeMillis() - startTime);
public void setReporting(boolean reporting)
this.reporting = reporting;
public boolean isReporting()
return reporting;
// changes the notification distance; nextStep is not adjusted here
public void setReportingStep(int step)
this.step = step;
// Reads one byte and reports 1 byte read, or 0 when the end of stream (-1) was hit.
public int read() throws IOException
int readByte = super.read();
notifyObserver(readByte>=0? 1 : 0);
return readByte;
public int read(byte[] b) throws IOException
int nrRead = super.read(b);
return nrRead;
// Adds nrRead to the running total and, once the total exceeds nextStep, advances
// nextStep by step and calls notifyRead.
// NOTE(review): the branch structure around notifyFinished is not visible here -
// confirm under which condition it fires.
private void notifyObserver(int nrRead)
if(nrRead > 0)
totalRead += nrRead;
if(totalRead > nextStep)
nextStep += step;
observer.notifyRead(this, System.currentTimeMillis() - startTime, nrRead, totalRead);
observer.notifyFinished(this, System.currentTimeMillis() - startTime, totalRead);
public int read(byte[] b, int offs, int size) throws IOException
int nrRead = super.read(b, offs, size);
return nrRead;
@ -0,0 +1,9 @@
package de.lanlab.larm.util;
* not used
// Interface without visible members; marked "not used" by its author.
public interface Observer
@ -0,0 +1,15 @@
package de.lanlab.larm.util;
* Title: LARM
* Description:
* Copyright: Copyright (c) 2001
* Company: LMU-IP
* @author Clemens Marschner
* @version 1.0
// Unchecked exception type; by its name the counterpart of UnderflowException.
// NOTE(review): no throw site is visible in this listing.
public class OverflowException extends RuntimeException
@ -0,0 +1,20 @@
package de.lanlab.larm.util;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.util.Collection;
// Minimal queue abstraction for arbitrary objects.
public interface Queue
// takes one object out of the queue and returns it
public Object remove();
// adds a single object
public void insert(Object o);
// adds the objects of collection c
public void insertMultiple(Collection c);
// number of objects in the queue
public int size();
@ -0,0 +1,285 @@
* @(#)SimpleCharArrayReader.java 1.35 00/02/02
package de.lanlab.larm.util;
import java.io.*;
* A <code>SimpleCharArrayReader</code> contains
* an internal buffer that contains bytes that
* may be read from the stream. An internal
* counter keeps track of the next byte to
* be supplied by the <code>read</code> method.
* <br>
* In contrast to the original <code>CharArrayReader</code> this
* version is not thread safe. The monitor on the read()-function caused programs
* to slow down much, because this function is called for every character. This
* class can thus only be used if only one thread is accessing the stream
* @author Clemens Marschner
* @version 1.00
* @see java.io.ByteArrayInputStream
/**
 * Unsynchronized {@link Reader} over a char array.
 * <p>
 * The class mirrors <code>java.io.CharArrayReader</code> but takes no lock, so an
 * instance must only be used by one thread. The buffer passed to the
 * constructors is used directly and never copied.
 */
class SimpleCharArrayReader extends Reader
{
    /**
     * Set to true by {@link #close()}. The read methods do not consult it.
     */
    private boolean isClosed = false;

    /**
     * The chars provided by the creator of the reader. Only
     * <code>buf[0]</code> through <code>buf[count-1]</code> can ever be read;
     * <code>buf[pos]</code> is the next char to be read.
     */
    protected char buf[];

    /**
     * Index of the next char to read; never negative and never larger than
     * <code>count</code>.
     */
    protected int pos;

    /**
     * The marked position, i.e. the position {@link #reset()} returns to.
     * It is 0 (or the constructor offset) until {@link #mark(int)} is called.
     */
    protected int mark = 0;

    /**
     * Index one greater than the last readable char; never negative and never
     * larger than <code>buf.length</code>.
     */
    protected int count;

    /**
     * Creates a reader over the whole array. <code>pos</code> starts at 0 and
     * <code>count</code> is <code>buf.length</code>.
     *
     * @param buf the input buffer (not copied)
     */
    public SimpleCharArrayReader(char buf[])
    {
        this.buf = buf;
        this.pos = 0;
        this.count = buf.length;
    }

    /**
     * Creates a reader over a section of the array. Reading starts at
     * <code>offset</code> and ends before
     * <code>min(offset + length, buf.length)</code>; the mark is set to
     * <code>offset</code>, so {@link #reset()} returns to the start of the section.
     *
     * @param buf    the input buffer (not copied)
     * @param offset index of the first char to read
     * @param length maximum number of chars to read
     */
    public SimpleCharArrayReader(char buf[], int offset, int length)
    {
        this.buf = buf;
        this.pos = offset;
        this.count = Math.min(offset + length, buf.length);
        this.mark = offset;
    }

    /**
     * Reads the next char.
     *
     * @return the char as an int in the range 0 to 65535, or <code>-1</code>
     *         if the end of the buffer has been reached
     */
    public int read()
    {
        // return the full 16-bit char; masking with 0xff (as the byte stream
        // original does) would corrupt every char above U+00FF and disagree
        // with read(char[], int, int)
        return (pos < count) ? buf[pos++] : -1;
    }

    /**
     * Copies up to <code>len</code> chars into <code>b</code>, starting at
     * <code>b[off]</code>. The number of chars copied is the smaller of
     * <code>len</code> and <code>count - pos</code>. This method never blocks.
     *
     * @param b   destination array
     * @param off start offset in <code>b</code>
     * @param len maximum number of chars to copy
     * @return the number of chars copied, or <code>-1</code> if the end of the
     *         buffer had already been reached
     * @throws NullPointerException      if <code>b</code> is null
     * @throws IndexOutOfBoundsException if <code>off</code> and <code>len</code>
     *                                   do not describe a range inside <code>b</code>
     */
    public int read(char b[], int off, int len)
    {
        if (b == null)
        {
            throw new NullPointerException();
        }
        else if ((off < 0) || (off > b.length) || (len < 0) ||
                 ((off + len) > b.length) || ((off + len) < 0))
        {
            throw new IndexOutOfBoundsException();
        }
        if (pos >= count)
        {
            return -1;
        }
        if (pos + len > count)
        {
            len = count - pos;
        }
        if (len <= 0)
        {
            return 0;
        }
        System.arraycopy(buf, pos, b, off, len);
        pos += len;
        return len;
    }

    /**
     * Skips up to <code>n</code> chars; fewer are skipped when the end of the
     * buffer is reached first.
     *
     * @param n number of chars to skip
     * @return the number of chars actually skipped; 0 if <code>n</code> is negative
     */
    public long skip(long n)
    {
        if (pos + n > count)
        {
            n = count - pos;
        }
        if (n < 0)
        {
            return 0;
        }
        pos += n;
        return n;
    }

    /**
     * Returns the number of chars that remain to be read, <code>count - pos</code>.
     *
     * @return the number of remaining chars
     */
    public int available()
    {
        return count - pos;
    }

    /**
     * Tells that mark/reset is supported.
     *
     * @return always <code>true</code>
     */
    public boolean markSupported()
    {
        return true;
    }

    /**
     * Marks the current position; {@link #reset()} returns to it.
     *
     * @param readAheadLimit ignored, the whole buffer stays available
     */
    public void mark(int readAheadLimit)
    {
        mark = pos;
    }

    /**
     * Moves the read position back to the marked position, which is the start
     * of the readable section unless {@link #mark(int)} was called.
     */
    public void reset()
    {
        pos = mark;
    }

    /**
     * Flags the reader as closed. No resources are held, and later reads are
     * not rejected.
     */
    public void close() throws IOException
    {
        isClosed = true;
    }

    /** Check to make sure that the stream has not been closed */
    private void ensureOpen()
    {
        /* This method does nothing for now. Once we add throws clauses
         * to the I/O methods in this class, it will throw an IOException
         * if the stream has been closed.
         */
    }
}
@ -0,0 +1,112 @@
package de.lanlab.larm.util;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.io.*;
import java.util.*;
import java.text.*;
* this class is only used for SPEED. Its log function is not thread safe by
* default.
* It uses a BufferedWriter.
* It registers with a logger manager, which can be used to flush several loggers
* at once
* @todo: including the date slows down a lot
// Logger that writes to logs/<name>.log through a BufferedWriter. log() is
// unsynchronized; the logThreadSafe variants are declared synchronized.
// NOTE(review): several method bodies are only partly visible in this listing.
public class SimpleLogger
// NOTE(review): SimpleDateFormat is not thread-safe; this instance is shared by all callers of this logger
private SimpleDateFormat formatter = new SimpleDateFormat ("HH:mm:ss:SSSS");
Writer logFile;
StringBuffer buffer = new StringBuffer(1000);
// reference point for the elapsed-milliseconds column; defaults to construction time
long startTime = System.currentTimeMillis();
boolean includeDate;
public void setStartTime(long startTime)
this.startTime = startTime;
public synchronized void logThreadSafe(String text)
public synchronized void logThreadSafe(Throwable t)
// Builds the line prefix "<time of day>: <ms since startTime> ms: " in the shared
// buffer; an IOException is reported on standard output and not rethrown.
public void log(String text)
buffer.append(formatter.format(new Date())).append(": ").append(System.currentTimeMillis()-startTime).append(" ms: ");
catch(IOException e)
System.out.println("Couldn't write to logfile");
// writes the stack trace of t to the log file
public void log(Throwable t)
t.printStackTrace(new PrintWriter(logFile));
boolean flushAtOnce = false;
public void setFlushAtOnce(boolean flush)
this.flushAtOnce = flush;
// same as SimpleLogger(name, true)
public SimpleLogger(String name)
init(name, true);
public SimpleLogger(String name, boolean includeDate)
init(name, includeDate);
public void flush() throws IOException
// Opens logs/<name>.log for writing; the "logs" directory is relative to the
// working directory. An IOException is reported on standard output.
private void init(String name, boolean includeDate)
logFile = new BufferedWriter(new FileWriter("logs/" + name + ".log"));
catch(IOException e)
System.out.println("IOException while creating logfile " + name + ":");
@ -0,0 +1,65 @@
package de.lanlab.larm.util;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.util.*;
import java.io.IOException;
* this singleton manages all loggers. It can be used to flush all SimpleLoggers
* at once
// Lazily created singleton that keeps a list of SimpleLogger instances.
// NOTE(review): parts of register() and flush() are not visible in this listing.
public class SimpleLoggerManager
static SimpleLoggerManager instance = null;
ArrayList logs;
// private: instances are only obtained through getInstance()
private SimpleLoggerManager()
logs = new ArrayList();
public void register(SimpleLogger logger)
// Walks over the registered loggers; an IOException caught on the way is remembered
// in ex and thrown at the end (a later exception replaces an earlier one).
public void flush() throws IOException
Iterator it = logs.iterator();
IOException ex = null;
SimpleLogger logger = (SimpleLogger)it.next();
catch(IOException e)
ex = e;
if(ex != null)
throw ex;
// Creates the instance on first use.
// NOTE(review): the null check and the assignment are not synchronized.
public static SimpleLoggerManager getInstance()
if(instance == null)
instance = new SimpleLoggerManager();
return instance;
@ -0,0 +1,21 @@
* Title: LARM Lanlab Retrieval Machine<p>
* Description: <p>
* Copyright: Copyright (c) <p>
* Company: <p>
* @author
* @version 1.0
package de.lanlab.larm.util;
import java.util.Observable;
// Observable subclass that re-declares setChanged() as public (it is protected in
// java.util.Observable), so code outside the class can call it.
public class SimpleObservable extends Observable
public void setChanged()
@ -0,0 +1,91 @@
package de.lanlab.larm.util;
import java.io.Serializable;
* Title: LARM Lanlab Retrieval Machine
* Description:
* Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
* thread safe state information.
* The get methods are not synchronized. Clone the state object before using them
* If you use a state object in a class, always return a clone
* <pre>public class MyClass {
* State state = new State("Running");
* public State getState() { return state.cloneState() }</pre>
* note on serialization: if you deserialize a state, the state string will be newly created.
* that means you then have to compare the states via equals() and not ==
// Holder for a state name, the time the state was entered and an optional info
// object. setState(String, Object) and clone() are synchronized, the getters are not.
public class State implements Cloneable, Serializable
private String state;
// time the state was set, System.currentTimeMillis()
private long stateSince;
// optional payload; null when set through setState(String)
private Object info;
public State(String state)
private State(String state, long stateSince)
init(state, stateSince, null);
private State(String state, long stateSince, Object info)
init(state, stateSince, info);
// common field initialisation used by the private constructors
private void init(String state, long stateSince, Object info)
this.state = state;
this.stateSince = stateSince;
this.info = info;
// same as setState(state, null)
public void setState(String state)
setState(state, null);
// replaces state and info and stamps the change with the current time
public synchronized void setState(String state, Object info)
this.state = state;
this.stateSince = System.currentTimeMillis();
this.info = info;
public String getState()
return state;
public long getStateSince()
return stateSince;
public Object getInfo()
return info;
// Returns a new State with the same three field values; the info object itself is
// shared, not copied. Object.clone() is not used.
public synchronized Object clone()
return new State(state, stateSince, info);
// typed convenience wrapper around clone()
public State cloneState()
return (State)clone();
@ -0,0 +1,60 @@
package de.lanlab.larm.util;
* Title: LARM Lanlab Retrieval Machine Description: Copyright: Copyright (c)
* Company:
* @author
* @version 1.0
import java.net.URL;
* Description of the Class
* @author Administrator
* @created 27. Januar 2002
// URL helper functions.
public class URLUtils
* does the same as URL.toExternalForm(), but leaves out the Ref part (which we would
* cut off anyway) and handles the String Buffer so that no call of expandCapacity() will
* be necessary
* only meaningful if the default URLStreamHandler is used (as is the case with http, https, or shttp)
* @param u the URL to be converted
* @return the URL as String
// NOTE(review): the statements that append to the buffer are not visible in this
// listing - confirm against the complete source.
public static String toExternalFormNoRef(URL u)
String protocol = u.getProtocol();
String authority = u.getAuthority();
String file = u.getFile();
// pre-size the buffer: lengths of the three parts plus 3 extra characters
StringBuffer result = new StringBuffer(
(protocol == null ? 0 : protocol.length()) +
(authority == null ? 0 : authority.length()) +
(file == null ? 1 : file.length()) + 3
// authority and file are only handled when present and non-empty
if (u.getAuthority() != null && u.getAuthority().length() > 0)
if (u.getFile() != null && u.getFile().length() > 0)
return result.toString();
@ -0,0 +1,15 @@
package de.lanlab.larm.util;
* Title: LARM
* Description:
* Copyright: Copyright (c) 2001
* Company: LMU-IP
* @author Clemens Marschner
* @version 1.0
// Unchecked exception type; the CachingQueue tests in this code base expect it to
// be thrown by the queue and catch it.
public class UnderflowException extends RuntimeException
@ -0,0 +1,94 @@
package de.lanlab.larm.util;
import java.net.URL;
import de.lanlab.larm.fetcher.URLMessage;
* a web document of whatever type. generated by a fetcher task
public class WebDocument extends URLMessage
protected String mimeType;
protected byte[] document;
protected int resultCode;
protected int size;
protected String title;
public WebDocument(URL url, String mimeType, byte[] document, int resultCode, URL referer, int size, String title)
super(url, referer, false);
this.url = url;
this.mimeType = mimeType;
this.document = document;
this.resultCode = resultCode;
this.size = size;
this.title = title;
public String getTitle()
return title;
public URL getUrl()
return url;
public int getSize()
return this.size;
public void setSize(int size)
this.size = size;
public void setDocument(byte[] document)
this.document = document;
public int getResultCode()
return resultCode;
public void setResultCode(int resultCode)
this.resultCode = resultCode;
public byte[] getDocumentBytes()
return this.document;
public void setUrl(URL url)
this.url = url;
public void setMimeType(String mimeType)
this.mimeType = mimeType;
public String getMimeType()
return mimeType;
public String getInfo()
return super.getInfo() + "\t" +
this.resultCode + "\t" +
this.mimeType + "\t" +
this.size + "\t" +
"\"" + this.title.replace('\"', (char)0xff ).replace('\n',' ').replace('\r',' ') + "\"";
@ -0,0 +1,294 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.misc;
import java.io.*;
import java.net.*;
* This class is a container for algorithms working on byte arrays - some
* of the algorithms are analogous to those in java.lang.String.
* @author Anders Kristensen
public class ByteArray {
/** Returns copy of characters in s as a new byte array. */
public static final byte[] getBytes(String s) {
int len = s.length();
byte b[] = new byte[len];
s.getBytes(0, len, b, 0);
return b;
/**
 * Returns contents of file as byte array.
 * Convenience overload: wraps <i>filename</i> in a File and delegates to
 * loadFromFile(File).
 */
public static byte[] loadFromFile(String filename) throws IOException {
return loadFromFile(new File(filename));
/** Returns contents of file <i>file</i> as byte array. */
public static byte[] loadFromFile(File file) throws IOException {
int n, nread = 0, len = (int) file.length();
FileInputStream fin = new FileInputStream(file);
byte[] content = new byte[len];
while (nread < len) {
if ((n = fin.read(content, nread, len - nread)) == -1)
throw new IOException("Error loading Compound from file");
nread += n;
return content;
* Reads n bytes from the specified input stream. It will return
* fewer bytes if fewer bytes are available on the stream.
* Hence the application should check the resulting arrays length.
public static byte[] readn(InputStream in, int n) throws IOException {
byte[] buf = new byte[n];
int ntotal = 0;
int nread;
while (ntotal < n) {
nread = in.read(buf, ntotal, n - ntotal);
if (nread < 0) {
// we got less than expected - return what we got
byte[] newbuf = new byte[ntotal];
System.arraycopy(buf, 0, newbuf, 0, ntotal);
return newbuf;
ntotal += nread;
return buf;
* Return contents of a WWW resource identified by a URL.
* @param url the resource to retrieve
* @return the resource contents as a byte array
public static byte[] getContent(URL url) throws IOException {
URLConnection conn = url.openConnection();
InputStream in = conn.getInputStream();
int length;
* N.B. URLConnection.getContentLength() is buggy for "http" resources
* (at least in JDK1.0.2) and won't work for "file" URLs either.
length = length = conn.getContentLength();
if (length == -1)
length = conn.getHeaderFieldInt("Content-Length", -1);
if (length == -1)
return readAll(in);
return readn(in, length);
* Read all input from an InputStream and return as a byte array.
* This method will not return before the end of the stream is reached.
* @return contents of the stream
public static byte[] readAll(InputStream in) throws IOException {
byte[] buf = new byte[1024];
int nread, ntotal = 0;
while ((nread = in.read(buf, ntotal, buf.length - ntotal)) > -1) {
ntotal += nread;
if (ntotal == buf.length) {
// extend buffer
byte[] newbuf = new byte[buf.length * 2];
System.arraycopy(buf, 0, newbuf, 0, buf.length);
buf = newbuf;
if (ntotal < buf.length) {
// we cannot have excess space
byte[] newbuf = new byte[ntotal];
System.arraycopy(buf, 0, newbuf, 0, ntotal);
buf = newbuf;
return buf;
* Copies data from the specified input stream to the output stream
* until end of file is met.
* @return the total number of bytes written to the output stream
public static int cpybytes(InputStream in, OutputStream out)
throws IOException
byte[] buf = new byte[1024];
int n, ntotal = 0;
while ((n = in.read(buf)) > -1) {
out.write(buf, 0, n);
ntotal += n;
return ntotal;
* Copies data from the specified input stream to the output stream
* until <em>n</em> bytes has been copied or end of file is met.
* @return the total number of bytes written to the output stream
public static int cpybytes(InputStream in, OutputStream out, int n)
throws IOException
int sz = n < 1024 ? n : 1024;
byte[] buf = new byte[sz];
int chunk, nread, ntotal = 0;
chunk = sz;
while (ntotal < n && (nread = in.read(buf, 0, chunk)) > -1) {
out.write(buf, 0, nread);
ntotal += nread;
chunk = (n - ntotal < sz) ? n - ntotal : sz;
return ntotal;
* Returns the index within this String of the first occurrence of the
* specified character or -1 if the character is not found.
* @params buf the buffer to search
* @params ch the character to search for
public static final int indexOf(byte[] buf,
int ch) {
return indexOf(buf, ch, 0, buf.length);
* Returns the index within this String of the first occurrence of the
* specified character, starting the search at fromIndex. This method
* returns -1 if the character is not found.
* @params buf the buffer to search
* @params ch the character to search for
* @params fromIndex the index to start the search from
* @params toIndex the highest possible index returned plus 1
public static final int indexOf(byte[] buf,
int ch,
int fromIndex,
int toIndex) {
int i;
for (i = fromIndex; i < toIndex && buf[i] != ch; i++)
; // do nothing
if (i < toIndex)
return i;
return -1;
* Returns the index of the first occurrence of s in the specified
* buffer or -1 if this is not found.
public static final int indexOf(byte[] buf, String s) {
return indexOf(buf, s, 0);
* Returns the index of the first occurrence of s in the specified
* buffer. The search starts from fromIndex. This method returns -1
* if the index is not found.
public static final int indexOf(byte[] buf, String s, int fromIndex) {
int i; // index into buf
int j; // index into s
int max_i = buf.length;
int max_j = s.length();
for (i = fromIndex; i + max_j <= max_i; i++) {
for (j = 0; j < max_j; j++) {
if (buf[j + i] != s.charAt(j))
if (j == max_j) return i;
return -1;
// for testing indexOf(byte[], String, int)
public static void main(String[] args) {
byte[] buf = getBytes(args[0]);
System.out.println("IndexOf(arg0, arg1, 0) = " + indexOf(buf, args[1], 3));
public static final boolean isSpace(int ch) {
if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;
else return false;
public static final int skipSpaces(byte[] buf, int fromIndex, int toIndex) {
int i;
for (i = fromIndex; i < toIndex && isSpace(buf[i]); i++)
return i;
* Find byte pattern ptrn in buffer buf.
* @return index of first occurrence of ptrn in buf, -1 if no occurence
public static final int findBytes(byte buf[],
int off,
int len,
byte ptrn[]) {
// Note: This code is completely incomprehensible without a drawing...
int buf_len = off + len;
int ptrn_len = ptrn.length;
int i; // index into buf
int j; // index into ptrn;
byte b = ptrn[0]; // next byte of interest
for (i = off; i < buf_len; ) {
j = 0;
while (i < buf_len && j < ptrn_len && buf[i] == ptrn[j]) {
if (i == buf_len || j == ptrn_len)
return i - j;
else {
// We have to go back a bit as there may be an overlapping
// match starting a bit later in buf...
i = i - j + 1;
return -1;
// for testing findBytes(byte[], int, int, byte[])
public static void main(String args[]) {
if (args.length < 4) {
System.err.println("Usage: s1 off len s2");
byte b1[] = new byte[args[0].length()];
byte b2[] = new byte[args[3].length()];
args[0].getBytes(0, args[0].length(), b1, 0);
args[3].getBytes(0, args[3].length(), b2, 0);
int off = Integer.parseInt(args[1]);
int len = Integer.parseInt(args[2]);
System.out.println("Index = " + findBytes(b1, off, len, b2));
@ -0,0 +1,20 @@
* $Id$
package hplb.org.w3c.dom;
public interface Attribute {
public String getName();
public Node getValue();
public void setValue(Node arg);
public boolean getSpecified();
public void setSpecified(boolean arg);
public String toString();
@ -0,0 +1,16 @@
* $Id$
package hplb.org.w3c.dom;
public interface AttributeList {
public Attribute getAttribute(String attrName);
public Attribute setAttribute(Attribute attr);
public Attribute remove(String attrName);
public Attribute item(int index);
public int getLength();
@ -0,0 +1,13 @@
* $Id$
package hplb.org.w3c.dom;
* Represents the content of comments: <!-- ... -->
public interface Comment extends Node {
public String getData();
public void setData(String arg);
@ -0,0 +1,13 @@
* $Id$
package hplb.org.w3c.dom;
public interface DOM {
public Document createDocument(String type);
public boolean hasFeature(String feature);
@ -0,0 +1,28 @@
* $Id$
package hplb.org.w3c.dom;
public interface Document extends DocumentFragment {
public Node getDocumentType();
public void setDocumentType(Node arg);
public Element getDocumentElement();
public void setDocumentElement(Element arg);
public DocumentContext getContextInfo();
public void setContextInfo(DocumentContext arg);
public DocumentContext createDocumentContext();
public Element createElement(String tagName, AttributeList attributes);
public Text createTextNode(String data);
public Comment createComment(String data);
public PI createPI(String name, String data);
public Attribute createAttribute(String name, Node value);
public AttributeList createAttributeList();
public NodeIterator getElementsByTagName();
@ -0,0 +1,14 @@
* $Id$
package hplb.org.w3c.dom;
public interface DocumentContext {
public Document getDocument();
public void setDocument(Document arg);
@ -0,0 +1,13 @@
* $Id$
package hplb.org.w3c.dom;
public interface DocumentFragment extends Node {
public Document getMasterDoc();
public void setMasterDoc(Document arg);
@ -0,0 +1,16 @@
* $Id$
package hplb.org.w3c.dom;
public interface Element extends Node {
public String getTagName();
public AttributeList attributes();
public void setAttribute(Attribute newAttr);
public void normalize();
public NodeIterator getElementsByTagName();
@ -0,0 +1,38 @@
# This Makefile generated by hplb.util.jmkmf
# Java package is org.w3c.dom
.SUFFIXES: .java .class .jj

JPACKAGE = org.w3c.dom
JAVA     = java
JAVAC    = javac
JAVACC   = java COM.sun.labs.javacc.Main

# One class file per source file in this package.  The last entry must
# not end in a backslash, otherwise the next assignment is swallowed
# into OBJS.
OBJS = \
	Attribute.class \
	AttributeList.class \
	Comment.class \
	DOM.class \
	Document.class \
	DocumentContext.class \
	DocumentFragment.class \
	Element.class \
	Node.class \
	NodeIterator.class \
	PI.class \
	Text.class \
	TreeIterator.class

JAVADOCFLAGS = -d ../../../doc/api -author -noindex -notree

.PHONY: all clean

all: $(OBJS)

# Suffix rules must not list prerequisites: make would then treat
# ".jj.java" / ".java.class" as ordinary file names, not as rules.

# Generate a parser source file from a JavaCC grammar.
.jj.java:
	$(JAVACC) $<

# Compile a single source file.
.java.class:
	$(JAVAC) $<

# Remove compiler output and editor backup files.
clean:
	rm -f *.class *~
@ -0,0 +1,29 @@
* $Id$
package hplb.org.w3c.dom;
public interface Node {
// NodeType
public static final int DOCUMENT = 1;
public static final int ELEMENT = 2;
public static final int ATTRIBUTE = 3;
public static final int PI = 4;
public static final int COMMENT = 5;
public static final int TEXT = 6;
public int getNodeType();
public Node getParentNode();
public NodeIterator getChildNodes();
public boolean hasChildNodes();
public Node getFirstChild();
public Node getPreviousSibling();
public Node getNextSibling();
public Node insertBefore(Node newChild, Node refChild);
public Node replaceChild(Node newChild, Node oldChild);
public Node removeChild(Node oldChild);
@ -0,0 +1,19 @@
* $Id$
package hplb.org.w3c.dom;
public interface NodeIterator {
public int getLength();
public Node getCurrent();
public Node toNext();
public Node toPrevious();
public Node toFirst();
public Node toLast();
public Node toNth(int Nth);
public Node toNode(Node destNode);
@ -0,0 +1,16 @@
* $Id$
package hplb.org.w3c.dom;
* Processing Instruction
public interface PI extends Node {
public String getName();
public void setName(String arg);
public String getData();
public void setData(String arg);
@ -0,0 +1,19 @@
* $Id$
package hplb.org.w3c.dom;
public interface Text extends Node {
public String getData();
public void setData(String arg);
public void append(String data);
public void insert(int offset, String data);
public void delete(int offset, int count);
public void replace(int offset, int count, String data);
public void splice(Element element, int offset, int count);
@ -0,0 +1,20 @@
* $Id$
package hplb.org.w3c.dom;
public interface TreeIterator extends NodeIterator {
public int numChildren();
public int numPreviousSiblings();
public int numNextSiblings();
public Node toParent();
public Node toPreviousSibling();
public Node toNextSibling();
public Node toFirstChild();
public Node toLastChild();
public Node toNthChild();
@ -0,0 +1,146 @@
// $Id$
package hplb.org.xml.sax;
import java.util.Enumeration;
* A map of attributes for the current element.
* <p><em>This interface is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>This map will be valid only during the invocation of the
* <code>startElement</code> callback: if you need to use attribute
* information elsewhere, you will need to make your own copies.</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.DocumentHandler#startElement
public interface AttributeMap {

    /*
     * Every method below applies to the current element and can be
     * called only during an invocation of startElement.
     */

    /**
     * Find the names of all available attributes for an element.
     *
     * @return An enumeration of zero or more Strings.
     * @see java.util.Enumeration
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    Enumeration getAttributeNames();

    /**
     * Get the value of an attribute as a String.
     *
     * @return The value as a String, or null if the attribute has no value.
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    String getValue(String attributeName);

    /**
     * Check if an attribute value is the name of an entity.
     *
     * @return true if the attribute is an entity name.
     * @see #getEntityPublicID
     * @see #getEntitySystemID
     * @see #getNotationName
     * @see #getNotationPublicID
     * @see #getNotationSystemID
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    boolean isEntity(String aname);

    /**
     * Check if an attribute value is the name of a notation.
     *
     * @return true if the attribute is a notation name.
     * @see #getNotationPublicID
     * @see #getNotationSystemID
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    boolean isNotation(String aname);

    /**
     * Check if an attribute value is a unique identifier.
     *
     * @return true if the attribute is a unique identifier.
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    boolean isId(String aname);

    /**
     * Check if an attribute value is a reference to an ID.
     *
     * @return true if the attribute is a reference to an ID.
     * @see hplb.org.xml.sax.DocumentHandler#startElement
     */
    boolean isIdref(String aname);

    /**
     * Get the public identifier for an ENTITY attribute.
     *
     * @return The public identifier or null if there is none (or if
     *         the attribute value is not an entity name)
     * @see #isEntity
     */
    String getEntityPublicID(String aname);

    /**
     * Get the system identifier for an ENTITY attribute.
     *
     * @return The system identifier or null if there is none (or if
     *         the attribute value is not an entity name)
     * @see #isEntity
     */
    String getEntitySystemID(String aname);

    /**
     * Get the notation name for an ENTITY attribute.
     *
     * @return The notation name or null if there is none (or if
     *         the attribute value is not an entity name)
     * @see #isEntity
     */
    String getNotationName(String aname);

    /**
     * Get the notation public ID for an ENTITY or NOTATION attribute.
     *
     * @return The public identifier or null if there is none (or if
     *         the attribute value is not an entity or notation name)
     * @see #isEntity
     * @see #isNotation
     */
    String getNotationPublicID(String aname);

    /**
     * Get the notation system ID for an ENTITY or NOTATION attribute.
     *
     * @return The system identifier or null if there is none (or if
     *         the attribute value is not an entity or notation name)
     * @see #isEntity
     * @see #isNotation
     */
    String getNotationSystemID(String aname);
}
@ -0,0 +1,129 @@
// $Id$
package hplb.org.xml.sax;
* A callback interface for basic XML document events.
* <p><em>This interface is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>This is the main handler for basic document events; it provides
* information on roughly the same level as the ESIS in full SGML,
* concentrating on logical structure rather than lexical
* representation.</p>
* <p>If you do not set a document handler, then by default all of these
* events will simply be ignored.</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.Parser#setDocumentHandler
public interface DocumentHandler {
* Handle the start of a document.
* <p>This is the first event called by a
* SAX-conformant parser, so you can use it to allocate and
* initialise new objects for the document.</p>
* @exception java.lang.Exception You may throw any exception.
public void startDocument ()
throws Exception;
* Handle the end of a document.
* <p>This is the last event called by a
* SAX-conformant parser, so you can use it to finalize and
* clean up objects for the document.</p>
* @exception java.lang.Exception You may throw any exception.
public void endDocument ()
throws Exception;
* Handle the document type declaration.
* <p>This will appear only if the XML document contains a
* <code>DOCTYPE</code> declaration.</p>
* @param name The document type name.
* @param publicID The public identifier of the external DTD subset
* (if any), or null.
* @param systemID The system identifier of the external DTD subset
* (if any), or null.
* @param name The document type name.
* @exception java.lang.Exception You may throw any exception.
public void doctype (String name, String publicID, String systemID)
throws Exception;
* Handle the start of an element.
* <p>Please note that the information in the <code>attributes</code>
* parameter will be accurate only for the duration of this handler:
* if you need to use the information elsewhere, you should copy
* it.</p>
* @param name The element type name.
* @param attributes The available attributes.
* @exception java.lang.Exception You may throw any exception.
public void startElement (String name, AttributeMap attributes)
throws Exception;
* Handle the end of an element.
* @exception java.lang.Exception You may throw any exception.
public void endElement (String name)
throws Exception;
* Handle significant character data.
* <p>Please note that the contents of the array will be
* accurate only for the duration of this handler: if you need to
* use them elsewhere, you should make your own copy, possible
* by constructing a string:</p>
* <pre>
* String data = new String(ch, start, length);
* </pre>
* @param ch An array of characters.
* @param start The starting position in the array.
* @param length The number of characters to use in the array.
* @exception java.lang.Exception You may throw any exception.
public void characters (char ch[], int start, int length)
throws Exception;
* Handle ignorable whitespace.
* <p>Please note that the contents of the array will be
* accurate only for the duration of this handler: if you need to
* use them elsewhere, you should make your own copy, possible
* by constructing a string:</p>
* <pre>
* String whitespace = new String(ch, start, length);
* </pre>
* @param ch An array of whitespace characters.
* @param start The starting position in the array.
* @param length The number of characters to use in the array.
* @exception java.lang.Exception You may throw any exception.
public void ignorable (char ch[], int start, int length)
throws Exception;
* Handle a processing instruction.
* <p>XML processing instructions have two parts: a target, which
* is a name, followed optionally by data.</p>
* @exception java.lang.Exception You may throw any exception.
public void processingInstruction (String name, String remainder)
throws Exception;
@ -0,0 +1,48 @@
// $Id$
package hplb.org.xml.sax;
* A callback interface for basic XML entity-related events.
* <p><em>This interface is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>If you do not set an entity handler, then a parser will
* resolve all entities to the suggested system ID, and will take no
* action for entity changes.</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.Parser#setEntityHandler
public interface EntityHandler {

    /**
     * Resolve a system identifier.
     *
     * <p>Before loading any entity (including the document entity),
     * SAX parsers will filter the system identifier through this
     * callback, and you can return a different system identifier if you
     * wish, or null to prevent the parser from reading any entity.</p>
     *
     * @param ename The name of the entity, "[document]" for the
     *        document entity, or "[external DTD]" for the external
     *        DTD subset.
     * @param publicID The public identifier, or null if there is none.
     * @param systemID The system identifier suggested in the XML document.
     * @return A system identifier, or null to skip the entity.
     * @exception java.lang.Exception You may throw any exception.
     */
    String resolveEntity(String ename, String publicID, String systemID)
        throws Exception;

    /**
     * Handle a change in the current entity.
     *
     * <p>Whenever the parser switches the entity (URI) that it is reading
     * from, it will call this handler to report the change.</p>
     *
     * @param systemID The URI of the new entity.
     * @exception java.lang.Exception You may throw any exception.
     */
    void changeEntity(String systemID) throws Exception;
}
@ -0,0 +1,52 @@
// $Id$
package hplb.org.xml.sax;
* A callback interface for basic XML error events.
* <p><em>This interface is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>If you do not set an error handler, then a parser will report
* warnings to <code>System.err</code>, and will throw an (unspecified)
* exception for fatal errors.</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.Parser#setErrorHandler
public interface ErrorHandler {

    /**
     * Handle a non-fatal warning.
     *
     * <p>A SAX parser will use this callback to report a condition
     * that is not serious enough to stop the parse (though you may
     * still stop the parse if you wish).</p>
     *
     * @param message The warning message.
     * @param systemID The URI of the entity that caused the warning, or
     *        null if not available.
     * @param line The line number in the entity, or -1 if not available.
     * @param column The column number in the entity, or -1 if not available.
     * @exception java.lang.Exception You may throw any exception.
     */
    void warning(String message, String systemID, int line, int column)
        throws Exception;

    /**
     * Handle a fatal error.
     *
     * <p>A SAX parser will use this callback to report a condition
     * that is serious enough to invalidate the parse, and may not
     * report all (or any) significant parse events after this. Ordinarily,
     * you should stop immediately with an exception, but you can continue
     * to try to collect more errors if you wish.</p>
     *
     * @param message The error message.
     * @param systemID The URI of the entity that caused the error, or
     *        null if not available.
     * @param line The line number in the entity, or -1 if not available.
     * @param column The column number in the entity, or -1 if not available.
     * @exception java.lang.Exception You may throw any exception.
     */
    void fatal(String message, String systemID, int line, int column)
        throws Exception;
}
@ -0,0 +1,201 @@
// $Id$
package hplb.org.xml.sax;
* A simple base class for deriving SAX event handlers.
* <p><em>This class is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>This class implements the default behaviour when no handler
* is specified (though parsers are not actually required to use
* this class).</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.XmlException
* @see hplb.org.xml.sax.EntityHandler
* @see hplb.org.xml.sax.DocumentHandler
* @see hplb.org.xml.sax.ErrorHandler
public class HandlerBase
implements EntityHandler, DocumentHandler, ErrorHandler
// Implementation of hplb.org.xml.sax.EntityHandler.
* Resolve an external entity.
* <p>By default, simply return the system ID supplied.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.EntityHandler#resolveEntity
public String resolveEntity (String ename, String publicID, String systemID)
throws Exception
return systemID;
* Handle an entity-change event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.EntityHandler#changeEntity
public void changeEntity (String systemID)
throws Exception
// Implementation of hplb.org.xml.sax.DocumentHandler.
* Handle a start document event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#startDocument
public void startDocument ()
throws Exception
* Handle a end document event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#endDocument
public void endDocument ()
throws Exception
* Handle a document type declaration event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#doctype
public void doctype (String name, String publicID, String systemID)
throws Exception
* Handle a start element event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#startElement
public void startElement (String name, AttributeMap attributes)
throws Exception
* Handle an end element event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#endElement
public void endElement (String name)
throws Exception
* Handle a character data event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#characters
public void characters (char ch[], int start, int length)
throws Exception
* Handle an ignorable whitespace event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#ignorable
public void ignorable (char ch[], int start, int length)
throws Exception
* Handle a processing instruction event.
* <p>By default, do nothing.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.DocumentHandler#processingInstruction
public void processingInstruction (String name, String remainder)
throws Exception
// Implementation of ErrorHandler.
* Handle a non-fatal error.
* <p>By default, report the warning to System.err.</p>
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.ErrorHandler#warning
public void warning (String message, String systemID, int line, int column)
throws Exception
System.err.println("Warning (" +
systemID +
',' +
line +
',' +
column +
"): " +
* Handle a fatal error.
* <p>By default, throw an instance of XmlException.</p>
* @exception hplb.org.xml.sax.XmlException A fatal parsing error
* has been found.
* @exception java.lang.Exception When you override this method,
* you may throw any exception.
* @see hplb.org.xml.sax.ErrorHandler#fatal
public void fatal (String message, String systemID, int line, int column)
throws XmlException, Exception
throw new XmlException(message, systemID, line, column);
@ -0,0 +1,32 @@
# This Makefile generated by jmkmf
# Java package is org.xml.sax
.SUFFIXES: .java .class .jj

JPACKAGE = org.xml.sax
JAVA     = java
JAVAC    = javac
JAVACC   = java COM.sun.labs.javacc.Main

# One class file per source file in this package.  The last entry must
# not end in a backslash, otherwise the next assignment is swallowed
# into OBJS.
OBJS = \
	AttributeMap.class \
	DocumentHandler.class \
	EntityHandler.class \
	ErrorHandler.class \
	HandlerBase.class \
	Parser.class \
	XmlException.class

JAVADOCFLAGS = -d ../../../doc/api -author -noindex -notree

.PHONY: all clean

all: $(OBJS)

# Suffix rules must not list prerequisites: make would then treat
# ".jj.java" / ".java.class" as ordinary file names, not as rules.

# Generate a parser source file from a JavaCC grammar.
.jj.java:
	$(JAVACC) $<

# Compile a single source file.
.java.class:
	$(JAVAC) $<

# Remove compiler output and editor backup files.
clean:
	rm -f *.class *~
@ -0,0 +1,71 @@
// $Id$
package hplb.org.xml.sax;
* A standard interface for event-driven XML parsers.
* <p><em>This interface is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>All SAX-conformant XML parsers (or their front-end SAX drivers)
* <em>must</em> implement this interface, together with a zero-argument
* constructor.</p>
* <p>You can plug three different kinds of callback interfaces into
* a basic SAX parser: one for entity handling, one for basic document
* events, and one for error reporting. It is not an error to start
* a parse without setting any handlers.</p>
* @author David Megginson, Microstar Software Ltd.
public interface Parser {
* Register the handler for basic entity events.
* <p>If you begin a parse without setting an entity handler,
* the parser will by default resolve all entities to their
* default system IDs.</p>
* @param handler An object to receive callbacks for events.
* @see hplb.org.xml.sax.EntityHandler
public void setEntityHandler (EntityHandler handler);
* Register the handler for basic document events.
* <p>You may begin the parse without setting a handler, but
* in that case no document events will be reported.</p>
* @param handler An object to receive callbacks for events.
* @see hplb.org.xml.sax.DocumentHandler
public void setDocumentHandler (DocumentHandler handler);
* Register the handler for errors and warnings.
* <p>If you begin a parse without setting an error handlers,
* warnings will be printed to System.err, and errors will
* throw an unspecified exception.</p>
* @param handler An object to receive callbacks for errors.
* @see hplb.org.xml.sax.ErrorHandler
public void setErrorHandler (ErrorHandler handler);
* Parse an XML document.
* <p>Nothing exciting will happen unless you have set handlers.</p>
* @param publicID The public identifier for the document, or null
* if none is available.
* @param systemID The system identifier (URI) for the document.
* @exception java.lang.Exception This method may throw any exception,
* but the parser itself
* will throw only exceptions derived from java.io.IOException;
* anything else will come from your handlers.
* @see #setEntityHandler
* @see #setDocumentHandler
* @see #setErrorHandler
void parse (String publicID, String systemID) throws java.lang.Exception;
@ -0,0 +1,73 @@
// $Id$
package hplb.org.xml.sax;
* An exception for reporting XML parsing errors.
* <p><em>This class is part of the Java implementation of SAX,
* the Simple API for XML. It is free for both commercial and
* non-commercial use, and is distributed with no warrantee, real
* or implied.</em></p>
* <p>This exception is not a required part of SAX, and it is not
* referenced in any of the core interfaces. It is used only in
* the optional HandlerBase base class, as a means of signalling
* parsing errors.</p>
* @author David Megginson, Microstar Software Ltd.
* @see hplb.org.xml.sax.HandlerBase#fatal
public class XmlException extends Exception {

    /**
     * Construct a new exception with information about the location.
     *
     * @param message  The error message, available through
     *                 <code>getMessage()</code>.
     * @param systemID The system identifier (URI) where the error
     *                 occurred, or null if none is available.
     * @param line     The line number, or -1 if none is available.
     * @param column   The column number, or -1 if none is available.
     */
    public XmlException (String message, String systemID, int line, int column)
    {
        // Hand the message to Exception; otherwise getMessage() is null.
        super(message);
        this.systemID = systemID;
        this.line = line;
        this.column = column;
    }

    /**
     * Find the system identifier (URI) where the error occurred.
     *
     * @return A string representing the URI, or null if none is available.
     */
    public String getSystemID ()
    {
        return systemID;
    }

    /**
     * Find the line number where the error occurred.
     *
     * @return The line number, or -1 if none is available.
     */
    public int getLine ()
    {
        return line;
    }

    /**
     * Find the column number (line offset) where the error occurred.
     *
     * @return The column number, or -1 if none is available.
     */
    public int getColumn ()
    {
        return column;
    }

    //////////////////////////////////////////////////////////////////
    // Internal state (set once by the constructor).
    //////////////////////////////////////////////////////////////////

    private final String systemID;
    private final int line;
    private final int column;
}
Normal file
Normal file
@ -0,0 +1,41 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
import java.util.Hashtable;
* This class is responsible for maintaining strings as <em>atoms</em>,
* i.e. if two strings returned by getAtom() are equal in the sense of
* String.equals() then they are in fact the same Object. This is used to
* "intern" element and attribute names which can then be compared using
* the more efficient reference equality, a la "s1==s2".
* @author Anders Kristensen
public final class Atom {
/** Holds atoms: element names (GIs), and attribute names. */
private static final Hashtable atoms = new Hashtable();
* Return an atom corresponding to the argument.
public static String getAtom(String s) {
synchronized (atoms) {
String a = (String) atoms.get(s);
if (a == null) {
atoms.put(s, s);
a = s;
return a;
@ -0,0 +1,57 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
import hplb.org.w3c.dom.*;
* @author Anders Kristensen
public final class AttrImpl implements Attribute {
protected String name;
protected Node value;
protected boolean specified;
public AttrImpl(String name, String value) {
this(name, new TextImpl(Node.TEXT, value), true);
public AttrImpl(String name, Node value, boolean specified) {
this.name = name;
this.value = value;
this.specified = specified;
public String getName() {
return name;
public Node getValue() {
return value;
public void setValue(Node arg) {
value = arg;
public boolean getSpecified() {
return specified;
public void setSpecified(boolean arg) {
specified = arg;
public String toString() {
return value.toString();
@ -0,0 +1,183 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
import hplb.org.w3c.dom.*;
* An ordered Dictionary. keys() and elements() returns Enumerations
* which enumerate over elements in the order they were inserted.
* Elements are stored linearly. Operations put(), get(), and remove()
* are linear in the number of elements in the Dictionary.
* <p>Allows direct access to elements (as an alternative to using
* Enumerators) for speed.
* <p>Can function as a <em>bag</em>, i.e. it can be created with a mode
* which allows the same key to map to multiple entries. In this case
* operations get() and remove() operate on the <em>first</em> pair in
* the map. Hence to get hold of all values associated with a key it is
* necessary to use the direct access to underlying arrays.
* @author Anders Kristensen
public class AttrListImpl implements AttributeList {
protected Attribute[] elms;
* Number of elements. The elements are held at indices 0 to n in elms.
protected int n = 0;
public AttrListImpl() {
* Create an AttrListImpl with the specififed initial capacity.
public AttrListImpl(int size) {
if (size <= 0) throw new IllegalArgumentException(
"Initial size must be at least 1");
elms = new Attribute[size];
* Returns the value to which the key is mapped in this dictionary.
public synchronized Attribute getAttribute(String attrName) {
int i = getIndex(attrName);
return (i < 0 ? null : elms[i]);
protected int getIndex(String name) {
for (int i = 0; i < n; i++) {
if (elms[i].getName().equals(name)) {
return i;
return -1;
// XXX: what if attrName != attr.getName()???
public synchronized Attribute setAttribute(Attribute attr) {
int i = getIndex(attr.getName());
if (i >= 0) {
Attribute old = elms[i];
elms[i] = attr;
return old;
int len = elms.length;
if (len == n) {
// double size of key,elms arrays
AttrImpl[] e;
e = new AttrImpl[len * 2];
System.arraycopy(elms, 0, e, 0, len);
elms = e;
elms[n] = attr;
return null;
public synchronized Attribute remove(String attrName) {
int i = getIndex(attrName);
if (i < 0) return null;
Attribute val = elms[i];
System.arraycopy(elms, i+1, elms, i, n-i-1);
return val;
public synchronized Attribute item(int index) {
if (index < 0 || index >= n) {
throw new IndexOutOfBoundsException(""+index);
return elms[index];
/** Returns the number of keys in this dictionary. */
public synchronized int getLength() {
return n;
public synchronized String toString() {
StringBuffer sb = new StringBuffer();
boolean f = true;
int n = getLength();
sb.append("{ ");
for (int i = 0; i < n; i++) {
if (f) { f = false; }
else { sb.append(", "); }
Attribute attr = item(i);
sb.append(attr.getName() + '=' + attr);
sb.append(" }");
return sb.toString();
// for testing
public static void main(String[] args) throws Exception {
AttrListImpl alist;
Attribute attr;
java.io.BufferedReader r;
java.util.StringTokenizer tok;
String op;
if (args.length > 1) {
alist = new AttrListImpl(Integer.parseInt(args[0]));
} else {
alist = new AttrListImpl();
"Enter operations... op's are one of\n"+
"put <key> <val>\n"+
"get <key>\n"+
"rem <key>\n"+
r = new java.io.BufferedReader(
new java.io.InputStreamReader(System.in));
while (true) {
System.out.print("doyourworst> ");
tok = new java.util.StringTokenizer(r.readLine());
op = tok.nextToken();
if ("put".equals(op)) {
attr = new AttrImpl(tok.nextToken(), tok.nextToken());
System.out.println("Value: " +
} else if ("get".equals(op)) {
attr = alist.getAttribute(tok.nextToken());
System.out.println("Value: " +
(attr == null ? "No such element" : attr.toString()));
} else if ("rem".equals(op)) {
attr = alist.remove(tok.nextToken());
System.out.println("Value: " + attr);
} else if (op.startsWith("s")) {
System.out.println("Size: " + alist.getLength());
} else if (op.startsWith("q")) {
} else {
System.out.println("Unrecognized op: " + op);
System.out.println("AttributeList: " + alist);
System.out.println("Size: " + alist.getLength());
@ -0,0 +1,46 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
/**
 * A java.io.CharArrayWriter with the additional property that users can get
 * to the actual underlying storage. Hence it's very fast (and dangerous).
 *
 * @author Anders Kristensen
 */
public final class CharBuffer extends java.io.CharArrayWriter {

    /** Creates a buffer with the superclass's default initial capacity. */
    public CharBuffer() {
        super();
    }

    /**
     * Creates a buffer with the given initial capacity.
     *
     * @param size initial capacity, forwarded to CharArrayWriter
     */
    public CharBuffer(int size) {
        super(size);
    }

    /**
     * Truncates the buffer to the given length. Use only to
     * <em>decrement</em> the size: a value that is not smaller than the
     * current length leaves the buffer unchanged.
     *
     * @param size the new length in chars
     * @throws IllegalArgumentException if size is negative, since a
     *         negative count would corrupt every later write
     */
    public void setLength(int size) {
        if (size < 0) {
            throw new IllegalArgumentException("Negative length: " + size);
        }
        synchronized (lock) {
            if (size < count) count = size;
        }
    }

    /**
     * Returns the live backing array, not a copy. Only the first
     * getLength() chars are valid content.
     */
    public char[] getCharArray() {
        synchronized (lock) {
            return buf;
        }
    }

    /** Returns the number of valid chars currently in the buffer. */
    public int getLength() {
        return count;
    }
}
@ -0,0 +1,23 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
import hplb.org.w3c.dom.DOM;
import hplb.org.w3c.dom.Document;
public class DOMImpl implements DOM {
public Document createDocument(String type) {
return new DocumentImpl();
public boolean hasFeature(String feature) {
return false;
@ -0,0 +1,25 @@
* $Id$
* Copyright 1997 Hewlett-Packard Company
* This file may be copied, modified and distributed only in
* accordance with the terms of the limited licence contained
* in the accompanying file LICENSE.TXT.
package hplb.xml;
import hplb.org.w3c.dom.*;
public class DocContextImpl implements DocumentContext {
Document doc;
public Document getDocument() {
return doc;
public void setDocument(Document arg) {
doc = arg;
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user