- Made constructor private, added a few FIXMEs, etc.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150826 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2002-09-15 00:38:14 +00:00
parent c3a69fd77d
commit 69397847cd
1 changed files with 30 additions and 36 deletions

View File

@ -56,16 +56,16 @@ package de.lanlab.larm.fetcher;
import de.lanlab.larm.threads.ThreadPoolObserver; import de.lanlab.larm.threads.ThreadPoolObserver;
import de.lanlab.larm.threads.ThreadPool; import de.lanlab.larm.threads.ThreadPool;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import de.lanlab.larm.gui.*; import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*; import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*; import de.lanlab.larm.storage.*;
import de.lanlab.larm.net.*; import de.lanlab.larm.net.*;
import javax.swing.UIManager;
import HTTPClient.*; import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException; import org.apache.oro.text.regex.MalformedPatternException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import javax.swing.UIManager;
/** /**
@ -109,11 +109,6 @@ public class FetcherMain
*/ */
protected RobotExclusionFilter reFilter; protected RobotExclusionFilter reFilter;
/**
* the host manager keeps track of all hosts and is used by the filters.
*/
protected HostManager hostManager;
/** /**
* this rather flaky filter just filters out some URLs, i.e. different views * this rather flaky filter just filters out some URLs, i.e. different views
* of the Apache DirIndex module. Has to be made * of the Apache DirIndex module. Has to be made
@ -121,19 +116,28 @@ public class FetcherMain
*/ */
protected KnownPathsFilter knownPathsFilter; protected KnownPathsFilter knownPathsFilter;
/**
* the URL length filter filters URLs that are too long, i.e. because of errors
* in the implementation of dynamic web sites
*/
protected URLLengthFilter urlLengthFilter;
/**
* the host manager keeps track of all hosts and is used by the filters.
*/
protected HostManager hostManager;
/** /**
* this is the main document fetcher. It contains a thread pool that fetches the * this is the main document fetcher. It contains a thread pool that fetches the
* documents and stores them * documents and stores them
*/ */
protected Fetcher fetcher; protected Fetcher fetcher;
/** /**
* the thread monitor once was only a monitoring tool, but now has become a * the thread monitor once was only a monitoring tool, but now has become a
* vital part of the system that computes statistics and * vital part of the system that computes statistics and
* flushes the log file buffers * flushes the log file buffers
*/ */
protected ThreadMonitor monitor; protected ThreadMonitor monitor;
/** /**
@ -142,25 +146,19 @@ public class FetcherMain
*/ */
protected DocumentStorage storage; protected DocumentStorage storage;
/**
* the URL length filter filters URLs that are too long, i.e. because of errors
* in the implementation of dynamic web sites
*/
protected URLLengthFilter urlLengthFilter;
/** /**
* initializes all classes and registers anonymous adapter classes as * initializes all classes and registers anonymous adapter classes as
* listeners for fetcher events. * listeners for fetcher events.
* *
* @param nrThreads number of fetcher threads to be created * @param nrThreads number of fetcher threads to be created
*/ */
public FetcherMain(int nrThreads) private FetcherMain(int nrThreads)
{ {
// to make things clear, this method is commented a bit better than // to make things clear, this method is commented a bit better than
// the rest of the program... // the rest of the program...
// this is the main message queue. handlers are registered with // this is the main message queue. handlers are registered with
// the queue, and whenever a message is put in it, they are passed to the // the queue, and whenever a message is put in it, the message is passed to the
// filters in a "chain of responsibility" manner. Every listener can decide // filters in a "chain of responsibility" manner. Every listener can decide
// to throw the message away // to throw the message away
messageHandler = new MessageHandler(); messageHandler = new MessageHandler();
@ -169,7 +167,6 @@ public class FetcherMain
// matter how it does it, whether it's in a file, in a database or // matter how it does it, whether it's in a file, in a database or
// whatever // whatever
// example for the (very slow) SQL Server storage: // example for the (very slow) SQL Server storage:
// this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads); // this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
@ -190,6 +187,7 @@ public class FetcherMain
LuceneStorage luceneStorage = new LuceneStorage(); LuceneStorage luceneStorage = new LuceneStorage();
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer()); luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
luceneStorage.setCreate(true); luceneStorage.setCreate(true);
// FIXME: index name and path need to be configurable
luceneStorage.setIndexName("luceneIndex"); luceneStorage.setIndexName("luceneIndex");
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE); luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN); luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
@ -202,30 +200,24 @@ public class FetcherMain
// heat, which evaporates above the processor // heat, which evaporates above the processor
// NullStorage(); // NullStorage();
// create the filters and add them to the message queue
urlScopeFilter = new URLScopeFilter();
urlVisitedFilter = new URLVisitedFilter(100000);
// dnsResolver = new DNSResolver();
hostManager = new HostManager(1000); hostManager = new HostManager(1000);
// create the filters and add them to the message queue
reFilter = new RobotExclusionFilter(hostManager); reFilter = new RobotExclusionFilter(hostManager);
urlScopeFilter = new URLScopeFilter();
fetcher = new Fetcher(nrThreads, storage, storage, hostManager); urlVisitedFilter = new URLVisitedFilter(100000);
knownPathsFilter = new KnownPathsFilter(); knownPathsFilter = new KnownPathsFilter();
urlLengthFilter = new URLLengthFilter(255); urlLengthFilter = new URLLengthFilter(255);
// dnsResolver = new DNSResolver();
fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
// prevent message box popups // prevent message box popups
HTTPConnection.setDefaultAllowUserInteraction(false); HTTPConnection.setDefaultAllowUserInteraction(false);
// prevent GZipped files from being decoded // prevent GZipped files from being decoded
HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class); HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
// initialize the threads // initialize the threads
fetcher.init(); fetcher.init();
@ -266,9 +258,9 @@ public class FetcherMain
/** /**
* Sets the RexString attribute of the FetcherMain object * Sets the RexString attribute of <code>UrlScopeFilter</code>.
* *
* @param restrictTo The new RexString value * @param restrictTo the new RexString value
*/ */
public void setRexString(String restrictTo) throws MalformedPatternException public void setRexString(String restrictTo) throws MalformedPatternException
{ {
@ -292,6 +284,7 @@ public class FetcherMain
} }
catch (Exception e) catch (Exception e)
{ {
// FIXME: replace with logging
System.out.println("Exception: " + e.getMessage()); System.out.println("Exception: " + e.getMessage());
e.printStackTrace(); e.printStackTrace();
} }
@ -344,7 +337,7 @@ public class FetcherMain
/** /**
* The main program. parsed * The main program.
* *
* @param args The command line arguments * @param args The command line arguments
*/ */
@ -357,6 +350,8 @@ public class FetcherMain
boolean gui = false; boolean gui = false;
boolean showInfo = false; boolean showInfo = false;
System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02"); System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
// FIXME: consider using Jakarta Commons' CLI package for command line parameters
for (int i = 0; i < args.length; i++) for (int i = 0; i < args.length; i++)
{ {
if (args[i].equals("-start")) if (args[i].equals("-start"))
@ -419,7 +414,6 @@ public class FetcherMain
catch (MalformedURLException e) catch (MalformedURLException e)
{ {
System.out.println("Malformed URL"); System.out.println("Malformed URL");
} }
} }
} }