mirror of https://github.com/apache/lucene.git
- Made constructor private, added a few FIXMEs, etc.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150826 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c3a69fd77d
commit
69397847cd
|
@ -56,16 +56,16 @@ package de.lanlab.larm.fetcher;
|
||||||
|
|
||||||
import de.lanlab.larm.threads.ThreadPoolObserver;
|
import de.lanlab.larm.threads.ThreadPoolObserver;
|
||||||
import de.lanlab.larm.threads.ThreadPool;
|
import de.lanlab.larm.threads.ThreadPool;
|
||||||
import java.net.MalformedURLException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.*;
|
|
||||||
import de.lanlab.larm.gui.*;
|
import de.lanlab.larm.gui.*;
|
||||||
import de.lanlab.larm.util.*;
|
import de.lanlab.larm.util.*;
|
||||||
import de.lanlab.larm.storage.*;
|
import de.lanlab.larm.storage.*;
|
||||||
import de.lanlab.larm.net.*;
|
import de.lanlab.larm.net.*;
|
||||||
import javax.swing.UIManager;
|
|
||||||
import HTTPClient.*;
|
import HTTPClient.*;
|
||||||
import org.apache.oro.text.regex.MalformedPatternException;
|
import org.apache.oro.text.regex.MalformedPatternException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.*;
|
||||||
|
import javax.swing.UIManager;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -109,11 +109,6 @@ public class FetcherMain
|
||||||
*/
|
*/
|
||||||
protected RobotExclusionFilter reFilter;
|
protected RobotExclusionFilter reFilter;
|
||||||
|
|
||||||
/**
|
|
||||||
* the host manager keeps track of all hosts and is used by the filters.
|
|
||||||
*/
|
|
||||||
protected HostManager hostManager;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this rather flaky filter just filters out some URLs, i.e. different views
|
* this rather flaky filter just filters out some URLs, i.e. different views
|
||||||
* of the Apache DirIndex module. Has to be made
|
* of the Apache DirIndex module. Has to be made
|
||||||
|
@ -121,19 +116,28 @@ public class FetcherMain
|
||||||
*/
|
*/
|
||||||
protected KnownPathsFilter knownPathsFilter;
|
protected KnownPathsFilter knownPathsFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the URL length filter filters URLs that are too long, i.e. because of errors
|
||||||
|
* in the implementation of dynamic web sites
|
||||||
|
*/
|
||||||
|
protected URLLengthFilter urlLengthFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the host manager keeps track of all hosts and is used by the filters.
|
||||||
|
*/
|
||||||
|
protected HostManager hostManager;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* this is the main document fetcher. It contains a thread pool that fetches the
|
* this is the main document fetcher. It contains a thread pool that fetches the
|
||||||
* documents and stores them
|
* documents and stores them
|
||||||
*/
|
*/
|
||||||
protected Fetcher fetcher;
|
protected Fetcher fetcher;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the thread monitor once was only a monitoring tool, but now has become a
|
* the thread monitor once was only a monitoring tool, but now has become a
|
||||||
* vital part of the system that computes statistics and
|
* vital part of the system that computes statistics and
|
||||||
* flushes the log file buffers
|
* flushes the log file buffers
|
||||||
*/
|
*/
|
||||||
|
|
||||||
protected ThreadMonitor monitor;
|
protected ThreadMonitor monitor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -142,25 +146,19 @@ public class FetcherMain
|
||||||
*/
|
*/
|
||||||
protected DocumentStorage storage;
|
protected DocumentStorage storage;
|
||||||
|
|
||||||
/**
|
|
||||||
* the URL length filter filters URLs that are too long, i.e. because of errors
|
|
||||||
* in the implementation of dynamic web sites
|
|
||||||
*/
|
|
||||||
protected URLLengthFilter urlLengthFilter;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* initializes all classes and registers anonymous adapter classes as
|
* initializes all classes and registers anonymous adapter classes as
|
||||||
* listeners for fetcher events.
|
* listeners for fetcher events.
|
||||||
*
|
*
|
||||||
* @param nrThreads number of fetcher threads to be created
|
* @param nrThreads number of fetcher threads to be created
|
||||||
*/
|
*/
|
||||||
public FetcherMain(int nrThreads)
|
private FetcherMain(int nrThreads)
|
||||||
{
|
{
|
||||||
// to make things clear, this method is commented a bit better than
|
// to make things clear, this method is commented a bit better than
|
||||||
// the rest of the program...
|
// the rest of the program...
|
||||||
|
|
||||||
// this is the main message queue. handlers are registered with
|
// this is the main message queue. handlers are registered with
|
||||||
// the queue, and whenever a message is put in it, they are passed to the
|
// the queue, and whenever a message is put in it, the message is passed to the
|
||||||
// filters in a "chain of responsibility" manner. Every listener can decide
|
// filters in a "chain of responsibility" manner. Every listener can decide
|
||||||
// to throw the message away
|
// to throw the message away
|
||||||
messageHandler = new MessageHandler();
|
messageHandler = new MessageHandler();
|
||||||
|
@ -169,7 +167,6 @@ public class FetcherMain
|
||||||
// matter how it does it, whether it's in a file, in a database or
|
// matter how it does it, whether it's in a file, in a database or
|
||||||
// whatever
|
// whatever
|
||||||
|
|
||||||
|
|
||||||
// example for the (very slow) SQL Server storage:
|
// example for the (very slow) SQL Server storage:
|
||||||
// this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
|
// this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
|
||||||
|
|
||||||
|
@ -190,6 +187,7 @@ public class FetcherMain
|
||||||
LuceneStorage luceneStorage = new LuceneStorage();
|
LuceneStorage luceneStorage = new LuceneStorage();
|
||||||
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
|
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
|
||||||
luceneStorage.setCreate(true);
|
luceneStorage.setCreate(true);
|
||||||
|
// FIXME: index name and path need to be configurable
|
||||||
luceneStorage.setIndexName("luceneIndex");
|
luceneStorage.setIndexName("luceneIndex");
|
||||||
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
|
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
|
||||||
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
|
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
|
||||||
|
@ -202,30 +200,24 @@ public class FetcherMain
|
||||||
// heat, which evaporates above the processor
|
// heat, which evaporates above the processor
|
||||||
// NullStorage();
|
// NullStorage();
|
||||||
|
|
||||||
// create the filters and add them to the message queue
|
|
||||||
urlScopeFilter = new URLScopeFilter();
|
|
||||||
|
|
||||||
urlVisitedFilter = new URLVisitedFilter(100000);
|
|
||||||
|
|
||||||
// dnsResolver = new DNSResolver();
|
|
||||||
hostManager = new HostManager(1000);
|
hostManager = new HostManager(1000);
|
||||||
|
|
||||||
|
// create the filters and add them to the message queue
|
||||||
reFilter = new RobotExclusionFilter(hostManager);
|
reFilter = new RobotExclusionFilter(hostManager);
|
||||||
|
urlScopeFilter = new URLScopeFilter();
|
||||||
fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
|
urlVisitedFilter = new URLVisitedFilter(100000);
|
||||||
|
|
||||||
knownPathsFilter = new KnownPathsFilter();
|
knownPathsFilter = new KnownPathsFilter();
|
||||||
|
|
||||||
urlLengthFilter = new URLLengthFilter(255);
|
urlLengthFilter = new URLLengthFilter(255);
|
||||||
|
|
||||||
|
// dnsResolver = new DNSResolver();
|
||||||
|
fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
|
||||||
|
|
||||||
// prevent message box popups
|
// prevent message box popups
|
||||||
HTTPConnection.setDefaultAllowUserInteraction(false);
|
HTTPConnection.setDefaultAllowUserInteraction(false);
|
||||||
|
|
||||||
// prevent GZipped files from being decoded
|
// prevent GZipped files from being decoded
|
||||||
HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
|
HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// initialize the threads
|
// initialize the threads
|
||||||
fetcher.init();
|
fetcher.init();
|
||||||
|
|
||||||
|
@ -266,9 +258,9 @@ public class FetcherMain
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the RexString attribute of the FetcherMain object
|
* Sets the RexString attribute of <code>UrlScopeFilter</code>.
|
||||||
*
|
*
|
||||||
* @param restrictTo The new RexString value
|
* @param restrictTo the new RexString value
|
||||||
*/
|
*/
|
||||||
public void setRexString(String restrictTo) throws MalformedPatternException
|
public void setRexString(String restrictTo) throws MalformedPatternException
|
||||||
{
|
{
|
||||||
|
@ -292,6 +284,7 @@ public class FetcherMain
|
||||||
}
|
}
|
||||||
catch (Exception e)
|
catch (Exception e)
|
||||||
{
|
{
|
||||||
|
// FIXME: replace with logging
|
||||||
System.out.println("Exception: " + e.getMessage());
|
System.out.println("Exception: " + e.getMessage());
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
@ -344,7 +337,7 @@ public class FetcherMain
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The main program. parsed
|
* The main program.
|
||||||
*
|
*
|
||||||
* @param args The command line arguments
|
* @param args The command line arguments
|
||||||
*/
|
*/
|
||||||
|
@ -357,6 +350,8 @@ public class FetcherMain
|
||||||
boolean gui = false;
|
boolean gui = false;
|
||||||
boolean showInfo = false;
|
boolean showInfo = false;
|
||||||
System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
|
System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
|
||||||
|
|
||||||
|
// FIXME: consider using Jakarta Commons' CLI package for command line parameters
|
||||||
for (int i = 0; i < args.length; i++)
|
for (int i = 0; i < args.length; i++)
|
||||||
{
|
{
|
||||||
if (args[i].equals("-start"))
|
if (args[i].equals("-start"))
|
||||||
|
@ -419,7 +414,6 @@ public class FetcherMain
|
||||||
catch (MalformedURLException e)
|
catch (MalformedURLException e)
|
||||||
{
|
{
|
||||||
System.out.println("Malformed URL");
|
System.out.println("Malformed URL");
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue