mirror of https://github.com/apache/lucene.git
more than one start URL; hostResolver
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150839 13f79535-47bb-0310-9956-ffa450edef68
parent aaffb3ed75
commit e50d457ef7

de/lanlab/larm/fetcher/FetcherMain.java
@@ -56,12 +56,12 @@ package de.lanlab.larm.fetcher;
 
 import de.lanlab.larm.threads.ThreadPoolObserver;
 import de.lanlab.larm.threads.ThreadPool;
-import de.lanlab.larm.gui.*;
 import de.lanlab.larm.util.*;
 import de.lanlab.larm.storage.*;
 import de.lanlab.larm.net.*;
 import HTTPClient.*;
 import org.apache.oro.text.regex.MalformedPatternException;
+import java.io.*;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.*;
@@ -109,6 +109,17 @@ public class FetcherMain
      */
     protected RobotExclusionFilter reFilter;
 
+    /**
+     * the host manager keeps track of all hosts and is used by the filters.
+     */
+    protected HostManager hostManager;
+
+    /**
+     * the host resolver can change a host that occurs within a URL to a different
+     * host, depending on the rules specified in a configuration file
+     */
+    protected HostResolver hostResolver;
+
     /**
      * this rather flaky filter just filters out some URLs, i.e. different views
      * of Apache the apache DirIndex module. Has to be made
@@ -122,10 +133,6 @@ public class FetcherMain
      */
     protected URLLengthFilter urlLengthFilter;
 
-    /**
-     * the host manager keeps track of all hosts and is used by the filters.
-     */
-    protected HostManager hostManager;
 
     /**
      * this is the main document fetcher. It contains a thread pool that fetches the
@@ -152,7 +159,7 @@ public class FetcherMain
      *
      * @param nrThreads number of fetcher threads to be created
      */
-    private FetcherMain(int nrThreads)
+    public FetcherMain(int nrThreads, String hostResolverFile) throws Exception
     {
         // to make things clear, this method is commented a bit better than
         // the rest of the program...
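
The constructor is now public, takes the host resolver file as a second argument, and declares throws Exception, so callers must handle or propagate failures from resolver initialization. A minimal hypothetical call site (the file name is invented for illustration):

    // sketch only: "hostresolver.properties" is a placeholder, not part of the commit
    FetcherMain fetcher = new FetcherMain(10, "hostresolver.properties");
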
@@ -172,44 +179,73 @@ public class FetcherMain
 
         // the LogStorage used here does extensive logging. It logs all links and
         // document information.
-        // it also saves all documents to page files. Probably this single storage
-        // could also be replaced by a pipeline; or even incorporated into the
-        // existing message pipeline
-        SimpleLogger storeLog = new SimpleLogger("store", false);
-        SimpleLogger linksLog = new SimpleLogger("links", false);
+        // it also saves all documents to page files.
+        File logsDir = new File("logs");
+        logsDir.mkdir(); // ensure log directory exists
 
+        // in this experimental implementation, the crawler is pretty verbose
+        // the SimpleLogger, however, is a FlyWeight logger which is buffered and
+        // not thread safe by default
+        SimpleLogger storeLog = new SimpleLogger("store", /* add date/time? */ false);
+        SimpleLogger visitedLog = new SimpleLogger("URLVisitedFilter", /* add date/time? */ false);
+        SimpleLogger scopeLog = new SimpleLogger("URLScopeFilter", /* add date/time? */ false);
+        SimpleLogger pathsLog = new SimpleLogger("KnownPathsFilter", /* add date/time? */ false);
+        SimpleLogger linksLog = new SimpleLogger("links", /* add date/time? */ false);
+        SimpleLogger lengthLog = new SimpleLogger("length", /* add date/time? */ false);
+
         StoragePipeline storage = new StoragePipeline();
-        //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
 
+        // in the default configuration, the crawler will only save the document
+        // information to store.log and the link information to links.log
+        // The contents of the files are _not_ saved. If you set
+        // "save in page files" to "true", they will be saved in "page files",
+        // binary files each containing a set of documents. Here, the
+        // maximum file size is ~50 MB (crawled files won't be split up into different
+        // files). The logs/store.log file contains pointers to these files: a page
+        // file number, the offset within that file, and the document's length
+
+        // FIXME: default constructor for all storages + bean access methods
+        storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false,
+                                             /* page file prefix */ "logs/pagefile"));
         storage.addLinkStorage(new LinkLogStorage(linksLog));
         storage.addLinkStorage(messageHandler);
+        /*
+        // experimental Lucene storage. will slow the crawler down *a lot*
         LuceneStorage luceneStorage = new LuceneStorage();
         luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
         luceneStorage.setCreate(true);
         // FIXME: index name and path need to be configurable
         luceneStorage.setIndexName("luceneIndex");
+        // the field names come from URLMessage.java and WebDocument.java. See
+        // LuceneStorage source for details
         luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
         luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
         storage.addDocStorage(luceneStorage);
+        */
 
         storage.open();
 
         //storage.addStorage(new JMSStorage(...));
 
-        // a third example would be the NullStorage, which converts the documents into
-        // heat, which evaporates above the processor
-        // NullStorage();
-
-        hostManager = new HostManager(1000);
 
         // create the filters and add them to the message queue
-        reFilter = new RobotExclusionFilter(hostManager);
-        urlScopeFilter = new URLScopeFilter();
-        urlVisitedFilter = new URLVisitedFilter(100000);
-        knownPathsFilter = new KnownPathsFilter();
-        urlLengthFilter = new URLLengthFilter(255);
+        urlScopeFilter = new URLScopeFilter(scopeLog);
 
         // dnsResolver = new DNSResolver();
+        hostManager = new HostManager(1000);
+        hostResolver = new HostResolver();
+        hostResolver.initFromFile(hostResolverFile);
+        hostManager.setHostResolver(hostResolver);
+
+        // hostManager.addSynonym("www.fachsprachen.uni-muenchen.de", "www.fremdsprachen.uni-muenchen.de");
+        // hostManager.addSynonym("www.uni-muenchen.de", "www.lmu.de");
+        // hostManager.addSynonym("www.uni-muenchen.de", "uni-muenchen.de");
+        // hostManager.addSynonym("webinfo.uni-muenchen.de", "www.webinfo.uni-muenchen.de");
+        // hostManager.addSynonym("webinfo.uni-muenchen.de", "webinfo.campus.lmu.de");
+        // hostManager.addSynonym("www.s-a.uni-muenchen.de", "s-a.uni-muenchen.de");
+
+        reFilter = new RobotExclusionFilter(hostManager);
+
         fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
 
         // prevent message box popups
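
As the new comments explain, document contents are only written to the ~50 MB binary page files when the LogStorage flag is true; logs/store.log then records a page file number, offset, and length per document. A hypothetical variant of the added call with the flag flipped:

    // sketch: same call as in the commit, but with page files enabled
    storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ true,
                                         /* page file prefix */ "logs/pagefile"));
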
@@ -218,6 +254,8 @@ public class FetcherMain
         // prevent GZipped files from being decoded
         HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
 
+        urlVisitedFilter = new URLVisitedFilter(visitedLog, 100000);
+
         // initialize the threads
         fetcher.init();
 
@@ -241,12 +279,15 @@ public class FetcherMain
         messageHandler.addListener(reFilter);
         messageHandler.addListener(urlVisitedFilter);
         messageHandler.addListener(knownPathsFilter);
 
         messageHandler.addListener(fetcher);
 
-        /* uncomment this to enable HTTPClient logging
+        //uncomment this to enable HTTPClient logging
+        /*
         try
         {
-            HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
+            HTTPClient.Log.setLogWriter(new java.io.OutputStreamWriter(System.out) //new java.io.FileWriter("logs/HttpClient.log")
+                                        ,false);
             HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
         }
         catch (Exception e)
@@ -254,6 +295,7 @@ public class FetcherMain
             e.printStackTrace();
         }
         */
 
     }
 
+
@@ -276,11 +318,11 @@ public class FetcherMain
      * @exception java.net.MalformedURLException Description of Exception
      */
     public void putURL(URL url, boolean isFrame)
-        throws java.net.MalformedURLException
+        // throws java.net.MalformedURLException
     {
         try
         {
-            messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
+            messageHandler.putMessage(new URLMessage(url, null, isFrame == true ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, null, this.hostResolver));
         }
         catch (Exception e)
         {
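
putURL() now translates the isFrame flag into URLMessage.LINKTYPE_FRAME or URLMessage.LINKTYPE_ANCHOR and passes the hostResolver instead of the hostManager. A hypothetical caller (URL invented):

    // queued as LINKTYPE_ANCHOR; passing true would mark it LINKTYPE_FRAME
    f.putURL(new URL("http://localhost/index.html"), false);
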
@@ -288,7 +330,6 @@ public class FetcherMain
             System.out.println("Exception: " + e.getMessage());
             e.printStackTrace();
         }
-        //System.out.println("URLs geschrieben");
     }
 
 
@@ -341,24 +382,69 @@ public class FetcherMain
      *
      * @param args The command line arguments
      */
-    public static void main(String[] args)
+    public static void main(String[] args) throws Exception
     {
         int nrThreads = 10;
 
-        String startURL = "";
-        String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
+        ArrayList startURLs = new ArrayList();
+        String restrictTo = ".*";
         boolean gui = false;
         boolean showInfo = false;
-        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
+        String hostResolverFile = "";
+        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - B.20020914");
         // FIXME: consider using Jakarta Commons' CLI package for command line parameters
 
         for (int i = 0; i < args.length; i++)
         {
             if (args[i].equals("-start"))
             {
                 i++;
-                startURL = args[i];
-                System.out.println("Start-URL set to: " + startURL);
+                String arg = args[i];
+                if(arg.startsWith("@"))
+                {
+                    // input is a file with one URL per line
+                    String fileName = arg.substring(1);
+                    System.out.println("reading URL file " + fileName);
+                    try
+                    {
+                        BufferedReader r = new BufferedReader(new FileReader(fileName));
+                        String line;
+                        int count=0;
+                        while((line = r.readLine()) != null)
+                        {
+                            try
+                            {
+                                startURLs.add(new URL(line));
+                                count++;
+                            }
+                            catch (MalformedURLException e)
+                            {
+                                System.out.println("Malformed URL '" + line + "' in line " + (count+1) + " of file " + fileName);
+
+                            }
+                        }
+                        r.close();
+                        System.out.println("added " + count + " URLs from " + fileName);
+                    }
+                    catch(IOException e)
+                    {
+                        System.out.println("Couldn't read '" + fileName + "': " + e);
+                    }
+                }
+                else
+                {
+                    System.out.println("got URL " + arg);
+                    try
+                    {
+                        startURLs.add(new URL(arg));
+                        System.out.println("Start-URL added: " + arg);
+                    }
+                    catch (MalformedURLException e)
+                    {
+                        System.out.println("Malformed URL '" + arg + "'");
+
+                    }
+                }
             }
             else if (args[i].equals("-restrictto"))
             {
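
The new @<filename> form of -start reads one URL per line; malformed lines are reported with their line number and skipped rather than aborting the run. A hypothetical urls.txt and invocation (file and host names invented, following the usage example further down):

    http://localhost/
    http://webinfo.uni-muenchen.de/

    java de.lanlab.larm.fetcher.FetcherMain -start @urls.txt -start http://localhost/ -restrictto .* -threads 25 -hostresolver hostresolver.properties
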
@@ -372,6 +458,13 @@ public class FetcherMain
                 nrThreads = Integer.parseInt(args[i]);
                 System.out.println("Threads set to " + nrThreads);
             }
+            else if (args[i].equals("-hostresolver"))
+            {
+                i++;
+                hostResolverFile = args[i];
+                System.out.println("reading host resolver props from '" + hostResolverFile + "'");
+
+            }
             else if (args[i].equals("-gui"))
             {
                 gui = true;
@@ -390,10 +483,60 @@ public class FetcherMain
         //URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
         // replaced by HTTPClient
 
-        FetcherMain f = new FetcherMain(nrThreads);
-        if (showInfo || (startURL.equals("") && gui == false))
+        FetcherMain f = new FetcherMain(nrThreads, hostResolverFile);
+        if (showInfo || "".equals(hostResolverFile) || (startURLs.isEmpty() && gui == false))
         {
-            System.out.println("Usage: FetcherMain -start <URL> -restrictto <RegEx> [-threads <nr=10>]"); // [-gui]
+            System.out.println("The LARM crawler\n" +
+                "\n" +
+                "The LARM crawler is a fast parallel crawler, currently designed for\n" +
+                "large intranets (up to a couple hundred hosts with some hundred thousand\n" +
+                "documents). It is currently restricted by a relatively high memory overhead\n" +
+                "per crawled host, and by a HashMap of already crawled URLs which is also held\n" +
+                "in memory.\n" +
+                "\n" +
+                "Usage: FetcherMain <-start <URL>|@<filename>>+ -restrictto <RegEx>\n" +
+                "       [-threads <nr=10>] [-hostresolver <filename>]\n" +
+                "\n" +
+                "Commands:\n" +
+                " -start         specify one or more URLs to start with. You can as well specify a file" +
+                "                that contains URLs, one each line\n" +
+                " -restrictto    a Perl 5 regular expression each URL must match. It is run against the\n" +
+                "                _complete_ URL, including the http:// part\n" +
+                " -threads       the number of crawling threads. defaults to 10\n" +
+                " -hostresolver  specify a file that contains rules for changing the host part of \n" +
+                "                a URL during the normalization process (experimental).\n" +
+                "Caution: The <RegEx> is applied to the _normalized_ form of a URL.\n" +
+                "         See URLNormalizer for details\n" +
+                "Example:\n" +
+                "  -start @urls1.txt -start @urls2.txt -start http://localhost/ " +
+                "  -restrictto http://[^/]*\\.localhost/.* -threads 25\n" +
+                "\n" +
+                "The host resolver file may contain the following commands: \n" +
+                "  startsWith(part1) = part2\n" +
+                "     if host starts with part1, this part will be replaced by part2\n" +
+                "  endsWith(part1) = part2\n" +
+                "     if host ends with part1, this part will be replaced by part2. This is done after\n" +
+                "     startsWith was processed\n" +
+                "  synonym(host1) = host2\n" +
+                "     the keywords startsWith, endsWith and synonym are case sensitive\n" +
+                "     host1 will be replaced with host2. this is done _after_ startsWith and endsWith was \n" +
+                "     processed. Due to a bug in BeanUtils, dots are not allowed in the keys (in parentheses)\n" +
+                "     and have to be escaped with commas. To simplify, commas are also replaced in property \n" +
+                "     values. So just use commas instead of dots. The resulting host names are only used for \n" +
+                "     comparisons and do not have to be existing URLs (although the syntax has to be valid).\n" +
+                "     However, the names will often be passed to java.net.URL which will try to make a DNS name\n" +
+                "     resolution, which will time out if the server can't be found. \n" +
+                "  Example:" +
+                "     synonym(www1,host,com) = host,com\n" +
+                "     startsWith(www,) = ,\n" +
+                "     endsWith(host1,com) = host,com\n" +
+                "The crawler will show a status message every 5 seconds, which is printed by ThreadMonitor.java\n" +
+                "It will stop after the ThreadMonitor found the message queue and the crawling threads to be idle a \n" +
+                "couple of times.\n" +
+                "The crawled data will be saved within a logs/ directory. A cachingqueue/ directory is used for\n" +
+                "temporary queues.\n" +
+                "Note that this implementation is experimental, and that the command line options cover only a part \n" +
+                "of the parameters. Much of the configuration can only be done by modifying FetcherMain.java\n");
             System.exit(0);
         }
         try
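
The usage text above spells out the host resolver file format, with commas standing in for dots because BeanUtils cannot handle dots in property keys. A minimal hypothetical file combining the three rule types, taken from the examples in the usage text (host names are placeholders):

    # commas stand in for dots: host,com means host.com
    synonym(www1,host,com) = host,com
    startsWith(www,) = ,
    endsWith(host1,com) = host,com
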
@@ -403,17 +546,14 @@ public class FetcherMain
             if (gui)
             {
                 // f.initGui(f, startURL);
+                // the GUI is not longer supported
             }
             else
-            {
-                try
             {
                 f.startMonitor();
-                f.putURL(new URL(startURL), false);
-            }
-            catch (MalformedURLException e)
+                for(Iterator it = startURLs.iterator(); it.hasNext(); )
                 {
-                    System.out.println("Malformed URL");
+                    f.putURL((URL)it.next(), false);
                 }
             }
         }