mirror of https://github.com/apache/lucene.git
support more than one start URL; add a configurable HostResolver
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150839 13f79535-47bb-0310-9956-ffa450edef68
parent aaffb3ed75
commit e50d457ef7
@@ -56,12 +56,12 @@ package de.lanlab.larm.fetcher;
 import de.lanlab.larm.threads.ThreadPoolObserver;
 import de.lanlab.larm.threads.ThreadPool;
 import de.lanlab.larm.gui.*;
 import de.lanlab.larm.util.*;
 import de.lanlab.larm.storage.*;
+import de.lanlab.larm.net.*;
 import HTTPClient.*;
 import org.apache.oro.text.regex.MalformedPatternException;
 import java.io.*;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.*;
@@ -109,6 +109,17 @@ public class FetcherMain
      */
     protected RobotExclusionFilter reFilter;
+
+    /**
+     * the host manager keeps track of all hosts and is used by the filters.
+     */
+    protected HostManager hostManager;
+
+    /**
+     * the host resolver can change a host that occurs within a URL to a different
+     * host, depending on the rules specified in a configuration file
+     */
+    protected HostResolver hostResolver;
 
     /**
      * this rather flaky filter just filters out some URLs, i.e. different views
      * generated by the Apache DirIndex module. Has to be made
@@ -122,10 +133,6 @@ public class FetcherMain
      */
     protected URLLengthFilter urlLengthFilter;
 
-    /**
-     * the host manager keeps track of all hosts and is used by the filters.
-     */
-    protected HostManager hostManager;
 
     /**
      * this is the main document fetcher. It contains a thread pool that fetches the
@@ -152,7 +159,7 @@ public class FetcherMain
      *
      * @param nrThreads number of fetcher threads to be created
      */
-    private FetcherMain(int nrThreads)
+    public FetcherMain(int nrThreads, String hostResolverFile) throws Exception
     {
         // to make things clear, this method is commented a bit better than
         // the rest of the program...
@@ -172,44 +179,73 @@ public class FetcherMain
         // the LogStorage used here does extensive logging. It logs all links and
         // document information.
-        // it also saves all documents to page files. Probably this single storage
-        // could also be replaced by a pipeline; or even incorporated into the
-        // existing message pipeline
-        SimpleLogger storeLog = new SimpleLogger("store", false);
-        SimpleLogger linksLog = new SimpleLogger("links", false);
+        // it also saves all documents to page files.
+        File logsDir = new File("logs");
+        logsDir.mkdir(); // ensure log directory exists
+
+        // in this experimental implementation, the crawler is pretty verbose
+        // the SimpleLogger, however, is a FlyWeight logger which is buffered and
+        // not thread safe by default
+        SimpleLogger storeLog = new SimpleLogger("store", /* add date/time? */ false);
+        SimpleLogger visitedLog = new SimpleLogger("URLVisitedFilter", /* add date/time? */ false);
+        SimpleLogger scopeLog = new SimpleLogger("URLScopeFilter", /* add date/time? */ false);
+        SimpleLogger pathsLog = new SimpleLogger("KnownPathsFilter", /* add date/time? */ false);
+        SimpleLogger linksLog = new SimpleLogger("links", /* add date/time? */ false);
+        SimpleLogger lengthLog = new SimpleLogger("length", /* add date/time? */ false);
 
         StoragePipeline storage = new StoragePipeline();
-        //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
+
+        // in the default configuration, the crawler will only save the document
+        // information to store.log and the link information to links.log
+        // The contents of the files are _not_ saved. If you set
+        // "save in page files" to "true", they will be saved in "page files",
+        // binary files each containing a set of documents. Here, the
+        // maximum file size is ~50 MB (crawled files won't be split up into different
+        // files). The logs/store.log file contains pointers to these files: a page
+        // file number, the offset within that file, and the document's length
+
+        // FIXME: default constructor for all storages + bean access methods
         storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false,
                                              /* page file prefix */ "logs/pagefile"));
         storage.addLinkStorage(new LinkLogStorage(linksLog));
         storage.addLinkStorage(messageHandler);
 
         /*
         // experimental Lucene storage. will slow the crawler down *a lot*
         LuceneStorage luceneStorage = new LuceneStorage();
         luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
         luceneStorage.setCreate(true);
         // FIXME: index name and path need to be configurable
         luceneStorage.setIndexName("luceneIndex");
         // the field names come from URLMessage.java and WebDocument.java. See
         // LuceneStorage source for details
         luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
         luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
         storage.addDocStorage(luceneStorage);
         */
 
         storage.open();
 
         //storage.addStorage(new JMSStorage(...));
 
         // a third example would be the NullStorage, which converts the documents into
         // heat, which evaporates above the processor
         // NullStorage();
 
-        hostManager = new HostManager(1000);
-
         // create the filters and add them to the message queue
-        reFilter = new RobotExclusionFilter(hostManager);
-        urlScopeFilter = new URLScopeFilter();
-        urlVisitedFilter = new URLVisitedFilter(100000);
         knownPathsFilter = new KnownPathsFilter();
         urlLengthFilter = new URLLengthFilter(255);
+        urlScopeFilter = new URLScopeFilter(scopeLog);
 
         // dnsResolver = new DNSResolver();
+        hostManager = new HostManager(1000);
+        hostResolver = new HostResolver();
+        hostResolver.initFromFile(hostResolverFile);
+        hostManager.setHostResolver(hostResolver);
 
         // hostManager.addSynonym("www.fachsprachen.uni-muenchen.de", "www.fremdsprachen.uni-muenchen.de");
         // hostManager.addSynonym("www.uni-muenchen.de", "www.lmu.de");
         // hostManager.addSynonym("www.uni-muenchen.de", "uni-muenchen.de");
         // hostManager.addSynonym("webinfo.uni-muenchen.de", "www.webinfo.uni-muenchen.de");
         // hostManager.addSynonym("webinfo.uni-muenchen.de", "webinfo.campus.lmu.de");
         // hostManager.addSynonym("www.s-a.uni-muenchen.de", "s-a.uni-muenchen.de");
 
+        reFilter = new RobotExclusionFilter(hostManager);
+
         fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
 
         // prevent message box popups
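The page-file scheme described in the comments above keeps one pointer per document in logs/store.log: a page file number, an offset within that file, and the document's length. That is enough to fetch a document back without scanning the whole file. A minimal sketch of such a reader, assuming the "logs/pagefile" prefix configured above; the helper class and record layout are hypothetical, not code from this commit:

    import java.io.IOException;
    import java.io.RandomAccessFile;

    // Hypothetical reader for one store.log record (pageFileNo, offset, length).
    class PageFileReaderSketch
    {
        static byte[] readDocument(int pageFileNo, long offset, int length)
            throws IOException
        {
            RandomAccessFile raf = new RandomAccessFile("logs/pagefile" + pageFileNo, "r");
            try
            {
                raf.seek(offset);              // jump straight to the document
                byte[] buf = new byte[length]; // the record carries the exact length
                raf.readFully(buf);
                return buf;
            }
            finally
            {
                raf.close();
            }
        }
    }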
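In the commented-out LuceneStorage block, the setFieldInfo() flags map naturally onto the Lucene 1.x field constructors of the time: "url" is indexed and stored but not tokenized, while "content" is indexed, stored, and tokenized. A rough sketch of the equivalent plain Lucene calls; LuceneStorage itself may differ internally, so treat this as an assumption-based illustration:

    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;

    // Sketch only: roughly what the luceneStorage configuration above amounts to.
    class LuceneStorageSketch
    {
        static void store(String url, String content) throws java.io.IOException
        {
            // 'true' recreates the index, matching setCreate(true)
            IndexWriter writer = new IndexWriter("luceneIndex", new GermanAnalyzer(), true);
            Document doc = new Document();
            doc.add(Field.Keyword("url", url));      // INDEX | STORE (not tokenized)
            doc.add(Field.Text("content", content)); // INDEX | STORE | TOKEN
            writer.addDocument(doc);
            writer.close();
        }
    }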
@@ -218,6 +254,8 @@ public class FetcherMain
         // prevent GZipped files from being decoded
         HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
 
+        urlVisitedFilter = new URLVisitedFilter(visitedLog, 100000);
+
         // initialize the threads
         fetcher.init();
 
@@ -241,12 +279,15 @@ public class FetcherMain
         messageHandler.addListener(reFilter);
         messageHandler.addListener(urlVisitedFilter);
         messageHandler.addListener(knownPathsFilter);
+
         messageHandler.addListener(fetcher);
 
-        /* uncomment this to enable HTTPClient logging
+        //uncomment this to enable HTTPClient logging
+        /*
         try
         {
-            HTTPClient.Log.setLogWriter(new java.io.FileWriter("logs/HttpClient.log"),false);
+            HTTPClient.Log.setLogWriter(new java.io.OutputStreamWriter(System.out) //new java.io.FileWriter("logs/HttpClient.log")
+                                        ,false);
             HTTPClient.Log.setLogging(HTTPClient.Log.ALL, true);
         }
         catch (Exception e)
@@ -254,6 +295,7 @@ public class FetcherMain
             e.printStackTrace();
         }
         */
+
     }
 
@@ -276,11 +318,11 @@ public class FetcherMain
      * @exception java.net.MalformedURLException Description of Exception
      */
     public void putURL(URL url, boolean isFrame)
-        throws java.net.MalformedURLException
+        // throws java.net.MalformedURLException
     {
         try
         {
-            messageHandler.putMessage(new URLMessage(url, null, isFrame, null, this.hostManager));
+            messageHandler.putMessage(new URLMessage(url, null, isFrame == true ? URLMessage.LINKTYPE_FRAME : URLMessage.LINKTYPE_ANCHOR, null, this.hostResolver));
         }
         catch (Exception e)
         {
@@ -288,7 +330,6 @@ public class FetcherMain
             System.out.println("Exception: " + e.getMessage());
             e.printStackTrace();
         }
-        //System.out.println("URLs geschrieben");
     }
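After this change the boolean flag is translated into a URLMessage link type internally, so callers still seed the crawler the same way. A minimal sketch, mirroring what main() does further down (the URL is illustrative):

    // assumes an already-constructed FetcherMain f
    f.startMonitor();
    f.putURL(new java.net.URL("http://localhost/"), /* isFrame */ false);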
@@ -341,24 +382,69 @@ public class FetcherMain
      *
      * @param args The command line arguments
      */
-    public static void main(String[] args)
+    public static void main(String[] args) throws Exception
     {
         int nrThreads = 10;
 
-        String startURL = "";
-        String restrictTo = "http://141.84.120.82/ll/cmarschn/.*";
+        ArrayList startURLs = new ArrayList();
+        String restrictTo = ".*";
         boolean gui = false;
         boolean showInfo = false;
-        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
 
+        String hostResolverFile = "";
+        System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - B.20020914");
+        // FIXME: consider using Jakarta Commons' CLI package for command line parameters
 
         for (int i = 0; i < args.length; i++)
         {
             if (args[i].equals("-start"))
             {
                 i++;
-                startURL = args[i];
-                System.out.println("Start-URL set to: " + startURL);
+                String arg = args[i];
+                if (arg.startsWith("@"))
+                {
+                    // input is a file with one URL per line
+                    String fileName = arg.substring(1);
+                    System.out.println("reading URL file " + fileName);
+                    try
+                    {
+                        BufferedReader r = new BufferedReader(new FileReader(fileName));
+                        String line;
+                        int count = 0;
+                        while ((line = r.readLine()) != null)
+                        {
+                            try
+                            {
+                                startURLs.add(new URL(line));
+                                count++;
+                            }
+                            catch (MalformedURLException e)
+                            {
+                                System.out.println("Malformed URL '" + line + "' in line " + (count+1) + " of file " + fileName);
+                            }
+                        }
+                        r.close();
+                        System.out.println("added " + count + " URLs from " + fileName);
+                    }
+                    catch (IOException e)
+                    {
+                        System.out.println("Couldn't read '" + fileName + "': " + e);
+                    }
+                }
+                else
+                {
+                    System.out.println("got URL " + arg);
+                    try
+                    {
+                        startURLs.add(new URL(arg));
+                        System.out.println("Start-URL added: " + arg);
+                    }
+                    catch (MalformedURLException e)
+                    {
+                        System.out.println("Malformed URL '" + arg + "'");
+                    }
+                }
+            }
             else if (args[i].equals("-restrictto"))
             {
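The parsing above accepts -start repeatedly, each time with either a literal URL or an @-prefixed file holding one URL per line. A plausible invocation combining both forms, based on the usage text later in this commit (the file names are placeholders, not part of the commit):

    java de.lanlab.larm.fetcher.FetcherMain \
        -start @urls1.txt -start http://localhost/ \
        -restrictto "http://[^/]*\.localhost/.*" \
        -threads 25 -hostresolver hostresolver.properties

Here urls1.txt would simply contain one URL per line, e.g. http://localhost/index.html.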
@@ -372,6 +458,13 @@ public class FetcherMain
                 nrThreads = Integer.parseInt(args[i]);
                 System.out.println("Threads set to " + nrThreads);
             }
+            else if (args[i].equals("-hostresolver"))
+            {
+                i++;
+                hostResolverFile = args[i];
+                System.out.println("reading host resolver props from '" + hostResolverFile + "'");
+
+            }
             else if (args[i].equals("-gui"))
             {
                 gui = true;
@@ -390,10 +483,60 @@ public class FetcherMain
         //URL.setURLStreamHandlerFactory(new HttpTimeoutFactory(500));
         // replaced by HTTPClient
 
-        FetcherMain f = new FetcherMain(nrThreads);
-        if (showInfo || (startURL.equals("") && gui == false))
+        FetcherMain f = new FetcherMain(nrThreads, hostResolverFile);
+        if (showInfo || "".equals(hostResolverFile) || (startURLs.isEmpty() && gui == false))
         {
-            System.out.println("Usage: FetcherMain -start <URL> -restrictto <RegEx> [-threads <nr=10>]"); // [-gui]
+            System.out.println("The LARM crawler\n" +
+                "\n" +
+                "The LARM crawler is a fast parallel crawler, currently designed for\n" +
+                "large intranets (up to a couple hundred hosts with some hundred thousand\n" +
+                "documents). It is currently restricted by a relatively high memory overhead\n" +
+                "per crawled host, and by a HashMap of already crawled URLs which is also held\n" +
+                "in memory.\n" +
+                "\n" +
+                "Usage: FetcherMain <-start <URL>|@<filename>>+ -restrictto <RegEx>\n" +
+                "       [-threads <nr=10>] [-hostresolver <filename>]\n" +
+                "\n" +
+                "Commands:\n" +
+                " -start        specify one or more URLs to start with. You can also specify a file" +
+                "               that contains URLs, one per line\n" +
+                " -restrictto   a Perl 5 regular expression each URL must match. It is run against the\n" +
+                "               _complete_ URL, including the http:// part\n" +
+                " -threads      the number of crawling threads. defaults to 10\n" +
+                " -hostresolver specify a file that contains rules for changing the host part of\n" +
+                "               a URL during the normalization process (experimental).\n" +
+                "Caution: The <RegEx> is applied to the _normalized_ form of a URL.\n" +
+                "         See URLNormalizer for details\n" +
+                "Example:\n" +
+                "  -start @urls1.txt -start @urls2.txt -start http://localhost/ " +
+                " -restrictto http://[^/]*\\.localhost/.* -threads 25\n" +
+                "\n" +
+                "The host resolver file may contain the following commands:\n" +
+                " startsWith(part1) = part2\n" +
+                "   if host starts with part1, this part will be replaced by part2\n" +
+                " endsWith(part1) = part2\n" +
+                "   if host ends with part1, this part will be replaced by part2. This is done after\n" +
+                "   startsWith was processed\n" +
+                " synonym(host1) = host2\n" +
+                "   host1 will be replaced with host2. This is done _after_ startsWith and endsWith were\n" +
+                "   processed. The keywords startsWith, endsWith and synonym are case sensitive.\n" +
+                "   Due to a bug in BeanUtils, dots are not allowed in the keys (in parentheses)\n" +
+                "   and have to be escaped with commas. To simplify, commas are also replaced in property\n" +
+                "   values. So just use commas instead of dots. The resulting host names are only used for\n" +
+                "   comparisons and do not have to be existing URLs (although the syntax has to be valid).\n" +
+                "   However, the names will often be passed to java.net.URL which will try to make a DNS\n" +
+                "   name resolution, which will time out if the server can't be found.\n" +
+                "   Example:\n" +
+                "     synonym(www1,host,com) = host,com\n" +
+                "     startsWith(www,) = ,\n" +
+                "     endsWith(host1,com) = host,com\n" +
+                "The crawler will show a status message every 5 seconds, which is printed by ThreadMonitor.java\n" +
+                "It will stop after the ThreadMonitor found the message queue and the crawling threads to be idle a\n" +
+                "couple of times.\n" +
+                "The crawled data will be saved within a logs/ directory. A cachingqueue/ directory is used for\n" +
+                "temporary queues.\n" +
+                "Note that this implementation is experimental, and that the command line options cover only a part\n" +
+                "of the parameters. Much of the configuration can only be done by modifying FetcherMain.java\n");
             System.exit(0);
         }
         try
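Given the rule syntax documented in the usage text above (with commas standing in for dots because of the BeanUtils escaping bug), a complete host resolver file could look like this; the rules are the ones from the built-in example, shown here in properties-file form:

    # example hostresolver.properties
    # replace host www1.host.com by host.com (applied after startsWith/endsWith)
    synonym(www1,host,com) = host,com
    # strip a leading "www." from every host
    startsWith(www,) = ,
    # rewrite hosts ending in host1.com to host.com
    endsWith(host1,com) = host,com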
@@ -403,17 +546,14 @@ public class FetcherMain
             if (gui)
             {
                 // f.initGui(f, startURL);
+                // the GUI is no longer supported
             }
             else
             {
-                try
+                f.startMonitor();
+                for (Iterator it = startURLs.iterator(); it.hasNext(); )
                 {
-                    f.startMonitor();
-                    f.putURL(new URL(startURL), false);
-                }
-                catch (MalformedURLException e)
-                {
-                    System.out.println("Malformed URL");
+                    f.putURL((URL)it.next(), false);
                 }
             }