diff --git a/sandbox/contributions/webcrawler-LARM/CHANGES.txt b/sandbox/contributions/webcrawler-LARM/CHANGES.txt index b11e0036f8e..12b7032ee7f 100644 --- a/sandbox/contributions/webcrawler-LARM/CHANGES.txt +++ b/sandbox/contributions/webcrawler-LARM/CHANGES.txt @@ -1,5 +1,8 @@ $Id$ +2003-04-11 (cmarschner) + * fixed build issues + 2002-06-18 (cmarschner) * added an experimental version of Lucene storage. see FetcherMain.java for details how to use it LuceneStorage simply saves all fields as specified in WebDocument. add a converter to the diff --git a/sandbox/contributions/webcrawler-LARM/build.properties.sample b/sandbox/contributions/webcrawler-LARM/build.properties.sample deleted file mode 100644 index be18577c6d3..00000000000 --- a/sandbox/contributions/webcrawler-LARM/build.properties.sample +++ /dev/null @@ -1,3 +0,0 @@ -lucene.jar=/usr/local/jakarta-lucene/lucene.jar -oro.jar=/usr/local/jakarta-oro/oro.jar -debug=on diff --git a/sandbox/contributions/webcrawler-LARM/build.sh b/sandbox/contributions/webcrawler-LARM/build.sh deleted file mode 100755 index 0b86a7cfa91..00000000000 --- a/sandbox/contributions/webcrawler-LARM/build.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh - -#clean -echo cleaning -rm -r build -rm -r classes -rm -r cachingqueue -rm -r logs - -#build -echo making build directory -mkdir build -cd build -echo extracting http client -jar xvf ../libs/HTTPClient.zip >/dev/nul -cd .. -cp -r src/* build -mkdir classes -echo compiling -javac -g -d classes -sourcepath build build/HTTPClient/*.java -javac -g -classpath ./libs/jakarta-oro-2.0.5.jar -d classes -sourcepath build build/de/lanlab/larm/fetcher/FetcherMain.java - - diff --git a/sandbox/contributions/webcrawler-LARM/run.bat b/sandbox/contributions/webcrawler-LARM/run.bat deleted file mode 100755 index 71022a9daec..00000000000 --- a/sandbox/contributions/webcrawler-LARM/run.bat +++ /dev/null @@ -1,3 +0,0 @@ -rmdir /s /q -r logs -mkdir logs -java -server -Xmx400mb -classpath classes;libs/jakarta-oro-2.0.5.jar de.lanlab.larm.fetcher.FetcherMain -start http://www.cis.uni-muenchen.de/ -restrictto http://.*\.uni-muenchen\.de.* -threads 15 diff --git a/sandbox/contributions/webcrawler-LARM/run.sh b/sandbox/contributions/webcrawler-LARM/run.sh deleted file mode 100755 index d7ea61890a1..00000000000 --- a/sandbox/contributions/webcrawler-LARM/run.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh - -# -# $Id$ -# - -BASE_DIR=./runtime -LOG_DIR=$BASE_DIR/logs -CACHE_DIR=$BASE_DIR/cachingqueue -CLASSPATH=build/classes:libs/jakarta-oro-2.0.5.jar:libs/HTTPClient.zip:/usr/local/jakarta-lucene/lucene.jar -SLEEP_TIME=2 - -if [ $# -lt 4 ] -then - echo "Usage: `basename $0` <# threads> " >&2 - exit 1 -fi - -START_URL=$1 -SCOPE_REGEX=$2 -THREAD_COUNT=$3 -MAX_MEM=$4 - - -echo Removing $LOG_DIR... -sleep $SLEEP_TIME -rm -r $LOG_DIR -echo Removing $CACHE_DIR... -sleep $SLEEP_TIME -rm -r $CACHE_DIR -echo Creating $LOG_DIR -sleep $SLEEP_TIME -mkdir -p $LOG_DIR -echo Creating $CACHE_DIR -sleep $SLEEP_TIME -mkdir -p $CACHE_DIR - -CMD="java -server -Xmx$MAX_MEM -classpath $CLASSPATH de.lanlab.larm.fetcher.FetcherMain -start $START_URL -restrictto $SCOPE_REGEX -threads $THREAD_COUNT" -echo Starting LARM with: $CMD - -$CMD diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java index 6a56f90048d..5db26746916 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java @@ -65,7 +65,6 @@ import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.util.*; -import javax.swing.UIManager; /** @@ -206,7 +205,7 @@ public class FetcherMain // file number, the offset within that file, and the document's length // FIXME: default constructor for all storages + bean access methods - storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, + storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ true, /* page file prefix */ "logs/pagefile")); storage.addLinkStorage(new LinkLogStorage(linksLog)); storage.addLinkStorage(messageHandler); @@ -234,7 +233,10 @@ public class FetcherMain // dnsResolver = new DNSResolver(); hostManager = new HostManager(1000); hostResolver = new HostResolver(); - hostResolver.initFromFile(hostResolverFile); + if(hostResolverFile != null && !"".equals(hostResolverFile)) + { + hostResolver.initFromFile(hostResolverFile); + } hostManager.setHostResolver(hostResolver); // hostManager.addSynonym("www.fachsprachen.uni-muenchen.de", "www.fremdsprachen.uni-muenchen.de"); @@ -248,6 +250,10 @@ public class FetcherMain fetcher = new Fetcher(nrThreads, storage, storage, hostManager); + urlLengthFilter = new URLLengthFilter(500, lengthLog); + + //knownPathsFilter = new KnownPathsFilter() + // prevent message box popups HTTPConnection.setDefaultAllowUserInteraction(false); @@ -278,7 +284,7 @@ public class FetcherMain messageHandler.addListener(urlScopeFilter); messageHandler.addListener(reFilter); messageHandler.addListener(urlVisitedFilter); - messageHandler.addListener(knownPathsFilter); + //messageHandler.addListener(knownPathsFilter); messageHandler.addListener(fetcher); @@ -484,7 +490,7 @@ public class FetcherMain // replaced by HTTPClient FetcherMain f = new FetcherMain(nrThreads, hostResolverFile); - if (showInfo || "".equals(hostResolverFile) || (startURLs.isEmpty() && gui == false)) + if (showInfo || (startURLs.isEmpty() && gui == false)) { System.out.println("The LARM crawler\n" + "\n" +