From 8790e328db59c5277a9850487f1cf44d01b25834 Mon Sep 17 00:00:00 2001 From: cmarschner Date: Tue, 18 Jun 2002 00:45:10 +0000 Subject: [PATCH] added experimental version of LuceneStorage git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150785 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/de/lanlab/larm/fetcher/Fetcher.java | 6 ++++ .../de/lanlab/larm/fetcher/FetcherMain.java | 13 +++++-- .../de/lanlab/larm/fetcher/FetcherTask.java | 34 ++++++++++++------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java index ec831f616c3..8cc43c9d946 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/Fetcher.java @@ -93,6 +93,11 @@ public class Fetcher implements MessageListener */ DocumentStorage storage; + /** + * the storage where the links are saved to + */ + LinkStorage linkStorage; + /** * the host manager keeps track of host information */ @@ -110,6 +115,7 @@ public class Fetcher implements MessageListener public Fetcher(int maxThreads, DocumentStorage docStorage, LinkStorage linkStorage, HostManager hostManager) { this.storage = storage; + this.linkStorage = linkStorage; FetcherTask.setDocStorage(docStorage); FetcherTask.setLinkStorage(linkStorage); fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager)); diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java index a326ceee19c..15af780165e 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java @@ -183,10 +183,19 @@ public class FetcherMain StoragePipeline storage = new StoragePipeline(); - storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile")); + //storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile")); storage.addLinkStorage(new LinkLogStorage(linksLog)); storage.addLinkStorage(messageHandler); - //storage.addStorage(new LuceneStorage(...)); + + LuceneStorage luceneStorage = new LuceneStorage(); + luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer()); + luceneStorage.setCreate(true); + luceneStorage.setIndexName("luceneIndex"); + luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE); + luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN); + storage.addDocStorage(luceneStorage); + storage.open(); + //storage.addStorage(new JMSStorage(...)); // a third example would be the NullStorage, which converts the documents into diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java index 586549e9a1d..1ed8a85db7a 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherTask.java @@ -80,6 +80,8 @@ import de.lanlab.larm.net.*; * the storage. If it's an HTML document, it will be parsed and all links will * be put into the message handler again. * + * stores contents of the files in field "contents" + * * @author Clemens Marschner * @version $Id$ */ @@ -406,32 +408,40 @@ public class FetcherTask Tokenizer tok = new Tokenizer(); tok.setLinkHandler(this); tok.parse(new SimpleCharArrayReader(fullCharBuffer)); + + taskState.setState(FT_STORING, ipURL); + linkStorage.storeLinks(foundUrls); + WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager); + d.addField("content", fullCharBuffer); + docStorage.store(d); } else { // System.out.println("Discovered unknown content type: " + contentType + " at " + urlString); - errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing"); + //errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing"); + taskState.setState(FT_STORING, ipURL); + linkStorage.storeLinks(foundUrls); + WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager); + d.addField("content", fullBuffer); + docStorage.store(d); } log.log("scanned"); } - taskState.setState(FT_STORING, ipURL); - linkStorage.storeLinks(foundUrls); - //messageHandler.putMessages(foundUrls); - docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager)); + log.log("stored"); } } catch (InterruptedIOException e) { // timeout while reading this file - System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl()); + //System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl()); errorLog.log("error: Timeout: " + this.actURLMessage.getUrl()); hi.badRequest(); } catch (FileNotFoundException e) { taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl()); + //System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl()); errorLog.log("error: File not Found: " + this.actURLMessage.getUrl()); } catch(NoRouteToHostException e) @@ -439,7 +449,7 @@ public class FetcherTask // router is down or firewall prevents to connect hi.setReachable(false); taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); + //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); // e.printStackTrace(); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage()); } @@ -448,14 +458,14 @@ public class FetcherTask // no server is listening at this port hi.setReachable(false); taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); + //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); // e.printStackTrace(); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage()); } catch (SocketException e) { taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage()); + //System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage()); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage()); } @@ -464,7 +474,7 @@ public class FetcherTask // IP Address not to be determined hi.setReachable(false); taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); + //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); // e.printStackTrace(); errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage()); @@ -472,7 +482,7 @@ public class FetcherTask catch (IOException e) { taskState.setState(FT_EXCEPTION); - System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); + //System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage()); // e.printStackTrace(); errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());