mirror of https://github.com/apache/lucene.git
added experimental version of LuceneStorage
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150785 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bdd627d35c
commit
8790e328db
|
@ -93,6 +93,11 @@ public class Fetcher implements MessageListener
|
|||
*/
|
||||
DocumentStorage storage;
|
||||
|
||||
/**
|
||||
* the storage where the links are saved to
|
||||
*/
|
||||
LinkStorage linkStorage;
|
||||
|
||||
/**
|
||||
* the host manager keeps track of host information
|
||||
*/
|
||||
|
@ -110,6 +115,7 @@ public class Fetcher implements MessageListener
|
|||
public Fetcher(int maxThreads, DocumentStorage docStorage, LinkStorage linkStorage, HostManager hostManager)
|
||||
{
|
||||
this.storage = storage;
|
||||
this.linkStorage = linkStorage;
|
||||
FetcherTask.setDocStorage(docStorage);
|
||||
FetcherTask.setLinkStorage(linkStorage);
|
||||
fetcherPool = new ThreadPool(maxThreads, new FetcherThreadFactory(hostManager));
|
||||
|
|
|
@ -183,10 +183,19 @@ public class FetcherMain
|
|||
|
||||
|
||||
StoragePipeline storage = new StoragePipeline();
|
||||
storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
|
||||
//storage.addDocStorage(new LogStorage(storeLog, /* save in page files? */ false, /* logfile prefix */ "logs/pagefile"));
|
||||
storage.addLinkStorage(new LinkLogStorage(linksLog));
|
||||
storage.addLinkStorage(messageHandler);
|
||||
//storage.addStorage(new LuceneStorage(...));
|
||||
|
||||
LuceneStorage luceneStorage = new LuceneStorage();
|
||||
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
|
||||
luceneStorage.setCreate(true);
|
||||
luceneStorage.setIndexName("luceneIndex");
|
||||
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
|
||||
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
|
||||
storage.addDocStorage(luceneStorage);
|
||||
storage.open();
|
||||
|
||||
//storage.addStorage(new JMSStorage(...));
|
||||
|
||||
// a third example would be the NullStorage, which converts the documents into
|
||||
|
|
|
@ -80,6 +80,8 @@ import de.lanlab.larm.net.*;
|
|||
* the storage. If it's an HTML document, it will be parsed and all links will
|
||||
* be put into the message handler again.
|
||||
*
|
||||
* stores contents of the files in field "contents"
|
||||
*
|
||||
* @author Clemens Marschner
|
||||
* @version $Id$
|
||||
*/
|
||||
|
@ -406,32 +408,40 @@ public class FetcherTask
|
|||
Tokenizer tok = new Tokenizer();
|
||||
tok.setLinkHandler(this);
|
||||
tok.parse(new SimpleCharArrayReader(fullCharBuffer));
|
||||
|
||||
taskState.setState(FT_STORING, ipURL);
|
||||
linkStorage.storeLinks(foundUrls);
|
||||
WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
|
||||
d.addField("content", fullCharBuffer);
|
||||
docStorage.store(d);
|
||||
}
|
||||
else
|
||||
{
|
||||
// System.out.println("Discovered unknown content type: " + contentType + " at " + urlString);
|
||||
errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
|
||||
//errorLog.log("[" + threadNr + "] Discovered unknown content type at " + urlString + ": " + contentType + ". just storing");
|
||||
taskState.setState(FT_STORING, ipURL);
|
||||
linkStorage.storeLinks(foundUrls);
|
||||
WebDocument d = new WebDocument(contextUrl, contentType, statusCode, actURLMessage.getReferer(), contentLength, title, date, hostManager);
|
||||
d.addField("content", fullBuffer);
|
||||
docStorage.store(d);
|
||||
}
|
||||
log.log("scanned");
|
||||
}
|
||||
taskState.setState(FT_STORING, ipURL);
|
||||
linkStorage.storeLinks(foundUrls);
|
||||
//messageHandler.putMessages(foundUrls);
|
||||
docStorage.store(new WebDocument(contextUrl, contentType, fullBuffer, statusCode, actURLMessage.getReferer(), contentLength, title, hostManager));
|
||||
|
||||
log.log("stored");
|
||||
}
|
||||
}
|
||||
catch (InterruptedIOException e)
|
||||
{
|
||||
// timeout while reading this file
|
||||
System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
|
||||
//System.out.println("[" + threadNr + "] FetcherTask: Timeout while opening: " + this.actURLMessage.getUrl());
|
||||
errorLog.log("error: Timeout: " + this.actURLMessage.getUrl());
|
||||
hi.badRequest();
|
||||
}
|
||||
catch (FileNotFoundException e)
|
||||
{
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
|
||||
//System.out.println("[" + threadNr + "] FetcherTask: File not Found: " + this.actURLMessage.getUrl());
|
||||
errorLog.log("error: File not Found: " + this.actURLMessage.getUrl());
|
||||
}
|
||||
catch(NoRouteToHostException e)
|
||||
|
@ -439,7 +449,7 @@ public class FetcherTask
|
|||
// router is down or firewall prevents to connect
|
||||
hi.setReachable(false);
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
// e.printStackTrace();
|
||||
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
}
|
||||
|
@ -448,14 +458,14 @@ public class FetcherTask
|
|||
// no server is listening at this port
|
||||
hi.setReachable(false);
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
// e.printStackTrace();
|
||||
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
}
|
||||
catch (SocketException e)
|
||||
{
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
|
||||
//System.out.println("[" + threadNr + "]: SocketException:" + e.getMessage());
|
||||
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
|
||||
}
|
||||
|
@ -464,7 +474,7 @@ public class FetcherTask
|
|||
// IP Address not to be determined
|
||||
hi.setReachable(false);
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
// e.printStackTrace();
|
||||
errorLog.log("error: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
|
||||
|
@ -472,7 +482,7 @@ public class FetcherTask
|
|||
catch (IOException e)
|
||||
{
|
||||
taskState.setState(FT_EXCEPTION);
|
||||
System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
//System.out.println("[" + threadNr + "] " + e.getClass().getName() + ": " + e.getMessage());
|
||||
// e.printStackTrace();
|
||||
errorLog.log("error: IOException: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
|
||||
|
|
Loading…
Reference in New Issue