From 9f5185b44ce0d98d466f007ed52734fd7726b8de Mon Sep 17 00:00:00 2001 From: cmarschner Date: Tue, 22 Oct 2002 15:21:00 +0000 Subject: [PATCH] takes normalized URL string for comparisons; added logging git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150845 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/de/lanlab/larm/fetcher/URLScopeFilter.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java index 2928c78cc50..300df87c84e 100644 --- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java +++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLScopeFilter.java @@ -57,6 +57,7 @@ package de.lanlab.larm.fetcher; import org.apache.oro.text.regex.Perl5Matcher; import org.apache.oro.text.regex.Perl5Compiler; import org.apache.oro.text.regex.Pattern; +import de.lanlab.larm.util.*; /** * filter class. Tries to match a regular expression with an incoming URL @@ -77,11 +78,13 @@ class URLScopeFilter extends Filter implements MessageListener private Pattern pattern; private Perl5Matcher matcher; private Perl5Compiler compiler; + SimpleLogger log; - public URLScopeFilter() + public URLScopeFilter(SimpleLogger log) { matcher = new Perl5Matcher(); compiler = new Perl5Compiler(); + this.log = log; } public String getRexString() @@ -108,7 +111,7 @@ class URLScopeFilter extends Filter implements MessageListener { if(message instanceof URLMessage) { - String urlString = ((URLMessage)message).toString(); + String urlString = ((URLMessage)message).getNormalizedURLString(); int length = urlString.length(); char buffer[] = new char[length]; urlString.getChars(0,length,buffer,0); @@ -117,8 +120,10 @@ class URLScopeFilter extends Filter implements MessageListener boolean match = matcher.matches(buffer, pattern); if(!match) { - //System.out.println("not in Scope: " + urlString); + //log.log("URLScopeFilter: not in scope: " + urlString); + log.log(message.toString()); filtered++; + return null; } }