SOLR-2826: URLClassify Update Processor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1300091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2012-03-13 12:23:11 +00:00
parent ea862e17a6
commit ba6024cc7f
3 changed files with 382 additions and 0 deletions

View File

@ -0,0 +1,234 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashSet;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Update processor which examines a URL and outputs to various other fields
* characteristics of that URL, including length, number of path levels, whether
* it is a top level URL (levels==0), whether it looks like a landing/index page,
* a canonical representation of the URL (e.g. stripping index.html), the domain
* and path parts of the URL etc.
* <p>
* This processor is intended used in connection with processing web resuources,
* and helping to produce values which may be used for boosting or filtering later.
*/
public class URLClassifyProcessor extends UpdateRequestProcessor {
private static final String INPUT_FIELD_PARAM = "inputField";
private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";
private static final String DEFAULT_URL_FIELDNAME = "url";
private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";
private final static Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class);
private boolean enabled = true;
private String urlFieldname = DEFAULT_URL_FIELDNAME;
private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
private String domainFieldname = null;
private String canonicalUrlFieldname = null;
private String[] landingPageSuffixes = {
"/",
"index.html",
"index.htm",
"index.phtml",
"index.shtml",
"index.xml",
"index.php",
"index.asp",
"index.aspx",
"welcome.html",
"welcome.htm",
"welcome.phtml",
"welcome.shtml",
"welcome.xml",
"welcome.php",
"welcome.asp",
"welcome.aspx"
};
public URLClassifyProcessor(SolrParams parameters,
SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor) {
super(nextProcessor);
HashSet<String> landingPageSuffixesSet = new HashSet<String>();
for(String s : landingPageSuffixes) {
landingPageSuffixesSet.add(s);
}
this.initParameters(parameters);
}
private void initParameters(SolrParams parameters) {
if (parameters != null) {
this.setEnabled(parameters.getBool("enabled", true));
this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
}
}
@Override
public void processAdd(AddUpdateCommand command) throws IOException {
if (isEnabled()) {
SolrInputDocument document = command.getSolrInputDocument();
if (document.containsKey(urlFieldname)) {
String url = (String) document.getFieldValue(urlFieldname);
try {
URL normalizedURL = getNormalizedURL(url);
document.setField(lengthFieldname, length(normalizedURL));
document.setField(levelsFieldname, levels(normalizedURL));
document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
if (domainFieldname != null) {
document.setField(domainFieldname, normalizedURL.getHost());
}
if (canonicalUrlFieldname != null) {
document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
}
log.debug(document.toString());
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
}
}
}
super.processAdd(command);
}
/**
* Gets a canonical form of the URL for use as main URL
* @param url The input url
* @return The URL object representing the canonical URL
*/
public URL getCanonicalUrl(URL url) {
// NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
String urlString = url.toString();
try {
String lps = landingPageSuffix(url);
return new URL(urlString.replaceFirst("/"+lps+"$", "/"));
} catch (MalformedURLException e) {
e.printStackTrace();
}
return url;
}
/**
* Calculates the length of the URL in characters
* @param url The input URL
* @return the length of the URL
*/
public int length(URL url) {
return url.toString().length();
}
/**
* Calculates the number of path levels in the given URL
* @param url The input URL
* @return the number of levels, where a top-level URL is 0
*/
public int levels(URL url) {
// Remove any trailing slashes for the purpose of level counting
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
int levels = 0;
for (int i = 0; i < path.length(); i++) {
if (path.charAt(i) == '/') {
levels++;
}
}
return levels;
}
/**
* Calculates whether a URL is a top level page
* @param url The input URL
* @return true if page is a top level page
*/
public boolean isTopLevelPage(URL url) {
// Remove any trailing slashes for the purpose of level counting
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
return path.length() == 0 && url.getQuery() == null;
}
/**
* Calculates whether the URL is a landing page or not
* @param url The input URL
* @return true if URL represents a landing page (index page)
*/
public boolean isLandingPage(URL url) {
if (url.getQuery() != null) {
return false;
} else {
return landingPageSuffix(url) != "";
}
}
public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
return new URI(url).normalize().toURL();
}
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
private String landingPageSuffix(URL url) {
String path = url.getPath().toLowerCase();
for(String suffix : landingPageSuffixes) {
if(path.endsWith(suffix)) {
return suffix;
}
}
return "";
}
private String getPathWithoutSuffix(URL url) {
return url.getPath().toLowerCase().replaceFirst(landingPageSuffix(url)+"$", "");
}
}

View File

@ -0,0 +1,44 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
/**
* Creates URLClassifyProcessor
*/
public class URLClassifyProcessorFactory extends UpdateRequestProcessorFactory {
private SolrParams params;
@Override
public void init(@SuppressWarnings("rawtypes") final NamedList args) {
if (args != null) {
this.params = SolrParams.toSolrParams(args);
}
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor) {
return new URLClassifyProcessor(params, request, response, nextProcessor);
}
}

View File

@ -0,0 +1,104 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.URLClassifyProcessor;
import org.apache.solr.update.processor.URLClassifyProcessorFactory;
import org.junit.BeforeClass;
import org.junit.Test;
public class URLClassifyProcessorTest extends SolrTestCaseJ4 {
private static URLClassifyProcessor classifyProcessor;
@BeforeClass
public static void initTest() {
classifyProcessor =
(URLClassifyProcessor) new URLClassifyProcessorFactory().getInstance(null, null, null);
}
@Test
public void testProcessor() throws IOException {
AddUpdateCommand addCommand = new AddUpdateCommand(null);
SolrInputDocument document = new SolrInputDocument();
document.addField("id", "test");
document.addField("url", "http://www.example.com");
addCommand.solrDoc = document;
classifyProcessor.processAdd(addCommand);
}
@Test
public void testNormalizations() throws MalformedURLException, URISyntaxException {
String url1 = "http://www.example.com/research/";
String url2 = "http://www.example.com/research/../research/";
assertEquals(classifyProcessor.getNormalizedURL(url1), classifyProcessor.getNormalizedURL(url2));
}
@Test
public void testLength() throws MalformedURLException, URISyntaxException {
assertEquals(22, classifyProcessor.length(classifyProcessor.getNormalizedURL("http://www.example.com")));
}
@Test
public void testLevels() throws MalformedURLException, URISyntaxException {
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/index.html")));
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/")));
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/")));
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com")));
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("https://www.example.com")));
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com////")));
}
@Test
public void testLandingPage() throws MalformedURLException, URISyntaxException {
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.html")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.htm")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.php")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.asp")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("https://www.example.com/research/")));
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
assertFalse(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/intro.htm")));
}
@Test
public void testTopLevelPage() throws MalformedURLException, URISyntaxException {
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com")));
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com:1234/#anchor")));
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/foo")));
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com/?sorting=lastModified%253Adesc&tag=myTag&view=feed")));
}
@Test
public void testCanonicalUrl() throws MalformedURLException, URISyntaxException {
assertEquals("http://www.example.com/", classifyProcessor.getCanonicalUrl(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")).toString());
}
}