From ba6024cc7f7e90a9dbd600a0704341316c20d07c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 13 Mar 2012 12:23:11 +0000 Subject: [PATCH] SOLR-2826: URLClassify Update Processor git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1300091 13f79535-47bb-0310-9956-ffa450edef68 --- .../processor/URLClassifyProcessor.java | 234 ++++++++++++++++++ .../URLClassifyProcessorFactory.java | 44 ++++ .../processor/URLClassifyProcessorTest.java | 104 ++++++++ 3 files changed, 382 insertions(+) create mode 100644 solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java create mode 100644 solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java create mode 100644 solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java diff --git a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java new file mode 100644 index 00000000000..f6e3d4f590f --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.HashSet; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Update processor which examines a URL and outputs to various other fields + * characteristics of that URL, including length, number of path levels, whether + * it is a top level URL (levels==0), whether it looks like a landing/index page, + * a canonical representation of the URL (e.g. stripping index.html), the domain + * and path parts of the URL etc. + *

+ * This processor is intended used in connection with processing web resuources, + * and helping to produce values which may be used for boosting or filtering later. + */ +public class URLClassifyProcessor extends UpdateRequestProcessor { + + private static final String INPUT_FIELD_PARAM = "inputField"; + private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField"; + private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField"; + private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField"; + private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField"; + private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField"; + private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField"; + private static final String DEFAULT_URL_FIELDNAME = "url"; + private static final String DEFAULT_LENGTH_FIELDNAME = "url_length"; + private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels"; + private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel"; + private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage"; + private final static Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class); + private boolean enabled = true; + private String urlFieldname = DEFAULT_URL_FIELDNAME; + private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME; + private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME; + private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME; + private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME; + private String domainFieldname = null; + private String canonicalUrlFieldname = null; + private String[] landingPageSuffixes = { + "/", + "index.html", + "index.htm", + "index.phtml", + "index.shtml", + "index.xml", + "index.php", + "index.asp", + "index.aspx", + "welcome.html", + "welcome.htm", + "welcome.phtml", + "welcome.shtml", + "welcome.xml", + "welcome.php", + "welcome.asp", + "welcome.aspx" + }; + + public URLClassifyProcessor(SolrParams parameters, + SolrQueryRequest request, + SolrQueryResponse response, + UpdateRequestProcessor nextProcessor) { + super(nextProcessor); + + HashSet landingPageSuffixesSet = new HashSet(); + for(String s : landingPageSuffixes) { + landingPageSuffixesSet.add(s); + } + this.initParameters(parameters); + } + + private void initParameters(SolrParams parameters) { + if (parameters != null) { + this.setEnabled(parameters.getBool("enabled", true)); + this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME); + this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME); + this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME); + this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME); + this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME); + this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM); + this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM); + } + } + + @Override + public void processAdd(AddUpdateCommand command) throws IOException { + if (isEnabled()) { + SolrInputDocument document = command.getSolrInputDocument(); + if (document.containsKey(urlFieldname)) { + String url = (String) document.getFieldValue(urlFieldname); + try { + URL normalizedURL = getNormalizedURL(url); + document.setField(lengthFieldname, length(normalizedURL)); + document.setField(levelsFieldname, levels(normalizedURL)); + document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0); + document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0); + if (domainFieldname != null) { + document.setField(domainFieldname, normalizedURL.getHost()); + } + if (canonicalUrlFieldname != null) { + document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL)); + } + log.debug(document.toString()); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (URISyntaxException e) { + e.printStackTrace(); + } + } + } + super.processAdd(command); + } + + /** + * Gets a canonical form of the URL for use as main URL + * @param url The input url + * @return The URL object representing the canonical URL + */ + public URL getCanonicalUrl(URL url) { + // NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should) + String urlString = url.toString(); + try { + String lps = landingPageSuffix(url); + return new URL(urlString.replaceFirst("/"+lps+"$", "/")); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + return url; + } + + /** + * Calculates the length of the URL in characters + * @param url The input URL + * @return the length of the URL + */ + public int length(URL url) { + return url.toString().length(); + } + + /** + * Calculates the number of path levels in the given URL + * @param url The input URL + * @return the number of levels, where a top-level URL is 0 + */ + public int levels(URL url) { + // Remove any trailing slashes for the purpose of level counting + String path = getPathWithoutSuffix(url).replaceAll("/+$", ""); + int levels = 0; + for (int i = 0; i < path.length(); i++) { + if (path.charAt(i) == '/') { + levels++; + } + } + return levels; + } + + /** + * Calculates whether a URL is a top level page + * @param url The input URL + * @return true if page is a top level page + */ + public boolean isTopLevelPage(URL url) { + // Remove any trailing slashes for the purpose of level counting + String path = getPathWithoutSuffix(url).replaceAll("/+$", ""); + return path.length() == 0 && url.getQuery() == null; + } + + /** + * Calculates whether the URL is a landing page or not + * @param url The input URL + * @return true if URL represents a landing page (index page) + */ + public boolean isLandingPage(URL url) { + if (url.getQuery() != null) { + return false; + } else { + return landingPageSuffix(url) != ""; + } + } + + public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException { + return new URI(url).normalize().toURL(); + } + + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + + private String landingPageSuffix(URL url) { + String path = url.getPath().toLowerCase(); + for(String suffix : landingPageSuffixes) { + if(path.endsWith(suffix)) { + return suffix; + } + } + return ""; + } + + private String getPathWithoutSuffix(URL url) { + return url.getPath().toLowerCase().replaceFirst(landingPageSuffix(url)+"$", ""); + } +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java new file mode 100644 index 00000000000..79eb8e79b2f --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + +/** + * Creates URLClassifyProcessor + */ +public class URLClassifyProcessorFactory extends UpdateRequestProcessorFactory { + + private SolrParams params; + + @Override + public void init(@SuppressWarnings("rawtypes") final NamedList args) { + if (args != null) { + this.params = SolrParams.toSolrParams(args); + } + } + + @Override + public UpdateRequestProcessor getInstance(SolrQueryRequest request, + SolrQueryResponse response, + UpdateRequestProcessor nextProcessor) { + return new URLClassifyProcessor(params, request, response, nextProcessor); + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java new file mode 100644 index 00000000000..df92fc928d9 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URISyntaxException; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.processor.URLClassifyProcessor; +import org.apache.solr.update.processor.URLClassifyProcessorFactory; +import org.junit.BeforeClass; +import org.junit.Test; + +public class URLClassifyProcessorTest extends SolrTestCaseJ4 { + + private static URLClassifyProcessor classifyProcessor; + + @BeforeClass + public static void initTest() { + classifyProcessor = + (URLClassifyProcessor) new URLClassifyProcessorFactory().getInstance(null, null, null); + } + + @Test + public void testProcessor() throws IOException { + AddUpdateCommand addCommand = new AddUpdateCommand(null); + SolrInputDocument document = new SolrInputDocument(); + document.addField("id", "test"); + document.addField("url", "http://www.example.com"); + addCommand.solrDoc = document; + classifyProcessor.processAdd(addCommand); + } + + @Test + public void testNormalizations() throws MalformedURLException, URISyntaxException { + String url1 = "http://www.example.com/research/"; + String url2 = "http://www.example.com/research/../research/"; + assertEquals(classifyProcessor.getNormalizedURL(url1), classifyProcessor.getNormalizedURL(url2)); + } + + @Test + public void testLength() throws MalformedURLException, URISyntaxException { + assertEquals(22, classifyProcessor.length(classifyProcessor.getNormalizedURL("http://www.example.com"))); + } + + @Test + public void testLevels() throws MalformedURLException, URISyntaxException { + assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/"))); + assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/index.html"))); + assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/"))); + assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/"))); + assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm"))); + assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com"))); + assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("https://www.example.com"))); + assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com////"))); + } + + @Test + public void testLandingPage() throws MalformedURLException, URISyntaxException { + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.html"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.htm"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.php"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.asp"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/research/"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("https://www.example.com/research/"))); + assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/"))); + assertFalse(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/intro.htm"))); + } + + @Test + public void testTopLevelPage() throws MalformedURLException, URISyntaxException { + assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com"))); + assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/"))); + assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com:1234/#anchor"))); + assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html"))); + + assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/foo"))); + assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com/?sorting=lastModified%253Adesc&tag=myTag&view=feed"))); + } + + @Test + public void testCanonicalUrl() throws MalformedURLException, URISyntaxException { + assertEquals("http://www.example.com/", classifyProcessor.getCanonicalUrl(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")).toString()); + } +}