mirror of https://github.com/apache/lucene.git
SOLR-2826: URLClassify Update Processor
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1300091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ea862e17a6
commit
ba6024cc7f
|
@ -0,0 +1,234 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashSet;
import java.util.Locale;

import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Update processor which examines a URL and outputs to various other fields
|
||||
* characteristics of that URL, including length, number of path levels, whether
|
||||
* it is a top level URL (levels==0), whether it looks like a landing/index page,
|
||||
* a canonical representation of the URL (e.g. stripping index.html), the domain
|
||||
* and path parts of the URL etc.
|
||||
* <p>
|
||||
* This processor is intended used in connection with processing web resuources,
|
||||
* and helping to produce values which may be used for boosting or filtering later.
|
||||
*/
|
||||
public class URLClassifyProcessor extends UpdateRequestProcessor {
|
||||
|
||||
private static final String INPUT_FIELD_PARAM = "inputField";
|
||||
private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
|
||||
private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
|
||||
private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
|
||||
private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
|
||||
private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
|
||||
private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";
|
||||
private static final String DEFAULT_URL_FIELDNAME = "url";
|
||||
private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
|
||||
private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
|
||||
private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
|
||||
private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";
|
||||
private final static Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class);
|
||||
private boolean enabled = true;
|
||||
private String urlFieldname = DEFAULT_URL_FIELDNAME;
|
||||
private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
|
||||
private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
|
||||
private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
|
||||
private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
|
||||
private String domainFieldname = null;
|
||||
private String canonicalUrlFieldname = null;
|
||||
private String[] landingPageSuffixes = {
|
||||
"/",
|
||||
"index.html",
|
||||
"index.htm",
|
||||
"index.phtml",
|
||||
"index.shtml",
|
||||
"index.xml",
|
||||
"index.php",
|
||||
"index.asp",
|
||||
"index.aspx",
|
||||
"welcome.html",
|
||||
"welcome.htm",
|
||||
"welcome.phtml",
|
||||
"welcome.shtml",
|
||||
"welcome.xml",
|
||||
"welcome.php",
|
||||
"welcome.asp",
|
||||
"welcome.aspx"
|
||||
};
|
||||
|
||||
public URLClassifyProcessor(SolrParams parameters,
|
||||
SolrQueryRequest request,
|
||||
SolrQueryResponse response,
|
||||
UpdateRequestProcessor nextProcessor) {
|
||||
super(nextProcessor);
|
||||
|
||||
HashSet<String> landingPageSuffixesSet = new HashSet<String>();
|
||||
for(String s : landingPageSuffixes) {
|
||||
landingPageSuffixesSet.add(s);
|
||||
}
|
||||
this.initParameters(parameters);
|
||||
}
|
||||
|
||||
private void initParameters(SolrParams parameters) {
|
||||
if (parameters != null) {
|
||||
this.setEnabled(parameters.getBool("enabled", true));
|
||||
this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
|
||||
this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
|
||||
this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
|
||||
this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
|
||||
this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
|
||||
this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
|
||||
this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processAdd(AddUpdateCommand command) throws IOException {
|
||||
if (isEnabled()) {
|
||||
SolrInputDocument document = command.getSolrInputDocument();
|
||||
if (document.containsKey(urlFieldname)) {
|
||||
String url = (String) document.getFieldValue(urlFieldname);
|
||||
try {
|
||||
URL normalizedURL = getNormalizedURL(url);
|
||||
document.setField(lengthFieldname, length(normalizedURL));
|
||||
document.setField(levelsFieldname, levels(normalizedURL));
|
||||
document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
|
||||
document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
|
||||
if (domainFieldname != null) {
|
||||
document.setField(domainFieldname, normalizedURL.getHost());
|
||||
}
|
||||
if (canonicalUrlFieldname != null) {
|
||||
document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
|
||||
}
|
||||
log.debug(document.toString());
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (URISyntaxException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
super.processAdd(command);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a canonical form of the URL for use as main URL
|
||||
* @param url The input url
|
||||
* @return The URL object representing the canonical URL
|
||||
*/
|
||||
public URL getCanonicalUrl(URL url) {
|
||||
// NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
|
||||
String urlString = url.toString();
|
||||
try {
|
||||
String lps = landingPageSuffix(url);
|
||||
return new URL(urlString.replaceFirst("/"+lps+"$", "/"));
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the length of the URL in characters
|
||||
* @param url The input URL
|
||||
* @return the length of the URL
|
||||
*/
|
||||
public int length(URL url) {
|
||||
return url.toString().length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of path levels in the given URL
|
||||
* @param url The input URL
|
||||
* @return the number of levels, where a top-level URL is 0
|
||||
*/
|
||||
public int levels(URL url) {
|
||||
// Remove any trailing slashes for the purpose of level counting
|
||||
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
|
||||
int levels = 0;
|
||||
for (int i = 0; i < path.length(); i++) {
|
||||
if (path.charAt(i) == '/') {
|
||||
levels++;
|
||||
}
|
||||
}
|
||||
return levels;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates whether a URL is a top level page
|
||||
* @param url The input URL
|
||||
* @return true if page is a top level page
|
||||
*/
|
||||
public boolean isTopLevelPage(URL url) {
|
||||
// Remove any trailing slashes for the purpose of level counting
|
||||
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
|
||||
return path.length() == 0 && url.getQuery() == null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates whether the URL is a landing page or not
|
||||
* @param url The input URL
|
||||
* @return true if URL represents a landing page (index page)
|
||||
*/
|
||||
public boolean isLandingPage(URL url) {
|
||||
if (url.getQuery() != null) {
|
||||
return false;
|
||||
} else {
|
||||
return landingPageSuffix(url) != "";
|
||||
}
|
||||
}
|
||||
|
||||
public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
|
||||
return new URI(url).normalize().toURL();
|
||||
}
|
||||
|
||||
public boolean isEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
public void setEnabled(boolean enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
private String landingPageSuffix(URL url) {
|
||||
String path = url.getPath().toLowerCase();
|
||||
for(String suffix : landingPageSuffixes) {
|
||||
if(path.endsWith(suffix)) {
|
||||
return suffix;
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private String getPathWithoutSuffix(URL url) {
|
||||
return url.getPath().toLowerCase().replaceFirst(landingPageSuffix(url)+"$", "");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
|
||||
/**
|
||||
* Creates URLClassifyProcessor
|
||||
*/
|
||||
public class URLClassifyProcessorFactory extends UpdateRequestProcessorFactory {
|
||||
|
||||
private SolrParams params;
|
||||
|
||||
@Override
|
||||
public void init(@SuppressWarnings("rawtypes") final NamedList args) {
|
||||
if (args != null) {
|
||||
this.params = SolrParams.toSolrParams(args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateRequestProcessor getInstance(SolrQueryRequest request,
|
||||
SolrQueryResponse response,
|
||||
UpdateRequestProcessor nextProcessor) {
|
||||
return new URLClassifyProcessor(params, request, response, nextProcessor);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.URLClassifyProcessor;
|
||||
import org.apache.solr.update.processor.URLClassifyProcessorFactory;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class URLClassifyProcessorTest extends SolrTestCaseJ4 {
|
||||
|
||||
private static URLClassifyProcessor classifyProcessor;
|
||||
|
||||
@BeforeClass
|
||||
public static void initTest() {
|
||||
classifyProcessor =
|
||||
(URLClassifyProcessor) new URLClassifyProcessorFactory().getInstance(null, null, null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessor() throws IOException {
|
||||
AddUpdateCommand addCommand = new AddUpdateCommand(null);
|
||||
SolrInputDocument document = new SolrInputDocument();
|
||||
document.addField("id", "test");
|
||||
document.addField("url", "http://www.example.com");
|
||||
addCommand.solrDoc = document;
|
||||
classifyProcessor.processAdd(addCommand);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormalizations() throws MalformedURLException, URISyntaxException {
|
||||
String url1 = "http://www.example.com/research/";
|
||||
String url2 = "http://www.example.com/research/../research/";
|
||||
assertEquals(classifyProcessor.getNormalizedURL(url1), classifyProcessor.getNormalizedURL(url2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLength() throws MalformedURLException, URISyntaxException {
|
||||
assertEquals(22, classifyProcessor.length(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLevels() throws MalformedURLException, URISyntaxException {
|
||||
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
|
||||
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/index.html")));
|
||||
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/")));
|
||||
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
|
||||
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("https://www.example.com")));
|
||||
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com////")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLandingPage() throws MalformedURLException, URISyntaxException {
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.html")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.htm")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.php")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.asp")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("https://www.example.com/research/")));
|
||||
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||
assertFalse(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/intro.htm")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTopLevelPage() throws MalformedURLException, URISyntaxException {
|
||||
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com:1234/#anchor")));
|
||||
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
|
||||
|
||||
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/foo")));
|
||||
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com/?sorting=lastModified%253Adesc&tag=myTag&view=feed")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCanonicalUrl() throws MalformedURLException, URISyntaxException {
|
||||
assertEquals("http://www.example.com/", classifyProcessor.getCanonicalUrl(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")).toString());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue