mirror of https://github.com/apache/lucene.git
SOLR-2826: URLClassify Update Processor
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1300091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ea862e17a6
commit
ba6024cc7f
|
@ -0,0 +1,234 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
|
import org.apache.solr.update.AddUpdateCommand;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update processor which examines a URL and outputs to various other fields
|
||||||
|
* characteristics of that URL, including length, number of path levels, whether
|
||||||
|
* it is a top level URL (levels==0), whether it looks like a landing/index page,
|
||||||
|
* a canonical representation of the URL (e.g. stripping index.html), the domain
|
||||||
|
* and path parts of the URL etc.
|
||||||
|
* <p>
|
||||||
|
* This processor is intended used in connection with processing web resuources,
|
||||||
|
* and helping to produce values which may be used for boosting or filtering later.
|
||||||
|
*/
|
||||||
|
public class URLClassifyProcessor extends UpdateRequestProcessor {
|
||||||
|
|
||||||
|
private static final String INPUT_FIELD_PARAM = "inputField";
|
||||||
|
private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
|
||||||
|
private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
|
||||||
|
private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
|
||||||
|
private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
|
||||||
|
private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
|
||||||
|
private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";
|
||||||
|
private static final String DEFAULT_URL_FIELDNAME = "url";
|
||||||
|
private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
|
||||||
|
private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
|
||||||
|
private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
|
||||||
|
private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";
|
||||||
|
private final static Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class);
|
||||||
|
private boolean enabled = true;
|
||||||
|
private String urlFieldname = DEFAULT_URL_FIELDNAME;
|
||||||
|
private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
|
||||||
|
private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
|
||||||
|
private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
|
||||||
|
private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
|
||||||
|
private String domainFieldname = null;
|
||||||
|
private String canonicalUrlFieldname = null;
|
||||||
|
private String[] landingPageSuffixes = {
|
||||||
|
"/",
|
||||||
|
"index.html",
|
||||||
|
"index.htm",
|
||||||
|
"index.phtml",
|
||||||
|
"index.shtml",
|
||||||
|
"index.xml",
|
||||||
|
"index.php",
|
||||||
|
"index.asp",
|
||||||
|
"index.aspx",
|
||||||
|
"welcome.html",
|
||||||
|
"welcome.htm",
|
||||||
|
"welcome.phtml",
|
||||||
|
"welcome.shtml",
|
||||||
|
"welcome.xml",
|
||||||
|
"welcome.php",
|
||||||
|
"welcome.asp",
|
||||||
|
"welcome.aspx"
|
||||||
|
};
|
||||||
|
|
||||||
|
public URLClassifyProcessor(SolrParams parameters,
|
||||||
|
SolrQueryRequest request,
|
||||||
|
SolrQueryResponse response,
|
||||||
|
UpdateRequestProcessor nextProcessor) {
|
||||||
|
super(nextProcessor);
|
||||||
|
|
||||||
|
HashSet<String> landingPageSuffixesSet = new HashSet<String>();
|
||||||
|
for(String s : landingPageSuffixes) {
|
||||||
|
landingPageSuffixesSet.add(s);
|
||||||
|
}
|
||||||
|
this.initParameters(parameters);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initParameters(SolrParams parameters) {
|
||||||
|
if (parameters != null) {
|
||||||
|
this.setEnabled(parameters.getBool("enabled", true));
|
||||||
|
this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
|
||||||
|
this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
|
||||||
|
this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
|
||||||
|
this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
|
||||||
|
this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
|
||||||
|
this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
|
||||||
|
this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processAdd(AddUpdateCommand command) throws IOException {
|
||||||
|
if (isEnabled()) {
|
||||||
|
SolrInputDocument document = command.getSolrInputDocument();
|
||||||
|
if (document.containsKey(urlFieldname)) {
|
||||||
|
String url = (String) document.getFieldValue(urlFieldname);
|
||||||
|
try {
|
||||||
|
URL normalizedURL = getNormalizedURL(url);
|
||||||
|
document.setField(lengthFieldname, length(normalizedURL));
|
||||||
|
document.setField(levelsFieldname, levels(normalizedURL));
|
||||||
|
document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
|
||||||
|
document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
|
||||||
|
if (domainFieldname != null) {
|
||||||
|
document.setField(domainFieldname, normalizedURL.getHost());
|
||||||
|
}
|
||||||
|
if (canonicalUrlFieldname != null) {
|
||||||
|
document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
|
||||||
|
}
|
||||||
|
log.debug(document.toString());
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
super.processAdd(command);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a canonical form of the URL for use as main URL
|
||||||
|
* @param url The input url
|
||||||
|
* @return The URL object representing the canonical URL
|
||||||
|
*/
|
||||||
|
public URL getCanonicalUrl(URL url) {
|
||||||
|
// NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
|
||||||
|
String urlString = url.toString();
|
||||||
|
try {
|
||||||
|
String lps = landingPageSuffix(url);
|
||||||
|
return new URL(urlString.replaceFirst("/"+lps+"$", "/"));
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the length of the URL in characters
|
||||||
|
* @param url The input URL
|
||||||
|
* @return the length of the URL
|
||||||
|
*/
|
||||||
|
public int length(URL url) {
|
||||||
|
return url.toString().length();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of path levels in the given URL
|
||||||
|
* @param url The input URL
|
||||||
|
* @return the number of levels, where a top-level URL is 0
|
||||||
|
*/
|
||||||
|
public int levels(URL url) {
|
||||||
|
// Remove any trailing slashes for the purpose of level counting
|
||||||
|
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
|
||||||
|
int levels = 0;
|
||||||
|
for (int i = 0; i < path.length(); i++) {
|
||||||
|
if (path.charAt(i) == '/') {
|
||||||
|
levels++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return levels;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates whether a URL is a top level page
|
||||||
|
* @param url The input URL
|
||||||
|
* @return true if page is a top level page
|
||||||
|
*/
|
||||||
|
public boolean isTopLevelPage(URL url) {
|
||||||
|
// Remove any trailing slashes for the purpose of level counting
|
||||||
|
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
|
||||||
|
return path.length() == 0 && url.getQuery() == null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates whether the URL is a landing page or not
|
||||||
|
* @param url The input URL
|
||||||
|
* @return true if URL represents a landing page (index page)
|
||||||
|
*/
|
||||||
|
public boolean isLandingPage(URL url) {
|
||||||
|
if (url.getQuery() != null) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
return landingPageSuffix(url) != "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
|
||||||
|
return new URI(url).normalize().toURL();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnabled() {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnabled(boolean enabled) {
|
||||||
|
this.enabled = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String landingPageSuffix(URL url) {
|
||||||
|
String path = url.getPath().toLowerCase();
|
||||||
|
for(String suffix : landingPageSuffixes) {
|
||||||
|
if(path.endsWith(suffix)) {
|
||||||
|
return suffix;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getPathWithoutSuffix(URL url) {
|
||||||
|
return url.getPath().toLowerCase().replaceFirst(landingPageSuffix(url)+"$", "");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates URLClassifyProcessor
|
||||||
|
*/
|
||||||
|
public class URLClassifyProcessorFactory extends UpdateRequestProcessorFactory {
|
||||||
|
|
||||||
|
private SolrParams params;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(@SuppressWarnings("rawtypes") final NamedList args) {
|
||||||
|
if (args != null) {
|
||||||
|
this.params = SolrParams.toSolrParams(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateRequestProcessor getInstance(SolrQueryRequest request,
|
||||||
|
SolrQueryResponse response,
|
||||||
|
UpdateRequestProcessor nextProcessor) {
|
||||||
|
return new URLClassifyProcessor(params, request, response, nextProcessor);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,104 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
import org.apache.solr.update.AddUpdateCommand;
|
||||||
|
import org.apache.solr.update.processor.URLClassifyProcessor;
|
||||||
|
import org.apache.solr.update.processor.URLClassifyProcessorFactory;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class URLClassifyProcessorTest extends SolrTestCaseJ4 {
|
||||||
|
|
||||||
|
private static URLClassifyProcessor classifyProcessor;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void initTest() {
|
||||||
|
classifyProcessor =
|
||||||
|
(URLClassifyProcessor) new URLClassifyProcessorFactory().getInstance(null, null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testProcessor() throws IOException {
|
||||||
|
AddUpdateCommand addCommand = new AddUpdateCommand(null);
|
||||||
|
SolrInputDocument document = new SolrInputDocument();
|
||||||
|
document.addField("id", "test");
|
||||||
|
document.addField("url", "http://www.example.com");
|
||||||
|
addCommand.solrDoc = document;
|
||||||
|
classifyProcessor.processAdd(addCommand);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNormalizations() throws MalformedURLException, URISyntaxException {
|
||||||
|
String url1 = "http://www.example.com/research/";
|
||||||
|
String url2 = "http://www.example.com/research/../research/";
|
||||||
|
assertEquals(classifyProcessor.getNormalizedURL(url1), classifyProcessor.getNormalizedURL(url2));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLength() throws MalformedURLException, URISyntaxException {
|
||||||
|
assertEquals(22, classifyProcessor.length(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLevels() throws MalformedURLException, URISyntaxException {
|
||||||
|
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
|
||||||
|
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/index.html")));
|
||||||
|
assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/")));
|
||||||
|
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||||
|
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
|
||||||
|
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||||
|
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("https://www.example.com")));
|
||||||
|
assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com////")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLandingPage() throws MalformedURLException, URISyntaxException {
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.html")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.htm")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.php")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.asp")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("https://www.example.com/research/")));
|
||||||
|
assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||||
|
assertFalse(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/intro.htm")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTopLevelPage() throws MalformedURLException, URISyntaxException {
|
||||||
|
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com")));
|
||||||
|
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
|
||||||
|
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com:1234/#anchor")));
|
||||||
|
assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
|
||||||
|
|
||||||
|
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/foo")));
|
||||||
|
assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com/?sorting=lastModified%253Adesc&tag=myTag&view=feed")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCanonicalUrl() throws MalformedURLException, URISyntaxException {
|
||||||
|
assertEquals("http://www.example.com/", classifyProcessor.getCanonicalUrl(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")).toString());
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue