From ba6024cc7f7e90a9dbd600a0704341316c20d07c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= <janhoy@apache.org>
Date: Tue, 13 Mar 2012 12:23:11 +0000
Subject: [PATCH] SOLR-2826: URLClassify Update Processor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1300091 13f79535-47bb-0310-9956-ffa450edef68
---
 .../processor/URLClassifyProcessor.java       | 234 ++++++++++++++++++
 .../URLClassifyProcessorFactory.java          |  44 ++++
 .../processor/URLClassifyProcessorTest.java   | 104 ++++++++
 3 files changed, 382 insertions(+)
 create mode 100644 solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java
 create mode 100644 solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java
 create mode 100644 solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java
diff --git a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java
new file mode 100644
index 00000000000..f6e3d4f590f
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessor.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.HashSet;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Update processor which examines a URL and outputs to various other fields
+ * characteristics of that URL, including length, number of path levels, whether
+ * it is a top level URL (levels==0), whether it looks like a landing/index page,
+ * a canonical representation of the URL (e.g. stripping index.html), the domain
+ * and path parts of the URL etc.
+ * <p>
+ * This processor is intended to be used when processing web resources,
+ * and helps to produce values which may be used for boosting or filtering later.
+ */
+public class URLClassifyProcessor extends UpdateRequestProcessor {
+  
+  private static final String INPUT_FIELD_PARAM = "inputField"; // names of the init parameters
+  private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
+  private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
+  private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
+  private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
+  private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
+  private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";
+  private static final String DEFAULT_URL_FIELDNAME = "url"; // field names used when not configured
+  private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
+  private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
+  private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
+  private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";
+  private static final Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class);
+  private boolean enabled = true; // processAdd leaves documents untouched when false
+  private String urlFieldname = DEFAULT_URL_FIELDNAME;
+  private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
+  private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
+  private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
+  private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
+  private String domainFieldname = null; // optional output, skipped while null
+  private String canonicalUrlFieldname = null; // optional output, skipped while null
+  private static final String[] landingPageSuffixes = { // constant table, shared by all instances
+      "/",
+      "index.html",
+      "index.htm",
+      "index.phtml",
+      "index.shtml",
+      "index.xml",
+      "index.php",
+      "index.asp",
+      "index.aspx",
+      "welcome.html",
+      "welcome.htm",
+      "welcome.phtml",
+      "welcome.shtml",
+      "welcome.xml",
+      "welcome.php",
+      "welcome.asp",
+      "welcome.aspx"
+  };
+  
+  /**
+   * Creates the processor; {@code request} and {@code response} are accepted but not used.
+   * @param parameters configuration read by initParameters; may be null to keep the defaults
+   * @param nextProcessor the next processor in the chain, handed to the superclass
+   */
+  public URLClassifyProcessor(SolrParams parameters,
+      SolrQueryRequest request,
+      SolrQueryResponse response,
+      UpdateRequestProcessor nextProcessor) {
+    super(nextProcessor);
+    this.initParameters(parameters);
+  }
+  
+  private void initParameters(SolrParams parameters) {
+    // Without configuration every field keeps its declared default.
+    if (parameters == null) { return; }
+    setEnabled(parameters.getBool("enabled", true));
+    urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
+    lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
+    levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
+    toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
+    landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
+    domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
+    canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
+  }
+  
+  @Override
+  public void processAdd(AddUpdateCommand command) throws IOException { // best effort: a bad URL never blocks the add
+    SolrInputDocument document = isEnabled() ? command.getSolrInputDocument() : null; // null when disabled
+    if (document != null && document.containsKey(urlFieldname)) {
+      String url = String.valueOf(document.getFieldValue(urlFieldname)); // tolerates null and non-String values
+      try {
+        URL normalizedURL = getNormalizedURL(url);
+        document.setField(lengthFieldname, length(normalizedURL));
+        document.setField(levelsFieldname, levels(normalizedURL));
+        document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
+        document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
+        if (domainFieldname != null) {
+          document.setField(domainFieldname, normalizedURL.getHost());
+        }
+        if (canonicalUrlFieldname != null) {
+          document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
+        }
+        log.debug("{}", document); // document is only rendered when debug logging is on
+      } catch (MalformedURLException e) {
+        log.warn("Cannot classify URL '{}': {}", url, e.getMessage());
+      } catch (URISyntaxException e) {
+        log.warn("Cannot classify URL '{}': {}", url, e.getMessage());
+      } catch (IllegalArgumentException e) { // URI.toURL() rejects non-absolute URIs this way
+        log.warn("Cannot classify URL '{}': {}", url, e.getMessage());
+      }
+    }
+    super.processAdd(command);
+  }
+  
+  /**
+   * Gets a canonical form of the URL by replacing a trailing landing page file name with "/"
+   * @param url The input url; it is used as given and is not normalized here
+   * @return The canonical URL, or the input URL itself if the rewritten string cannot be parsed
+   */
+  public URL getCanonicalUrl(URL url) {
+    // The suffix is quoted and matched ignoring case, because landingPageSuffix() detects it on a lower-cased path
+    String urlString = url.toString();
+    try {
+      String lps = landingPageSuffix(url);
+      return new URL(urlString.replaceFirst("(?i)/" + java.util.regex.Pattern.quote(lps) + "$", "/"));
+    } catch (MalformedURLException e) {
+      log.warn("Cannot build canonical URL for '{}': {}", urlString, e.getMessage());
+    }
+    return url;
+  }
+  
+  /**
+   * Measures the URL by the number of characters in its string form
+   * @param url The input URL
+   * @return the character count of the URL's external form
+   */
+  public int length(URL url) {
+    return url.toExternalForm().length();
+  }
+  
+  /**
+   * Counts how many path levels the given URL has
+   * @param url The input URL
+   * @return the level count, which is 0 for a top-level URL
+   */
+  public int levels(URL url) {
+    // Start from the path with any landing page suffix removed ...
+    String trimmed = getPathWithoutSuffix(url);
+    // ... and drop trailing slashes so that they are not counted as levels
+    trimmed = trimmed.replaceAll("/+$", "");
+    int slashCount = 0;
+    for (char c : trimmed.toCharArray()) {
+      if (c == '/') { slashCount++; }
+    }
+    return slashCount;
+  }
+  
+  /**
+   * Decides whether a URL points at a top level page
+   * @param url The input URL
+   * @return true if no path is left after stripping suffix and slashes, and there is no query
+   */
+  public boolean isTopLevelPage(URL url) {
+    // Nothing may remain of the path once the suffix and trailing slashes are gone
+    boolean pathIsEmpty = getPathWithoutSuffix(url).replaceAll("/+$", "").isEmpty();
+    return pathIsEmpty && url.getQuery() == null;
+  }
+  
+  /**
+   * Calculates whether the URL is a landing page or not
+   * @param url The input URL
+   * @return true if the URL has no query string and its path ends with a known landing page suffix
+   */
+  public boolean isLandingPage(URL url) {
+    if (url.getQuery() != null) {
+      return false;
+    }
+    // Test the content, not the reference: != on Strings only compares object identity
+    return !landingPageSuffix(url).isEmpty();
+  }
+  
+  public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException { // parse as URI, normalize, convert to URL
+    return new URI(url).normalize().toURL(); // NOTE(review): java.net.URI docs say toURL() throws IllegalArgumentException for non-absolute URIs
+  }
+  
+  public boolean isEnabled() { // consulted by processAdd before it touches a document
+    return enabled;
+  }
+  
+  public void setEnabled(boolean enabled) { // initParameters calls this with the "enabled" parameter (default true)
+    this.enabled = enabled;
+  }
+  
+  // Returns the first entry of landingPageSuffixes that the lower-cased path ends with, or "" if none does.
+  private String landingPageSuffix(URL url) {
+    // Fixed locale: default-locale lower-casing breaks ASCII matching under e.g. Turkish (dotless i)
+    String path = url.getPath().toLowerCase(java.util.Locale.ENGLISH);
+    for (String suffix : landingPageSuffixes) {
+      if (path.endsWith(suffix)) { return suffix; }
+    }
+    return "";
+  }
+  
+  private String getPathWithoutSuffix(URL url) { // lower-cased path (fixed locale) minus its landing page suffix, quoted for the regex
+    return url.getPath().toLowerCase(java.util.Locale.ENGLISH).replaceFirst(java.util.regex.Pattern.quote(landingPageSuffix(url)) + "$", "");
+  }
+}
diff --git a/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java
new file mode 100644
index 00000000000..79eb8e79b2f
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/URLClassifyProcessorFactory.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+/**
+ * Creates URLClassifyProcessor
+ */
+public class URLClassifyProcessorFactory extends UpdateRequestProcessorFactory {
+  
+  private SolrParams params; // stays null when init() receives no arguments
+  
+  @Override
+  public void init(@SuppressWarnings("rawtypes") final NamedList args) {
+    // Nothing to convert without arguments; the processor then keeps its defaults
+    if (args == null) { return; }
+    params = SolrParams.toSolrParams(args);
+  }
+  
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest request,
+      SolrQueryResponse response,
+      UpdateRequestProcessor nextProcessor) {
+    return new URLClassifyProcessor(this.params, request, response, nextProcessor);
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java
new file mode 100644
index 00000000000..df92fc928d9
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/update/processor/URLClassifyProcessorTest.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.processor.URLClassifyProcessor;
+import org.apache.solr.update.processor.URLClassifyProcessorFactory;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class URLClassifyProcessorTest extends SolrTestCaseJ4 {
+  
+  private static URLClassifyProcessor classifyProcessor;
+  
+  @BeforeClass
+  public static void initTest() {
+    classifyProcessor = (URLClassifyProcessor) new URLClassifyProcessorFactory().getInstance(null, null, null);
+  }
+  
+  @Test
+  public void testProcessor() throws IOException {
+    AddUpdateCommand addCommand = new AddUpdateCommand(null);
+    SolrInputDocument document = new SolrInputDocument();
+    document.addField("id", "test");
+    document.addField("url", "http://www.example.com");
+    addCommand.solrDoc = document;
+    classifyProcessor.processAdd(addCommand); // unconfigured processor: default output field names apply
+    assertEquals(22, document.getFieldValue("url_length"));
+    assertEquals(0, document.getFieldValue("url_levels"));
+    assertEquals(1, document.getFieldValue("url_toplevel"));
+    assertEquals(0, document.getFieldValue("url_landingpage")); // empty path has no landing page suffix
+  }
+  
+  @Test
+  public void testNormalizations() throws MalformedURLException, URISyntaxException {
+    assertEquals(classifyProcessor.getNormalizedURL("http://www.example.com/research/"), classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/"));
+  }
+  
+  @Test
+  public void testLength() throws MalformedURLException, URISyntaxException {
+    assertEquals(22, classifyProcessor.length(classifyProcessor.getNormalizedURL("http://www.example.com")));
+  }
+  
+  @Test
+  public void testLevels() throws MalformedURLException, URISyntaxException {
+    assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
+    assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/index.html")));
+    assertEquals(1, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/research/../research/")));
+    assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/")));
+    assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
+    assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com")));
+    assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("https://www.example.com")));
+    assertEquals(0, classifyProcessor.levels(classifyProcessor.getNormalizedURL("http://www.example.com////")));
+  }
+  
+  @Test
+  public void testLandingPage() throws MalformedURLException, URISyntaxException {
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.htm")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.html")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/welcome.htm")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.php")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.asp")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/research/")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("https://www.example.com/research/")));
+    assertTrue(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
+    assertFalse(classifyProcessor.isLandingPage(classifyProcessor.getNormalizedURL("http://www.example.com/intro.htm")));
+  }
+  
+  @Test
+  public void testTopLevelPage() throws MalformedURLException, URISyntaxException {
+    assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com")));
+    assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/")));
+    assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com:1234/#anchor")));
+    assertTrue(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")));
+    assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://www.example.com/foo"))); // a path or a query disqualifies
+    assertFalse(classifyProcessor.isTopLevelPage(classifyProcessor.getNormalizedURL("http://subdomain.example.com/?sorting=lastModified%253Adesc&tag=myTag&view=feed")));
+  }
+  
+  @Test
+  public void testCanonicalUrl() throws MalformedURLException, URISyntaxException {
+    assertEquals("http://www.example.com/", classifyProcessor.getCanonicalUrl(classifyProcessor.getNormalizedURL("http://www.example.com/index.html")).toString());
+  }
+}