SOLR-13324 - Don't swallow/print exception in URLClassifyProcessor anymore

This commit is contained in:
Gus Heck 2019-03-24 19:07:26 -04:00
parent bca22d58e2
commit c60685f9e4
2 changed files with 22 additions and 23 deletions

View File

@ -41,6 +41,10 @@ Upgrade Notes
expanding the 'expr' parameter can be reinstated with -DStreamingExpressionMacros=true passed to the JVM at startup
(Gus Heck).
* SOLR-13324: URLClassifyProcessor#getCanonicalUrl now throws MalformedURLException rather than hiding it. Although the
present code is unlikely to produce such an exception, it may become possible in future changes or in subclasses.
Currently this change should only affect compatibility of custom code overriding this method (Gus Heck).
New Features
----------------------
* SOLR-13131: Category Routed Aliases are now available for data driven assignment of documents to collections based on

View File

@ -43,7 +43,7 @@ import org.slf4j.LoggerFactory;
* and helping to produce values which may be used for boosting or filtering later.
*/
public class URLClassifyProcessor extends UpdateRequestProcessor {
// String constants whose names end in _PARAM; presumably the configuration keys read in
// initParameters (those reads fall outside the visible hunks — TODO confirm).
private static final String INPUT_FIELD_PARAM = "inputField";
private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
@ -84,16 +84,16 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
"welcome.asp",
"welcome.aspx"
};
/**
 * Builds the processor: hands {@code nextProcessor} to the superclass constructor and then
 * reads this processor's settings from {@code parameters} via {@code initParameters}.
 *
 * @param parameters configuration passed on to {@code initParameters}
 * @param request accepted as part of the signature; not read by this constructor
 * @param response accepted as part of the signature; not read by this constructor
 * @param nextProcessor the processor passed to the superclass constructor
 */
public URLClassifyProcessor(
    SolrParams parameters,
    SolrQueryRequest request,
    SolrQueryResponse response,
    UpdateRequestProcessor nextProcessor) {
  super(nextProcessor);
  initParameters(parameters);
}
private void initParameters(SolrParams parameters) {
if (parameters != null) {
this.setEnabled(parameters.getBool("enabled", true));
@ -106,7 +106,7 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
}
}
@Override
public void processAdd(AddUpdateCommand command) throws IOException {
if (isEnabled()) {
@ -133,24 +133,19 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
}
super.processAdd(command);
}
/**
* Gets a canonical form of the URL for use as main URL
* @param url The input url
* @return The URL object representing the canonical URL
* @throws MalformedURLException if the URL string, after a trailing "/" plus landing page suffix has been replaced by "/", cannot be parsed by the URL constructor
*/
// NOTE(review): this span appears to interleave two versions of the method (a rendered diff
// without +/- markers): an older one that caught MalformedURLException, printed the stack
// trace and fell back to returning the input url, and a newer one that declares and
// propagates the exception — confirm against the actual file before editing.
public URL getCanonicalUrl(URL url) {
public URL getCanonicalUrl(URL url) throws MalformedURLException {
// NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
String urlString = url.toString();
try {
String lps = landingPageSuffix(url);
return new URL(urlString.replaceFirst("/"+lps+"$", "/"));
} catch (MalformedURLException e) {
e.printStackTrace();
}
return url;
String lps = landingPageSuffix(url);
return new URL(urlString.replaceFirst("/" + lps + "$", "/"));
}
/**
* Calculates the length of the URL in characters
* @param url The input URL
@ -159,7 +154,7 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
public int length(URL url) {
  // The length is measured on the URL's full string form, as produced by toString().
  final String urlText = url.toString();
  return urlText.length();
}
/**
* Calculates the number of path levels in the given URL
* @param url The input URL
@ -176,7 +171,7 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
}
return levels;
}
/**
* Calculates whether a URL is a top level page
* @param url The input URL
@ -187,7 +182,7 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
return path.length() == 0 && url.getQuery() == null;
}
/**
* Calculates whether the URL is a landing page or not
* @param url The input URL
@ -200,19 +195,19 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
return landingPageSuffix(url) != "";
}
}
/**
 * Parses the given string as a URI, normalizes it and converts the result to a URL.
 *
 * @param url the textual URL to normalize
 * @return the URL built from the normalized URI
 * @throws MalformedURLException if the normalized URI cannot be converted to a URL
 * @throws URISyntaxException if the string cannot be parsed as a URI
 */
public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
  URI parsed = new URI(url);
  URI normalized = parsed.normalize();
  return normalized.toURL();
}
/**
 * Reports the current value of this processor's enabled flag.
 *
 * @return the value of the {@code enabled} field
 */
public boolean isEnabled() {
  return this.enabled;
}
/**
 * Sets this processor's enabled flag, the value later returned by {@code isEnabled()}.
 *
 * @param enabled the new value of the flag
 */
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
private String landingPageSuffix(URL url) {
String path = url.getPath().toLowerCase(Locale.ROOT);
for(String suffix : landingPageSuffixes) {
@ -222,7 +217,7 @@ public class URLClassifyProcessor extends UpdateRequestProcessor {
}
return "";
}
/**
 * Returns the URL's path, lower-cased with the root locale, with the landing page suffix
 * reported by {@code landingPageSuffix} removed from its end.
 *
 * @param url the input URL
 * @return the lower-cased path without the trailing landing page suffix
 */
private String getPathWithoutSuffix(URL url) {
  String lowerCasedPath = url.getPath().toLowerCase(Locale.ROOT);
  // The suffix is used as a regex anchored to the end of the path.
  String trailingSuffixRegex = landingPageSuffix(url) + "$";
  return lowerCasedPath.replaceFirst(trailingSuffixRegex, "");
}