SOLR-8903: Move SolrJ DateUtil to contrib/extraction as ExtractionDateUtil.

And removed obsolete methods.
This commit is contained in:
David Smiley 2016-03-30 15:00:29 -04:00
parent 39932f5758
commit 5e5fd66257
8 changed files with 117 additions and 173 deletions

View File

@ -180,6 +180,9 @@ Upgrading from Solr 5.x
When there is a non-zero number of milliseconds, it is padded with zeros to 3 digits. Negative year (BC) dates are
now possible. Parsing: It is now an error to supply a portion of the date out of its range, like 67 seconds.
* SolrJ no longer includes DateUtil. If for some reason you need to format or parse dates, simply use Instant.toString()
and Instant.parse().
Detailed Change List
----------------------
@ -522,6 +525,9 @@ Other Changes
now parse (and format) dates with a leading '+' or '-' (BC dates or dates > 4 digit year).
[value] and ms() and contrib/analytics now parse with date math. (David Smiley)
* SOLR-8904: DateUtil in SolrJ moved to the extraction contrib as ExtractionDateUtil. Obsolete methods were removed.
(David Smiley)
================== 5.5.1 ==================
Bug Fixes

View File

@ -17,23 +17,6 @@
package org.apache.solr.handler.extraction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.security.AuthorizationContext;
import org.apache.solr.security.PermissionNameProvider;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@ -42,6 +25,22 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.security.AuthorizationContext;
import org.apache.solr.security.PermissionNameProvider;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
@ -59,7 +58,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
protected ParseContextConfig parseContextConfig;
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
protected Collection<String> dateFormats = ExtractionDateUtil.DEFAULT_DATE_FORMATS;
protected SolrContentHandlerFactory factory;

View File

@ -14,11 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.io.IOException;
import java.text.DateFormat;
package org.apache.solr.handler.extraction;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
@ -32,7 +34,7 @@ import java.util.TimeZone;
/**
* This class has some code from HttpClient DateUtil.
*/
public class DateUtil {
public class ExtractionDateUtil {
//start HttpClient
/**
* Date format pattern used to parse HTTP date headers in RFC 1123 format.
@ -67,6 +69,12 @@ public class DateUtil {
//---------------------------------------------------------------------------------------
/**
* Differs from {@link DateTimeFormatter#ISO_INSTANT} in that it's lenient.
*/
public static final DateTimeFormatter ISO_8601_PARSER = new DateTimeFormatterBuilder()
.parseCaseInsensitive().parseLenient().appendInstant().toFormatter(Locale.ROOT);
/**
* A suite of default date formats that can be parsed, and thus transformed to the Solr specific format
*/
@ -95,9 +103,12 @@ public class DateUtil {
}
public static Date parseDate(String d, Collection<String> fmts) throws ParseException {
// 2007-04-26T08:05:04Z
if (d.endsWith("Z") && d.length() > 20) {
return getThreadLocalDateFormat().parse(d);
if (d.length() > 0 && d.charAt(d.length() - 1) == 'Z') {
try {
return new Date(ISO_8601_PARSER.parse(d, Instant::from).toEpochMilli());
} catch (Exception e) {
//ignore; perhaps we can parse with one of the formats below...
}
}
return parseDate(d, fmts, null);
}
@ -140,6 +151,7 @@ public class DateUtil {
dateValue = dateValue.substring(1, dateValue.length() - 1);
}
//TODO upgrade to Java 8 DateTimeFormatter. But how to deal with the GMT as a default?
SimpleDateFormat dateParser = null;
Iterator formatIter = dateFormats.iterator();
@ -163,97 +175,4 @@ public class DateUtil {
throw new ParseException("Unable to parse the date " + dateValue, 0);
}
/**
 * Hands out the formatter belonging to the calling thread, for use when
 * Date objects have to be converted to the internal representation.
 *
 * @return the {@link java.text.DateFormat} held for the current thread
 */
public static DateFormat getThreadLocalDateFormat() {
  final DateFormat perThreadFormat = fmtThreadLocal.get();
  return perThreadFormat;
}
// The "UTC" time zone. Declared before fmtThreadLocal on purpose: static initializers run in
// textual order and the ThreadLocalDateFormat constructor reads this field.
public static TimeZone UTC = TimeZone.getTimeZone("UTC");
// Per-thread holder of DateFormat instances; its get() is what getThreadLocalDateFormat() returns.
private static ThreadLocalDateFormat fmtThreadLocal = new ThreadLocalDateFormat();
/**
 * A {@link ThreadLocal} whose per-thread value is a clone of one prototype
 * {@link DateFormat} built in the constructor.
 */
private static class ThreadLocalDateFormat extends ThreadLocal<DateFormat> {
  /** Prototype that every thread's initial value is cloned from. */
  DateFormat proto;

  public ThreadLocalDateFormat() {
    // Pattern for values such as 2007-04-26T08:05:04Z (with a milliseconds part)
    SimpleDateFormat utcFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", Locale.ROOT);
    utcFormat.setTimeZone(UTC);
    this.proto = utcFormat;
  }

  /** Gives each thread its own copy of the prototype rather than the shared instance. */
  @Override
  protected DateFormat initialValue() {
    final Object copy = proto.clone();
    return (DateFormat) copy;
  }
}
/**
 * Formats {@code date} as {@code yyyy-MM-ddTHH:mm:ss[.fraction]Z} and appends the text to {@code out}.
 * <p>
 * The year is zero-padded to at least four digits (e.g. {@code 0999}), so dates before year 1000
 * are emitted in the same fixed-width form as later dates. The fractional part is only written when
 * the millisecond field is non-zero, and it never ends in {@code '0'}.
 * <p>
 * Note: the fields are read from {@code cal}, so a caller-supplied calendar should be in GMT for the
 * trailing {@code 'Z'} to be accurate. {@link Calendar#ERA} is not consulted.
 *
 * @param date the date to format
 * @param cal  a calendar to reuse, or {@code null} to have a GMT calendar created
 * @param out  the destination; when it is a {@link StringBuilder} the text is written into it directly
 * @return the calendar instance that was used (which may be reused for later calls)
 * @throws IOException if appending to {@code out} fails
 */
public static Calendar formatDate(Date date, Calendar cal, Appendable out) throws IOException {
  // Appending numbers to a StringBuilder avoids a temporary String per field, so write straight
  // into the caller's builder when there is one.
  StringBuilder sb = out instanceof StringBuilder ? (StringBuilder) out : new StringBuilder();
  if (cal == null) {
    cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
  }
  cal.setTime(date);

  appendZeroPadded(sb, cal.get(Calendar.YEAR), 4);
  sb.append('-');
  appendZeroPadded(sb, cal.get(Calendar.MONTH) + 1, 2); // MONTH is 0 based, so add 1
  sb.append('-');
  appendZeroPadded(sb, cal.get(Calendar.DAY_OF_MONTH), 2);
  sb.append('T');
  appendZeroPadded(sb, cal.get(Calendar.HOUR_OF_DAY), 2); // 24 hour time format
  sb.append(':');
  appendZeroPadded(sb, cal.get(Calendar.MINUTE), 2);
  sb.append(':');
  appendZeroPadded(sb, cal.get(Calendar.SECOND), 2);

  int millis = cal.get(Calendar.MILLISECOND);
  if (millis != 0) {
    sb.append('.');
    appendZeroPadded(sb, millis, 3);
    // The canonical format says fractional seconds shall not end in '0'. millis is non-zero,
    // so at most two trailing zeros exist and this stops on a non-zero digit.
    int end = sb.length();
    while (sb.charAt(end - 1) == '0') {
      end--;
    }
    sb.setLength(end);
  }
  sb.append('Z');

  if (out != sb) {
    out.append(sb);
  }
  return cal;
}

/** Appends the non-negative {@code value} to {@code sb}, left-padded with zeros to {@code width} digits. */
private static void appendZeroPadded(StringBuilder sb, int value, int width) {
  int threshold = 1;
  for (int i = 1; i < width; i++) {
    threshold *= 10;
  }
  for (; threshold > 1 && value < threshold; threshold /= 10) {
    sb.append('0');
  }
  sb.append(value);
}
}

View File

@ -17,7 +17,6 @@
package org.apache.solr.handler.extraction;
import java.lang.invoke.MethodHandles;
import java.text.DateFormat;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Collections;
@ -31,7 +30,6 @@ import java.util.Set;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieDateField;
@ -83,7 +81,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
private Set<String> literalFieldNames = null;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
this(metadata, params, schema, ExtractionDateUtil.DEFAULT_DATE_FORMATS);
}
@ -317,7 +315,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
/**
* Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
* <p>
* This implementation only formats dates using the {@link org.apache.solr.common.util.DateUtil}.
* This implementation only formats dates using the {@link ExtractionDateUtil}.
*
* @param val The value to transform
* @param schFld The {@link org.apache.solr.schema.SchemaField}
@ -328,10 +326,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
if (schFld != null && schFld.getType() instanceof TrieDateField) {
//try to transform the date
try {
Date date = DateUtil.parseDate(val, dateFormats);
DateFormat df = DateUtil.getThreadLocalDateFormat();
result = df.format(date);
Date date = ExtractionDateUtil.parseDate(val, dateFormats); // may throw
result = date.toInstant().toString();//ISO format
} catch (Exception e) {
// Let the specific fieldType handle errors
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
/** Exercises the date parsing offered by {@link ExtractionDateUtil}. */
public class TestExtractionDateUtil extends LuceneTestCase {

  /** Parses ISO-8601 instants and checks what {@code Instant.toString()} renders for each. */
  public void testISO8601() throws Exception {
    final String[] renderedUnchanged = {
        // dates with atypical years
        "0001-01-01T01:01:01Z",
        "+12021-12-01T03:03:03Z",
        "0000-04-04T04:04:04Z", // note: 0 AD is also known as 1 BC
        // dates with negative years (BC)
        "-0005-05-05T05:05:05Z",
        "-2021-12-01T04:04:04Z",
        "-12021-12-01T02:02:02Z",
    };
    for (String iso : renderedUnchanged) {
      assertParseFormatEquals(iso, null);
    }

    // dates that only parse thanks to lenient mode of DateTimeFormatter
    assertParseFormatEquals("10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"); // missing '+' 5 digit year
    assertParseFormatEquals("995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"); // wasn't 0 padded
  }

  /**
   * Parses {@code inputStr} and compares the ISO rendering of the result with {@code expectedStr};
   * a {@code null} expectation means the input itself is expected back.
   */
  private static void assertParseFormatEquals(String inputStr, String expectedStr) throws ParseException {
    final String expected = (expectedStr != null) ? expectedStr : inputStr;
    final Date parsed = ExtractionDateUtil.parseDate(inputStr);
    final String rendered = parsed.toInstant().toString();
    assertEquals("d:" + parsed.getTime(), expected, rendered);
  }

  /** Parses a date given with a named time zone. */
  public void testParseDate() throws ParseException {
    assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008");
  }

  /** Asserts that {@code dateStr} parses to within one second of the epoch-millis value {@code ts}. */
  private static void assertParsedDate(long ts, String dateStr) throws ParseException {
    final long parsed = ExtractionDateUtil.parseDate(dateStr).getTime();
    final String message =
        String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr);
    final boolean withinOneSecond = Math.abs(ts - parsed) <= 1000L;
    assertTrue(message, withinOneSecond);
  }
}

View File

@ -29,13 +29,19 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.morphlines.solr.SolrLocator;
@ -50,7 +56,6 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
@ -63,13 +68,6 @@ import org.kitesdk.morphline.stdio.AbstractParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;
/**
* Command that pipes the first attachment of a record into one of the given Tika parsers, then maps
* the Tika output back to a record using SolrCell.
@ -151,7 +149,7 @@ public final class SolrCellBuilder implements CommandBuilder {
cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
}
this.dateFormats = getConfigs().getStringList(config, "dateFormats", new ArrayList<>(DateUtil.DEFAULT_DATE_FORMATS));
this.dateFormats = getConfigs().getStringList(config, "dateFormats", new ArrayList<>(ExtractionDateUtil.DEFAULT_DATE_FORMATS));
String handlerStr = getConfigs().getString(config, "solrContentHandlerFactory", TrimSolrContentHandlerFactory.class.getName());
Class<? extends SolrContentHandlerFactory> factoryClass;

View File

@ -25,7 +25,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.schema.IndexSchema;
@ -270,7 +270,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
// which will cause the ContentHandler to be invoked.
metadata.set(fieldName, getFoobarWithNonChars());
StripNonCharSolrContentHandlerFactory contentHandlerFactory =
new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
new StripNonCharSolrContentHandlerFactory(ExtractionDateUtil.DEFAULT_DATE_FORMATS);
IndexSchema schema = h.getCore().getLatestSchema();
SolrContentHandler contentHandler =
contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);

View File

@ -1,35 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.text.ParseException;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
/** Exercises {@link DateUtil#parseDate(String)}. */
public class TestDateUtil extends LuceneTestCase {

  /** Parses a date given with a named time zone. */
  public void testParseDate() throws ParseException {
    assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008");
  }

  /** Asserts that {@code dateStr} parses to within one second of the epoch-millis value {@code ts}. */
  private static void assertParsedDate(long ts, String dateStr) throws ParseException {
    final long parsed = DateUtil.parseDate(dateStr).getTime();
    final String message =
        String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr);
    final boolean withinOneSecond = Math.abs(ts - parsed) <= 1000L;
    assertTrue(message, withinOneSecond);
  }
}