SOLR-12591: ParseDateFieldUpdateProcessorFactory: Use "lenient" and strip surrounding quotes.

More tests, ported from "extract" contrib stuff.
This commit is contained in:
Bar Rotstein 2018-08-15 00:04:09 -04:00 committed by David Smiley
parent 0d89ff2e61
commit ec01cc981c
5 changed files with 149 additions and 9 deletions

View File

@ -55,8 +55,8 @@ Other Changes
* SOLR-12614: Make "Nodes" view the default in AdminUI "Cloud" tab (janhoy) * SOLR-12614: Make "Nodes" view the default in AdminUI "Cloud" tab (janhoy)
* SOLR-12586: Remove Joda Time dependency. Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode") * SOLR-12586, SOLR-12591: Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode") to use Java 8's
to use Java 8's java.time.DateTimeFormatter instead (see upgrade notes). java.time.DateTimeFormatter instead of Joda time (see upgrade notes). "Lenient" is enabled. Removed Joda Time dependency.
(David Smiley, Bar Rotstein) (David Smiley, Bar Rotstein)
================== 7.5.0 ================== ================== 7.5.0 ==================

View File

@ -26,6 +26,7 @@ import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException; import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.time.temporal.TemporalAccessor; import java.time.temporal.TemporalAccessor;
import java.time.temporal.TemporalQueries; import java.time.temporal.TemporalQueries;
import java.util.Collection; import java.util.Collection;
@ -51,7 +52,7 @@ import org.slf4j.LoggerFactory;
* Attempts to mutate selected fields that have only CharSequence-typed values * Attempts to mutate selected fields that have only CharSequence-typed values
* into Date values. Solr will continue to index date/times in the UTC time * into Date values. Solr will continue to index date/times in the UTC time
* zone, but the input date/times may be expressed using other time zones, * zone, but the input date/times may be expressed using other time zones,
* and will be converted to UTC when they are mutated. * and will be converted to an unambiguous {@link Date} when they are mutated.
* </p> * </p>
* <p> * <p>
* The default selection behavior is to mutate both those fields that don't match * The default selection behavior is to mutate both those fields that don't match
@ -67,6 +68,8 @@ import org.slf4j.LoggerFactory;
* One or more date "format" specifiers must be specified. See * One or more date "format" specifiers must be specified. See
* <a href="https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html" * <a href="https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html"
* >Java 8's DateTimeFormatter javadocs</a> for a description of format strings. * >Java 8's DateTimeFormatter javadocs</a> for a description of format strings.
* Note that "lenient" parsing and case insensitivity are enabled.
* Furthermore, single quotes surrounding an input value are stripped before parsing.
* </p> * </p>
* <p> * <p>
* A default time zone name or offset may optionally be specified for those dates * A default time zone name or offset may optionally be specified for those dates
@ -120,6 +123,16 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro
protected Object mutateValue(Object srcVal) { protected Object mutateValue(Object srcVal) {
if (srcVal instanceof CharSequence) { if (srcVal instanceof CharSequence) {
String srcStringVal = srcVal.toString(); String srcStringVal = srcVal.toString();
// trim single quotes around date if present
// see issue #5279 (Apache HttpClient)
int stringValLen = srcStringVal.length();
if (stringValLen > 1
&& srcStringVal.startsWith("'")
&& srcStringVal.endsWith("'")
) {
srcStringVal = srcStringVal.substring(1, stringValLen - 1);
}
for (Map.Entry<String,DateTimeFormatter> format : formats.entrySet()) { for (Map.Entry<String,DateTimeFormatter> format : formats.entrySet()) {
DateTimeFormatter parser = format.getValue(); DateTimeFormatter parser = format.getValue();
try { try {
@ -159,8 +172,9 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro
Collection<String> formatsParam = args.removeConfigArgs(FORMATS_PARAM); Collection<String> formatsParam = args.removeConfigArgs(FORMATS_PARAM);
if (null != formatsParam) { if (null != formatsParam) {
for (String value : formatsParam) { for (String value : formatsParam) {
DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseCaseInsensitive() DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseLenient().parseCaseInsensitive()
.appendPattern(value).toFormatter(locale).withZone(defaultTimeZone); .appendPattern(value).toFormatter(locale)
.withResolverStyle(ResolverStyle.LENIENT).withZone(defaultTimeZone);
validateFormatter(formatter); validateFormatter(formatter);
formats.put(value, formatter); formats.put(value, formatter);
} }

View File

@ -109,6 +109,21 @@
</processor> </processor>
</updateRequestProcessorChain> </updateRequestProcessorChain>
<!-- Date parsing chain exercised by the tests ported from the "extract" contrib.
     Inputs are interpreted with the "en" locale and a default time zone of UTC. -->
<updateRequestProcessorChain name="parse-date-patterns-from-extract-contrib">
<processor class="solr.ParseDateFieldUpdateProcessorFactory">
<str name="defaultTimeZone">UTC</str>
<str name="locale">en</str>
<arr name="format">
<!-- ISO-8601 style date with optional time, optional millis and optional zone.
     NOTE(review): the optional sections opened with '[' are never closed with ']';
     presumably DateTimeFormatterBuilder closes them implicitly at the end of the
     pattern. Confirm against the appendPattern/optionalStart javadocs. -->
<str>yyyy-MM-dd['T'[HH:mm:ss['.'SSS][z</str>
<!-- Date and time separated by a space, no zone -->
<str>yyyy-MM-dd HH:mm:ss</str>
<!-- ANSI C asctime() style, with an optional zone before the year -->
<str>EEE MMM d HH:mm:ss [z ]yyyy</str>
<!-- RFC 1036 style, e.g. "Friday, 07-Oct-05 13:14:15 GMT" -->
<str>EEEE, dd-MMM-yy HH:mm:ss zzz</str>
<!-- RFC 1123 style, e.g. "Fri, 07 Oct 2005 13:14:15 GMT" -->
<str>EEE, dd MMM yyyy HH:mm:ss zzz</str>
</arr>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<updateRequestProcessorChain name="parse-int"> <updateRequestProcessorChain name="parse-int">
<processor class="solr.ParseIntFieldUpdateProcessorFactory"/> <processor class="solr.ParseIntFieldUpdateProcessorFactory"/>
<processor class="solr.RunUpdateProcessorFactory"/> <processor class="solr.RunUpdateProcessorFactory"/>

View File

@ -16,6 +16,7 @@
*/ */
package org.apache.solr.update.processor; package org.apache.solr.update.processor;
import java.io.IOException;
import java.time.Instant; import java.time.Instant;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.LocalDateTime; import java.time.LocalDateTime;
@ -36,7 +37,6 @@ import java.util.Set;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.IndexSchema;
import org.junit.BeforeClass; import org.junit.BeforeClass;
/** /**
* Tests for the field mutating update processors * Tests for the field mutating update processors
* that parse Dates, Longs, Doubles, and Booleans. * that parse Dates, Longs, Doubles, and Booleans.
@ -896,13 +896,124 @@ public class ParsingFieldUpdateProcessorsTest extends UpdateProcessorTestBase {
assertTrue(mixedDates.isEmpty()); assertTrue(mixedDates.isEmpty());
} }
private Date parse(DateTimeFormatter dateTimeFormatter, String dateString) { // tests that mimic the tests that were in TestExtractionDateUtil
/**
 * Mimics the former TestExtractionDateUtil#testISO8601: ISO-8601 inputs with atypical
 * years must be parsed into {@link Date} values, and inputs that rely on lenient or
 * case-insensitive parsing must normalize to the expected instant string.
 */
public void testISO8601() throws IOException {
  final String chain = "parse-date-patterns-from-extract-contrib";

  // Dates with atypical years; each one is expected to come back unchanged
  // when the parsed Date is rendered via Instant#toString.
  final String[] roundTripInputs = {
      "0001-01-01T01:01:01Z", "+12021-12-01T03:03:03Z",
      "0000-04-04T04:04:04Z", "-0005-05-05T05:05:05Z",
      "-2021-12-01T04:04:04Z", "-12021-12-01T02:02:02Z"
  };

  int id = 1;
  for (final String input : roundTripInputs) {
    // "date_dt" should match the "*_dt" dynamic field
    assertNotNull(h.getCore().getLatestSchema().getFieldOrNull("date_dt"));

    final SolrInputDocument processed = processAdd(chain, doc(f("id", id), f("date_dt", input)));
    assertNotNull(processed);
    assertTrue("Date string: " + input + " was not parsed as a date",
        processed.getFieldValue("date_dt") instanceof Date);
    final Date parsed = (Date) processed.getField("date_dt").getFirstValue();
    assertEquals(input, parsed.toInstant().toString());

    // the committed document must expose the same textual value
    assertU(commit());
    assertQ(req("id:" + id), "//date[@name='date_dt'][.='" + input + "']");
    id++;
  }

  // Each row is { input, expected Instant#toString of the parsed value }.
  final String[][] lenientCases = {
      {"10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"},
      {"995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"},
      {"2021-01-01t03:04:05", "2021-01-01T03:04:05Z"},
      {"2021-12-01 04:04:04", "2021-12-01T04:04:04Z"}
  };

  // ensure strings that need the lenient resolver are properly parsed
  for (final String[] lenientCase : lenientCases) {
    final String input = lenientCase[0];
    final String expected = lenientCase[1];

    // "date_dt" should match the "*_dt" dynamic field
    assertNotNull(h.getCore().getLatestSchema().getFieldOrNull("date_dt"));

    final SolrInputDocument processed = processAdd(chain, doc(f("id", id), f("date_dt", input)));
    assertNotNull(processed);
    assertTrue("Date string: " + input + " was not parsed as a date",
        processed.getFieldValue("date_dt") instanceof Date);
    final Date parsed = (Date) processed.getField("date_dt").getFirstValue();
    assertEquals(expected, parsed.toInstant().toString());
    id++;
  }
}
/**
 * Parses a date carrying the "AKST" zone abbreviation.
 * This test has had problems when the JDK timezone is America/Metlakatla.
 */
public void testAKSTZone() throws IOException {
  final String akstInput = "Thu Nov 13 04:35:51 AKST 2008";
  final long expectedEpochMillis = 1226583351000L;

  // first check that a plain JDK formatter agrees on the expected timestamp
  final DateTimeFormatter jdkFormatter =
      DateTimeFormatter.ofPattern("EEE MMM d HH:mm:ss z yyyy", Locale.ENGLISH)
          .withZone(ZoneId.of("UTC"));
  final Instant jdkParsed = jdkFormatter.parse(akstInput, Instant::from);
  assertEquals(expectedEpochMillis, jdkParsed.toEpochMilli());

  // then check that the update processor chain yields the same instant
  final Date expectedDate = Date.from(Instant.ofEpochMilli(expectedEpochMillis));
  assertParsedDate(akstInput, expectedDate, "parse-date-patterns-from-extract-contrib");
}
/** A date-only input is expected to parse to midnight UTC of that day. */
public void testNoTime() throws IOException {
  final Date midnightUtc = Date.from(instant(2005, 10, 7, 0, 0, 0));
  assertParsedDate("2005-10-07", midnightUtc, "parse-date-patterns-from-extract-contrib");
}
/** Parses an RFC 1123 style date string. */
public void testRfc1123() throws IOException {
  final Date expected = Date.from(inst20051007131415());
  assertParsedDate("Fri, 07 Oct 2005 13:14:15 GMT", expected, "parse-date-patterns-from-extract-contrib");
}
/** Parses an RFC 1036 style date string (full day name, two-digit year). */
public void testRfc1036() throws IOException {
  final Date expected = Date.from(inst20051007131415());
  assertParsedDate("Friday, 07-Oct-05 13:14:15 GMT", expected, "parse-date-patterns-from-extract-contrib");
}
/** Parses ANSI C style date strings, without and with a zone abbreviation. */
public void testAnsiC() throws IOException {
  final String chain = "parse-date-patterns-from-extract-contrib";
  final Date expected = Date.from(inst20051007131415());

  // plain ANSI C form, no zone
  assertParsedDate("Fri Oct 7 13:14:15 2005", expected, chain);

  // with timezone (not ANSI C) in DST
  assertParsedDate("Fri Oct 7 05:14:15 AKDT 2005", expected, chain);
}
/** The ANSI C format again, but the input spells the day of week out in full. */
public void testLenient() throws IOException {
  final Date expected = Date.from(inst20051007131415());
  assertParsedDate("Friday Oct 7 13:14:15 2005", expected, "parse-date-patterns-from-extract-contrib");
}
/** A date wrapped in single quotes must still be parsed; the input also uses a 2 digit day. */
public void testParseQuotedDate() throws IOException {
  final Date expected = Date.from(instant(2005, 10, 14, 13, 14, 15));
  assertParsedDate("'Fri, 14 Oct 2005 13:14:15 GMT'", expected, "parse-date-patterns-from-extract-contrib");
}
/**
 * Builds the {@link Instant} for the given calendar fields, interpreted in UTC.
 *
 * @param year   the year
 * @param month  the month of the year, 1 to 12
 * @param day    the day of the month
 * @param hour   the hour of the day
 * @param minute the minute of the hour
 * @param second the second of the minute
 * @return the instant those fields denote at offset UTC
 */
private static Instant instant(final int year, final int month, final int day, int hour, int minute, int second) {
  final LocalDateTime utcDateTime = LocalDateTime.of(year, month, day, hour, minute, second);
  return utcDateTime.toInstant(ZoneOffset.UTC);
}
/** Shared fixture: 2005-10-07 13:14:15 UTC. */
private Instant inst20051007131415() {
  final Instant fixture = instant(2005, 10, 7, 13, 14, 15);
  return fixture;
}
/**
 * Runs a single document through the given update processor chain and asserts that its
 * "date_dt" field was converted to the expected {@link Date}.
 *
 * @param inputDateString the raw string placed in the "date_dt" field
 * @param expectedDate    the value the field must hold after processing
 * @param chain           name of the update request processor chain to run
 */
private void assertParsedDate(String inputDateString, Date expectedDate, String chain) throws IOException {
  // "date_dt" should match the "*_dt" dynamic field
  final IndexSchema latestSchema = h.getCore().getLatestSchema();
  assertNotNull(latestSchema.getFieldOrNull("date_dt"));

  final SolrInputDocument processed = processAdd(chain, doc(f("id", "1"), f("date_dt", inputDateString)));
  assertNotNull(processed);

  final String notParsedMessage = "Date string: " + inputDateString + " was not parsed as a date";
  assertTrue(notParsedMessage, processed.getFieldValue("date_dt") instanceof Date);
  assertEquals(expectedDate, processed.getField("date_dt").getFirstValue());
}
private static Date parse(DateTimeFormatter dateTimeFormatter, String dateString) {
final TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(dateString, OffsetDateTime::from, final TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(dateString, OffsetDateTime::from,
ZonedDateTime::from, LocalDateTime::from, LocalDate::from, Instant::from); ZonedDateTime::from, LocalDateTime::from, LocalDate::from, Instant::from);
return temporalToDate(temporalAccessor, dateTimeFormatter.getZone()); return temporalToDate(temporalAccessor, dateTimeFormatter.getZone());
} }
private Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) { private static Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) {
if (in instanceof OffsetDateTime) { if (in instanceof OffsetDateTime) {
return Date.from(((OffsetDateTime) in).toInstant()); return Date.from(((OffsetDateTime) in).toInstant());
} else if (in instanceof ZonedDateTime) { } else if (in instanceof ZonedDateTime) {

View File

@ -317,7 +317,7 @@ These factories all provide functionality to _modify_ fields in a document as th
{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseBooleanFieldUpdateProcessorFactory.html[ParseBooleanFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Boolean values. {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseBooleanFieldUpdateProcessorFactory.html[ParseBooleanFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Boolean values.
{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Solr date values. {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Date values.
{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseNumericFieldUpdateProcessorFactory.html[ParseNumericFieldUpdateProcessorFactory] derived classes:: {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseNumericFieldUpdateProcessorFactory.html[ParseNumericFieldUpdateProcessorFactory] derived classes::