From ec01cc981c0ff221c79014f3665fd21c227d5651 Mon Sep 17 00:00:00 2001 From: Bar Rotstein Date: Wed, 15 Aug 2018 00:04:09 -0400 Subject: [PATCH] SOLR-12591: ParseDateFieldUpdateProcessorFactory: Use "lenient" and strip surrounding quotes. More tests, ported from "extract" contrib stuff. --- solr/CHANGES.txt | 4 +- .../ParseDateFieldUpdateProcessorFactory.java | 20 ++- ...config-parsing-update-processor-chains.xml | 15 +++ .../ParsingFieldUpdateProcessorsTest.java | 117 +++++++++++++++++- .../src/update-request-processors.adoc | 2 +- 5 files changed, 149 insertions(+), 9 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d165e218d52..f515c7ee92b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -55,8 +55,8 @@ Other Changes * SOLR-12614: Make "Nodes" view the default in AdminUI "Cloud" tab (janhoy) -* SOLR-12586: Remove Joda Time dependency. Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode") - to use Java 8's java.time.DateTimeFormatter instead (see upgrade notes). +* SOLR-12586, SOLR-12591: Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode") to use Java 8's + java.time.DateTimeFormatter instead of Joda time (see upgrade notes). "Lenient" is enabled. Removed Joda Time dependency. 
(David Smiley, Bar Rotstein) ================== 7.5.0 ================== diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java index f0ea5d25720..2561fdb904f 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java @@ -26,6 +26,7 @@ import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; +import java.time.format.ResolverStyle; import java.time.temporal.TemporalAccessor; import java.time.temporal.TemporalQueries; import java.util.Collection; @@ -51,7 +52,7 @@ import org.slf4j.LoggerFactory; * Attempts to mutate selected fields that have only CharSequence-typed values * into Date values. Solr will continue to index date/times in the UTC time * zone, but the input date/times may be expressed using other time zones, - * and will be converted to UTC when they are mutated. + * and will be converted to an unambiguous {@link Date} when they are mutated. *

*

* The default selection behavior is to mutate both those fields that don't match @@ -67,6 +68,8 @@ import org.slf4j.LoggerFactory; * One or more date "format" specifiers must be specified. See * Java 8's DateTimeFormatter javadocs for a description of format strings. + * Note that "lenient" parsing and case insensitivity are enabled. + * Furthermore, single quotes surrounding an input value will be removed if found. *

*

* A default time zone name or offset may optionally be specified for those dates @@ -120,6 +123,16 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro protected Object mutateValue(Object srcVal) { if (srcVal instanceof CharSequence) { String srcStringVal = srcVal.toString(); + // trim single quotes around date if present + // see issue #5279 (Apache HttpClient) + int stringValLen = srcStringVal.length(); + if (stringValLen > 1 + && srcStringVal.startsWith("'") + && srcStringVal.endsWith("'") + ) { + srcStringVal = srcStringVal.substring(1, stringValLen - 1); + } + for (Map.Entry format : formats.entrySet()) { DateTimeFormatter parser = format.getValue(); try { @@ -159,8 +172,9 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro Collection formatsParam = args.removeConfigArgs(FORMATS_PARAM); if (null != formatsParam) { for (String value : formatsParam) { - DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseCaseInsensitive() - .appendPattern(value).toFormatter(locale).withZone(defaultTimeZone); + DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseLenient().parseCaseInsensitive() + .appendPattern(value).toFormatter(locale) + .withResolverStyle(ResolverStyle.LENIENT).withZone(defaultTimeZone); validateFormatter(formatter); formats.put(value, formatter); } diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml index 83be4eef789..6914ba2d095 100644 --- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml @@ -109,6 +109,21 @@ + + + UTC + en + + yyyy-MM-dd['T'[HH:mm:ss['.'SSS][z + yyyy-MM-dd HH:mm:ss + EEE MMM d HH:mm:ss [z ]yyyy + EEEE, dd-MMM-yy HH:mm:ss zzz + EEE, dd MMM yyyy HH:mm:ss zzz + + 
+ + + diff --git a/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java b/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java index e26ca4122bf..334b14a628f 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java @@ -16,6 +16,7 @@ */ package org.apache.solr.update.processor; +import java.io.IOException; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -36,7 +37,6 @@ import java.util.Set; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.schema.IndexSchema; import org.junit.BeforeClass; - /** * Tests for the field mutating update processors * that parse Dates, Longs, Doubles, and Booleans. @@ -896,13 +896,124 @@ public class ParsingFieldUpdateProcessorsTest extends UpdateProcessorTestBase { assertTrue(mixedDates.isEmpty()); } - private Date parse(DateTimeFormatter dateTimeFormatter, String dateString) { + // tests that mimic the tests that were in TestExtractionDateUtil + public void testISO8601() throws IOException { + // dates with atypical years + // This test tries to mimic TestExtractionDateUtil#testISO8601 + + String[] dateStrings = { + "0001-01-01T01:01:01Z", "+12021-12-01T03:03:03Z", + "0000-04-04T04:04:04Z", "-0005-05-05T05:05:05Z", + "-2021-12-01T04:04:04Z", "-12021-12-01T02:02:02Z" + }; + + int id = 1; + + // ensure strings are parsed + for(String notInFormatDateString: dateStrings) { + IndexSchema schema = h.getCore().getLatestSchema(); + assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field + SolrInputDocument d = processAdd("parse-date-patterns-from-extract-contrib", doc(f("id", id), f("date_dt", notInFormatDateString))); + assertNotNull(d); + assertTrue("Date string: " + notInFormatDateString + " was not parsed as a date", 
d.getFieldValue("date_dt") instanceof Date); + assertEquals(notInFormatDateString, ((Date) d.getField("date_dt").getFirstValue()).toInstant().toString()); + assertU(commit()); + assertQ(req("id:" + id), "//date[@name='date_dt'][.='" + notInFormatDateString + "']"); + ++id; + } + + // odd values are date strings, even values are expected strings + String[] lenientDateStrings = { + "10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z", + "995-1-2T3:4:5Z", "0995-01-02T03:04:05Z", + "2021-01-01t03:04:05", "2021-01-01T03:04:05Z", + "2021-12-01 04:04:04", "2021-12-01T04:04:04Z" + }; + + // ensure sure strings that should be parsed using lenient resolver are properly parsed + for(int i = 0; i < lenientDateStrings.length; ++i) { + String lenientDateString = lenientDateStrings[i]; + String expectedString = lenientDateStrings[++i]; + IndexSchema schema = h.getCore().getLatestSchema(); + assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field + SolrInputDocument d = processAdd("parse-date-patterns-from-extract-contrib", doc(f("id", id), f("date_dt", lenientDateString))); + assertNotNull(d); + assertTrue("Date string: " + lenientDateString + " was not parsed as a date", + d.getFieldValue("date_dt") instanceof Date); + assertEquals(expectedString, ((Date) d.getField("date_dt").getFirstValue()).toInstant().toString()); + ++id; + } + } + + // this test has had problems when the JDK timezone is Americas/Metlakatla + public void testAKSTZone() throws IOException { + final String inputString = "Thu Nov 13 04:35:51 AKST 2008"; + + final long expectTs = 1226583351000L; + assertEquals(expectTs, + DateTimeFormatter.ofPattern("EEE MMM d HH:mm:ss z yyyy", Locale.ENGLISH) + .withZone(ZoneId.of("UTC")).parse(inputString, Instant::from).toEpochMilli()); + + assertParsedDate(inputString, Date.from(Instant.ofEpochMilli(expectTs)), "parse-date-patterns-from-extract-contrib"); + } + + public void testNoTime() throws IOException { + Instant instant = 
instant(2005, 10, 7, 0, 0, 0); + String inputString = "2005-10-07"; + assertParsedDate(inputString, Date.from(instant), "parse-date-patterns-from-extract-contrib"); + } + + public void testRfc1123() throws IOException { + assertParsedDate("Fri, 07 Oct 2005 13:14:15 GMT", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); + } + + public void testRfc1036() throws IOException { + assertParsedDate("Friday, 07-Oct-05 13:14:15 GMT", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); + } + + public void testAnsiC() throws IOException { + assertParsedDate( + "Fri Oct 7 13:14:15 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); + + assertParsedDate("Fri Oct 7 05:14:15 AKDT 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); // with timezone (not ANSI C) in DST + } + + public void testLenient() throws IOException { + /// the Ansi C format, but input here has longer day of week + assertParsedDate("Friday Oct 7 13:14:15 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); + } + + public void testParseQuotedDate() throws IOException { + // also using 2 digit day + assertParsedDate("'Fri, 14 Oct 2005 13:14:15 GMT'", + Date.from(instant(2005, 10, 14, 13, 14, 15)), "parse-date-patterns-from-extract-contrib"); + } + + private static Instant instant(final int year, final int month, final int day, int hour, int minute, int second) { + return LocalDate.of(year, month, day).atTime(hour, minute, second).toInstant(ZoneOffset.UTC); + } + + private Instant inst20051007131415() { + return instant(2005, 10, 7, 13, 14, 15); + } + + private void assertParsedDate(String inputDateString, Date expectedDate, String chain) throws IOException { + IndexSchema schema = h.getCore().getLatestSchema(); + assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field + SolrInputDocument d = processAdd(chain, doc(f("id", "1"), 
f("date_dt", inputDateString))); + assertNotNull(d); + assertTrue("Date string: " + inputDateString + " was not parsed as a date", + d.getFieldValue("date_dt") instanceof Date); + assertEquals(expectedDate, d.getField("date_dt").getFirstValue()); + } + + private static Date parse(DateTimeFormatter dateTimeFormatter, String dateString) { final TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(dateString, OffsetDateTime::from, ZonedDateTime::from, LocalDateTime::from, LocalDate::from, Instant::from); return temporalToDate(temporalAccessor, dateTimeFormatter.getZone()); } - private Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) { + private static Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) { if (in instanceof OffsetDateTime) { return Date.from(((OffsetDateTime) in).toInstant()); } else if (in instanceof ZonedDateTime) { diff --git a/solr/solr-ref-guide/src/update-request-processors.adoc b/solr/solr-ref-guide/src/update-request-processors.adoc index 21e56cfe8cb..267ffbdabc7 100644 --- a/solr/solr-ref-guide/src/update-request-processors.adoc +++ b/solr/solr-ref-guide/src/update-request-processors.adoc @@ -317,7 +317,7 @@ These factories all provide functionality to _modify_ fields in a document as th {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseBooleanFieldUpdateProcessorFactory.html[ParseBooleanFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Boolean values. -{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Solr date values. +{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Date values. 
{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseNumericFieldUpdateProcessorFactory.html[ParseNumericFieldUpdateProcessorFactory] derived classes::