Make date parsing more flexible for linedocsfile (europarl, enwiki) (#13075)

This commit is contained in:
Dawid Weiss 2024-02-05 19:04:07 +01:00 committed by GitHub
parent 9ab84f4be2
commit 635d09001a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 76 additions and 20 deletions

View File

@ -18,13 +18,10 @@ package org.apache.lucene.backward_index;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Random;
import java.util.TimeZone;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
@ -164,26 +161,17 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
conf.setIndexSort(new Sort(new SortField("dateDV", SortField.Type.LONG, true)));
IndexWriter writer = new IndexWriter(directory, conf);
LineFileDocs docs = new LineFileDocs(new Random(0));
SimpleDateFormat parser = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT);
parser.setTimeZone(TimeZone.getTimeZone("UTC"));
ParsePosition position = new ParsePosition(0);
for (int i = 0; i < 50; i++) {
Document doc = TestUtil.cloneDocument(docs.nextDoc());
String dateString = doc.get("date");
position.setIndex(0);
Date date = parser.parse(dateString, position);
if (position.getErrorIndex() != -1) {
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
}
if (position.getIndex() != dateString.length()) {
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
}
LocalDateTime date = LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply(dateString);
doc.add(
new NumericDocValuesField(
"docid_intDV", doc.getField("docid_int").numericValue().longValue()));
doc.add(
new SortedDocValuesField("titleDV", new BytesRef(doc.getField("title").stringValue())));
doc.add(new NumericDocValuesField("dateDV", date.getTime()));
doc.add(new NumericDocValuesField("dateDV", date.toInstant(ZoneOffset.UTC).toEpochMilli()));
if (i % 10 == 0) { // commit every 10 documents
writer.commit();
}
@ -206,9 +194,6 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10);
assertEquals(50, topDocs.totalHits.value);
topDocs = searcher.search(new TermQuery(new Term("body", "ja")), 10);
assertTrue(topDocs.totalHits.value > 0);
topDocs =
searcher.search(
IntPoint.newRangeQuery("docid_int", 42, 44),

View File

@ -29,10 +29,16 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -53,6 +59,35 @@ import org.apache.lucene.util.IOUtils;
* created by benchmark's WriteLineDoc task
*/
public class LineFileDocs implements Closeable {
/**
 * Converts the date strings found in the line-docs corpora into {@link LocalDateTime}.
 *
 * <p>Two input shapes are recognized:
 *
 * <ul>
 *   <li>europarl: a plain calendar date such as {@code "2023-02-23"}, returned as the start of
 *       that day (midnight);
 *   <li>enwiki: a timestamp such as {@code "12-JAN-2010 12:32:45.000"}; the month abbreviation is
 *       matched case-insensitively and the trailing {@code .SSS} fraction is optional.
 * </ul>
 *
 * <p>Anything that does not look like a europarl date is handed to the enwiki formatter, so input
 * matching neither shape fails with a {@link java.time.format.DateTimeParseException}.
 */
public static final Function<String, LocalDateTime> DATE_FIELD_VALUE_TO_LOCALDATETIME =
    new Function<>() {
      // Compiled once; String#matches would recompile the expression on every call, and this
      // function is invoked once per document.
      private final java.util.regex.Pattern euroParlShape =
          java.util.regex.Pattern.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}$");

      private final DateTimeFormatter euroParl =
          new DateTimeFormatterBuilder()
              .parseStrict()
              .parseCaseInsensitive()
              .appendPattern("uuuu-MM-dd")
              .toFormatter(Locale.ROOT);

      private final DateTimeFormatter enwiki =
          new DateTimeFormatterBuilder()
              .parseStrict()
              .parseCaseInsensitive()
              .appendPattern("dd-MMM-uuuu HH:mm:ss['.'SSS]")
              .toFormatter(Locale.ROOT);

      @Override
      public LocalDateTime apply(String s) {
        // Cheap shape check decides which formatter applies; the formatter itself still
        // validates the field values.
        if (euroParlShape.matcher(s).matches()) {
          return euroParl.parse(s, LocalDate::from).atStartOfDay();
        } else {
          return enwiki.parse(s, LocalDateTime::from);
        }
      }
    };
private BufferedReader reader;
private static final int BUFFER_SIZE = 1 << 16; // 64K

View File

@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.tests.util;
import java.time.LocalDateTime;
/** Tests for the date-field handling exposed by {@link LineFileDocs}. */
public class TestLineFileDocs extends LuceneTestCase {
  /**
   * Checks that {@link LineFileDocs#DATE_FIELD_VALUE_TO_LOCALDATETIME} converts both supported
   * corpus date formats into the expected {@link LocalDateTime}.
   */
  public void testDateFieldNormalization() {
    // The europarl corpus uses this date format; the result is midnight of that day.
    assertEquals(
        LocalDateTime.of(2023, 2, 23, 0, 0),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("2023-02-23"));

    // The enwiki corpus uses this date format.
    assertEquals(
        LocalDateTime.of(2010, 1, 12, 12, 32, 45),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-JAN-2010 12:32:45.000"));

    // The millisecond fraction of the enwiki format is optional.
    assertEquals(
        LocalDateTime.of(2010, 1, 12, 12, 32, 45),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-JAN-2010 12:32:45"));
  }
}