mirror of https://github.com/apache/lucene.git
Make date parsing more flexible for linedocsfile (europarl, enwiki) (#13075)
This commit is contained in:
parent
9ab84f4be2
commit
635d09001a
|
@ -18,13 +18,10 @@ package org.apache.lucene.backward_index;
|
|||
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
import java.io.IOException;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
import java.util.TimeZone;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
|
@ -164,26 +161,17 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
|
|||
conf.setIndexSort(new Sort(new SortField("dateDV", SortField.Type.LONG, true)));
|
||||
IndexWriter writer = new IndexWriter(directory, conf);
|
||||
LineFileDocs docs = new LineFileDocs(new Random(0));
|
||||
SimpleDateFormat parser = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT);
|
||||
parser.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
ParsePosition position = new ParsePosition(0);
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Document doc = TestUtil.cloneDocument(docs.nextDoc());
|
||||
String dateString = doc.get("date");
|
||||
position.setIndex(0);
|
||||
Date date = parser.parse(dateString, position);
|
||||
if (position.getErrorIndex() != -1) {
|
||||
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
|
||||
}
|
||||
if (position.getIndex() != dateString.length()) {
|
||||
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
|
||||
}
|
||||
LocalDateTime date = LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply(dateString);
|
||||
doc.add(
|
||||
new NumericDocValuesField(
|
||||
"docid_intDV", doc.getField("docid_int").numericValue().longValue()));
|
||||
doc.add(
|
||||
new SortedDocValuesField("titleDV", new BytesRef(doc.getField("title").stringValue())));
|
||||
doc.add(new NumericDocValuesField("dateDV", date.getTime()));
|
||||
doc.add(new NumericDocValuesField("dateDV", date.toInstant(ZoneOffset.UTC).toEpochMilli()));
|
||||
if (i % 10 == 0) { // commit every 10 documents
|
||||
writer.commit();
|
||||
}
|
||||
|
@ -206,9 +194,6 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
|
|||
topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10);
|
||||
assertEquals(50, topDocs.totalHits.value);
|
||||
|
||||
topDocs = searcher.search(new TermQuery(new Term("body", "ja")), 10);
|
||||
assertTrue(topDocs.totalHits.value > 0);
|
||||
|
||||
topDocs =
|
||||
searcher.search(
|
||||
IntPoint.newRangeQuery("docid_int", 42, 44),
|
||||
|
|
|
@ -29,10 +29,16 @@ import java.nio.charset.StandardCharsets;
|
|||
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
|
||||
|
@ -53,6 +59,35 @@ import org.apache.lucene.util.IOUtils;
|
|||
* created by benchmark's WriteLineDoc task
|
||||
*/
|
||||
public class LineFileDocs implements Closeable {
|
||||
/**
|
||||
* Converts date formats for europarl ("2023-02-23") and enwiki ("12-JAN-2010 12:32:45.000") into
|
||||
* {@link LocalDateTime}.
|
||||
*/
|
||||
public static final Function<String, LocalDateTime> DATE_FIELD_VALUE_TO_LOCALDATETIME =
|
||||
new Function<>() {
|
||||
final DateTimeFormatter euroParl =
|
||||
new DateTimeFormatterBuilder()
|
||||
.parseStrict()
|
||||
.parseCaseInsensitive()
|
||||
.appendPattern("uuuu-MM-dd")
|
||||
.toFormatter(Locale.ROOT);
|
||||
|
||||
final DateTimeFormatter enwiki =
|
||||
new DateTimeFormatterBuilder()
|
||||
.parseStrict()
|
||||
.parseCaseInsensitive()
|
||||
.appendPattern("dd-MMM-uuuu HH:mm:ss['.'SSS]")
|
||||
.toFormatter(Locale.ROOT);
|
||||
|
||||
@Override
|
||||
public LocalDateTime apply(String s) {
|
||||
if (s.matches("^[0-9]{4}-[0-9]{2}-[0-9]{2}$")) {
|
||||
return euroParl.parse(s, LocalDate::from).atStartOfDay();
|
||||
} else {
|
||||
return enwiki.parse(s, LocalDateTime::from);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private BufferedReader reader;
|
||||
private static final int BUFFER_SIZE = 1 << 16; // 64K
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.tests.util;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
public class TestLineFileDocs extends LuceneTestCase {
|
||||
/**
|
||||
* Tests that {@link #expectThrows} behaves correctly when the Runnable throws (an instance of a
|
||||
* subclass of) the expected Exception type: by returning that Exception.
|
||||
*/
|
||||
public void testDateFieldNormalization() {
|
||||
// europarl corpus uses this data format.
|
||||
assertEquals(
|
||||
LocalDateTime.of(2023, 2, 23, 0, 0),
|
||||
LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("2023-02-23"));
|
||||
// enwiki uses this data format.
|
||||
assertEquals(
|
||||
LocalDateTime.of(2010, 1, 12, 12, 32, 45),
|
||||
LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-JAN-2010 12:32:45.000"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue