mirror of https://github.com/apache/lucene.git
SOLR-12593: remove date parsing from extract contrib
* added "ignored_*" to the default configSet * Updated Ref Guide info on Solr Cell to demonstrate usage without using the techproducts configSet Closes #438
This commit is contained in:
parent
918b11c7ce
commit
964cc88cee
|
@ -52,6 +52,9 @@ Upgrade Notes
|
|||
expanded from before to subsume those patterns previously handled by the "extract" contrib (Solr Cell / Tika).
|
||||
(David Smiley, Bar Rotstein)
|
||||
|
||||
* SOLR-12593: The "extraction" contrib (Solr Cell) no longer does any date parsing, and thus no longer has the
|
||||
"date.formats" configuration. To ensure date strings are properly parsed, use ParseDateFieldUpdateProcessorFactory
|
||||
(a URP) commonly registered with the name "parse-date" in "schemaless mode". (David Smiley, Bar Rotstein)
|
||||
|
||||
New Features
|
||||
----------------------
|
||||
|
@ -62,6 +65,8 @@ SOLR-12591: Expand the set of recognized date format patterns of schemaless mode
|
|||
locale was changed from ROOT to en_US since well-known patterns assume this locale.
|
||||
(David Smiley, Bar Rotstein)
|
||||
|
||||
SOLR-12593: The default configSet now includes an "ignored_*" dynamic field. (David Smiley)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -16,15 +16,9 @@
|
|||
*/
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
|
@ -39,31 +33,22 @@ import org.apache.solr.update.processor.UpdateRequestProcessor;
|
|||
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
||||
/**
|
||||
* Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
|
||||
* first from the document.
|
||||
*/
|
||||
public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware , PermissionNameProvider {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
|
||||
public static final String CONFIG_LOCATION = "tika.config";
|
||||
public static final String DATE_FORMATS = "date.formats";
|
||||
|
||||
protected TikaConfig config;
|
||||
protected ParseContextConfig parseContextConfig;
|
||||
|
||||
|
||||
protected Collection<String> dateFormats = ExtractionDateUtil.DEFAULT_DATE_FORMATS;
|
||||
protected SolrContentHandlerFactory factory;
|
||||
|
||||
|
||||
@Override
|
||||
public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) {
|
||||
return PermissionNameProvider.Name.READ_PERM;
|
||||
|
@ -99,17 +84,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
}
|
||||
|
||||
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
|
||||
if (configDateFormats != null && configDateFormats.size() > 0) {
|
||||
dateFormats = new HashSet<>();
|
||||
Iterator<Map.Entry> it = configDateFormats.iterator();
|
||||
while (it.hasNext()) {
|
||||
String format = (String) it.next().getValue();
|
||||
log.info("Adding Date Format: " + format);
|
||||
dateFormats.add(format);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (config == null) {
|
||||
try (InputStream is = core.getResourceLoader().getClassLoader().getResourceAsStream("solr-default-tika-config.xml")){
|
||||
|
@ -125,10 +99,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
}
|
||||
|
||||
protected SolrContentHandlerFactory createFactory() {
|
||||
return new SolrContentHandlerFactory(dateFormats);
|
||||
return new SolrContentHandlerFactory();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
|
||||
return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
|
||||
|
@ -140,5 +113,3 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
return "Add/Update Rich document";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,178 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.Instant;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeFormatterBuilder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Calendar;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
import java.util.TimeZone;
|
||||
|
||||
|
||||
/**
|
||||
* This class has some code from HttpClient DateUtil.
|
||||
*/
|
||||
public class ExtractionDateUtil {
|
||||
//start HttpClient
|
||||
/**
|
||||
* Date format pattern used to parse HTTP date headers in RFC 1123 format.
|
||||
*/
|
||||
public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz";
|
||||
|
||||
/**
|
||||
* Date format pattern used to parse HTTP date headers in RFC 1036 format.
|
||||
*/
|
||||
public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";
|
||||
|
||||
/**
|
||||
* Date format pattern used to parse HTTP date headers in ANSI C
|
||||
* <code>asctime()</code> format.
|
||||
*/
|
||||
public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy";
|
||||
//These are included for back compat
|
||||
private static final Collection<String> DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList(
|
||||
PATTERN_ASCTIME, PATTERN_RFC1036, PATTERN_RFC1123);
|
||||
|
||||
private static final Date DEFAULT_TWO_DIGIT_YEAR_START;
|
||||
|
||||
static {
|
||||
Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
|
||||
calendar.set(2000, Calendar.JANUARY, 1, 0, 0);
|
||||
DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
|
||||
}
|
||||
|
||||
private static final TimeZone GMT = TimeZone.getTimeZone("GMT");
|
||||
|
||||
//end HttpClient
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Differs from {@link DateTimeFormatter#ISO_INSTANT} in that it's lenient.
|
||||
*/
|
||||
public static final DateTimeFormatter ISO_8601_PARSER = new DateTimeFormatterBuilder()
|
||||
.parseCaseInsensitive().parseLenient().appendInstant().toFormatter(Locale.ROOT);
|
||||
|
||||
/**
|
||||
* A suite of default date formats that can be parsed, and thus transformed to the Solr specific format
|
||||
*/
|
||||
public static final Collection<String> DEFAULT_DATE_FORMATS = new ArrayList<>();
|
||||
|
||||
static {
|
||||
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
|
||||
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss");
|
||||
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd");
|
||||
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss");
|
||||
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss");
|
||||
DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy");
|
||||
DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the given date string, first as a lenient ISO-8601 instant when it ends in 'Z' and
|
||||
* otherwise (or on failure) by trying each of the {@link #DEFAULT_DATE_FORMATS} in turn.
|
||||
*
|
||||
* @param d The input date to parse
|
||||
* @return The parsed {@link java.util.Date}
|
||||
* @throws java.text.ParseException If the input can't be parsed
|
||||
*/
|
||||
public static Date parseDate(String d) throws ParseException {
|
||||
return parseDate(d, DEFAULT_DATE_FORMATS);
|
||||
}
|
||||
|
||||
public static Date parseDate(String d, Collection<String> fmts) throws ParseException {
|
||||
if (d.length() > 0 && d.charAt(d.length() - 1) == 'Z') {
|
||||
try {
|
||||
return new Date(ISO_8601_PARSER.parse(d, Instant::from).toEpochMilli());
|
||||
} catch (Exception e) {
|
||||
//ignore; perhaps we can parse with one of the formats below...
|
||||
}
|
||||
}
|
||||
return parseDate(d, fmts, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate
|
||||
* <p>
|
||||
* Parses the date value using the given date formats.
|
||||
*
|
||||
* @param dateValue the date value to parse
|
||||
* @param dateFormats the date formats to use
|
||||
* @param startDate During parsing, two digit years will be placed in the range
|
||||
* <code>startDate</code> to <code>startDate + 100 years</code>. This value may
|
||||
* be <code>null</code>. When <code>null</code> is given as a parameter, year
|
||||
* <code>2000</code> will be used.
|
||||
* @return the parsed date
|
||||
* @throws ParseException if none of the dateFormats could parse the dateValue
|
||||
*/
|
||||
public static Date parseDate(
|
||||
String dateValue,
|
||||
Collection<String> dateFormats,
|
||||
Date startDate
|
||||
) throws ParseException {
|
||||
|
||||
if (dateValue == null) {
|
||||
throw new IllegalArgumentException("dateValue is null");
|
||||
}
|
||||
if (dateFormats == null) {
|
||||
dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS;
|
||||
}
|
||||
if (startDate == null) {
|
||||
startDate = DEFAULT_TWO_DIGIT_YEAR_START;
|
||||
}
|
||||
// trim single quotes around date if present
|
||||
// see issue #5279
|
||||
if (dateValue.length() > 1
|
||||
&& dateValue.startsWith("'")
|
||||
&& dateValue.endsWith("'")
|
||||
) {
|
||||
dateValue = dateValue.substring(1, dateValue.length() - 1);
|
||||
}
|
||||
|
||||
//TODO upgrade to Java 8 DateTimeFormatter. But how to deal with the GMT as a default?
|
||||
SimpleDateFormat dateParser = null;
|
||||
Iterator formatIter = dateFormats.iterator();
|
||||
|
||||
while (formatIter.hasNext()) {
|
||||
String format = (String) formatIter.next();
|
||||
if (dateParser == null) {
|
||||
dateParser = new SimpleDateFormat(format, Locale.ENGLISH);
|
||||
dateParser.setTimeZone(GMT);
|
||||
dateParser.set2DigitYearStart(startDate);
|
||||
} else {
|
||||
dateParser.applyPattern(format);
|
||||
}
|
||||
try {
|
||||
return dateParser.parse(dateValue);
|
||||
} catch (ParseException pe) {
|
||||
// ignore this exception, we will try the next format
|
||||
}
|
||||
}
|
||||
|
||||
// we were unable to parse the date
|
||||
throw new ParseException("Unable to parse the date " + dateValue, 0);
|
||||
}
|
||||
|
||||
}
|
|
@ -18,9 +18,7 @@ package org.apache.solr.handler.extraction;
|
|||
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -32,7 +30,6 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.NumberType;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaMetadataKeys;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -61,8 +58,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
|
||||
protected final SolrInputDocument document;
|
||||
|
||||
protected final Collection<String> dateFormats;
|
||||
|
||||
protected final Metadata metadata;
|
||||
protected final SolrParams params;
|
||||
protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
|
||||
|
@ -80,18 +75,12 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
|
||||
private Set<String> literalFieldNames = null;
|
||||
|
||||
|
||||
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
||||
this(metadata, params, schema, ExtractionDateUtil.DEFAULT_DATE_FORMATS);
|
||||
}
|
||||
|
||||
|
||||
public SolrContentHandler(Metadata metadata, SolrParams params,
|
||||
IndexSchema schema, Collection<String> dateFormats) {
|
||||
this.document = new SolrInputDocument();
|
||||
this.metadata = metadata;
|
||||
this.params = params;
|
||||
this.schema = schema;
|
||||
this.dateFormats = dateFormats;
|
||||
|
||||
this.lowerNames = params.getBool(LOWERNAMES, false);
|
||||
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
|
||||
|
@ -253,12 +242,12 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
}
|
||||
|
||||
if (fval != null) {
|
||||
document.addField(name, transformValue(fval, sf));
|
||||
document.addField(name, fval);
|
||||
}
|
||||
|
||||
if (vals != null) {
|
||||
for (String val : vals) {
|
||||
document.addField(name, transformValue(val, sf));
|
||||
document.addField(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -310,30 +299,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
characters(chars, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
|
||||
* <p>
|
||||
* This implementation only formats dates using the {@link ExtractionDateUtil}.
|
||||
*
|
||||
* @param val The value to transform
|
||||
* @param schFld The {@link org.apache.solr.schema.SchemaField}
|
||||
* @return The potentially new value.
|
||||
*/
|
||||
protected String transformValue(String val, SchemaField schFld) {
|
||||
String result = val;
|
||||
if (schFld != null && NumberType.DATE.equals(schFld.getType().getNumberType())) {
|
||||
//try to transform the date
|
||||
try {
|
||||
Date date = ExtractionDateUtil.parseDate(val, dateFormats); // may throw
|
||||
result = date.toInstant().toString();//ISO format
|
||||
} catch (Exception e) {
|
||||
// Let the specific fieldType handle errors
|
||||
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name mapping
|
||||
*
|
||||
|
|
|
@ -20,22 +20,16 @@ import org.apache.tika.metadata.Metadata;
|
|||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class SolrContentHandlerFactory {
|
||||
protected Collection<String> dateFormats;
|
||||
|
||||
public SolrContentHandlerFactory(Collection<String> dateFormats) {
|
||||
this.dateFormats = dateFormats;
|
||||
}
|
||||
public SolrContentHandlerFactory() { }
|
||||
|
||||
public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
||||
return new SolrContentHandler(metadata, params, schema,
|
||||
dateFormats);
|
||||
return new SolrContentHandler(metadata, params, schema);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,62 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestExtractionDateUtil extends LuceneTestCase {
|
||||
|
||||
public void testISO8601() throws Exception {
|
||||
// dates with atypical years
|
||||
assertParseFormatEquals("0001-01-01T01:01:01Z", null);
|
||||
assertParseFormatEquals("+12021-12-01T03:03:03Z", null);
|
||||
|
||||
assertParseFormatEquals("0000-04-04T04:04:04Z", null); // note: 0 AD is also known as 1 BC
|
||||
|
||||
// dates with negative years (BC)
|
||||
assertParseFormatEquals("-0005-05-05T05:05:05Z", null);
|
||||
assertParseFormatEquals("-2021-12-01T04:04:04Z", null);
|
||||
assertParseFormatEquals("-12021-12-01T02:02:02Z", null);
|
||||
|
||||
// dates that only parse thanks to lenient mode of DateTimeFormatter
|
||||
assertParseFormatEquals("10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"); // missing '+' 5 digit year
|
||||
assertParseFormatEquals("995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"); // wasn't 0 padded
|
||||
}
|
||||
|
||||
private static void assertParseFormatEquals(String inputStr, String expectedStr) throws ParseException {
|
||||
if (expectedStr == null) {
|
||||
expectedStr = inputStr;
|
||||
}
|
||||
Date inputDate = ExtractionDateUtil.parseDate(inputStr);
|
||||
String resultStr = inputDate.toInstant().toString();
|
||||
assertEquals("d:" + inputDate.getTime(), expectedStr, resultStr);
|
||||
}
|
||||
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12593")
|
||||
public void testParseDate() throws ParseException {
|
||||
assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008");
|
||||
}
|
||||
|
||||
private static void assertParsedDate(long ts, String dateStr) throws ParseException {
|
||||
long parsed = ExtractionDateUtil.parseDate(dateStr).getTime();
|
||||
assertTrue(String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr), Math.abs(ts - parsed) <= 1000L);
|
||||
}
|
||||
}
|
|
@ -140,6 +140,7 @@
|
|||
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
|
||||
<dynamicField name="random_*" type="random"/>
|
||||
<dynamicField name="ignored_*" type="ignored"/>
|
||||
|
||||
<!-- Type used for data-driven schema, to add a string copy for each text field -->
|
||||
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
|
||||
|
@ -214,6 +215,9 @@
|
|||
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright. -->
|
||||
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
|
||||
|
||||
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
|
||||
is a more restricted form of the canonical representation of dateTime
|
||||
|
|
|
@ -797,7 +797,6 @@
|
|||
class="solr.extraction.ExtractingRequestHandler" >
|
||||
<lst name="defaults">
|
||||
<str name="lowernames">true</str>
|
||||
<str name="fmap.meta">ignored_</str>
|
||||
<str name="fmap.content">_text_</str>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
|
|
|
@ -140,6 +140,7 @@
|
|||
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
|
||||
<dynamicField name="random_*" type="random"/>
|
||||
<dynamicField name="ignored_*" type="ignored"/>
|
||||
|
||||
<!-- Type used for data-driven schema, to add a string copy for each text field -->
|
||||
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
|
||||
|
@ -214,6 +215,9 @@
|
|||
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright. -->
|
||||
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
|
||||
|
||||
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
|
||||
is a more restricted form of the canonical representation of dateTime
|
||||
|
|
|
@ -797,7 +797,6 @@
|
|||
class="solr.extraction.ExtractingRequestHandler" >
|
||||
<lst name="defaults">
|
||||
<str name="lowernames">true</str>
|
||||
<str name="fmap.meta">ignored_</str>
|
||||
<str name="fmap.content">_text_</str>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
|
|
|
@ -26,41 +26,50 @@ If you want to supply your own `ContentHandler` for Solr to use, you can extend
|
|||
|
||||
When using the Solr Cell framework, it is helpful to keep the following in mind:
|
||||
|
||||
* Tika will automatically attempt to determine the input document type (Word, PDF, HTML) and extract the content appropriately. If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
|
||||
* Tika works by producing an XHTML stream that it feeds to a SAX ContentHandler. SAX is a common interface implemented for many different XML parsers. For more information, see http://www.saxproject.org/quickstart.html.
|
||||
* Solr then responds to Tika's SAX events and creates the fields to index.
|
||||
* Tika produces metadata such as Title, Subject, and Author according to specifications such as the DublinCore. See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
|
||||
* Tika adds all the extracted text to the `content` field.
|
||||
* You can map Tika's metadata fields to Solr fields.
|
||||
* You can pass in literals for field values. Literals will override Tika-parsed values, including fields in the Tika metadata object, the Tika content field, and any "captured content" fields.
|
||||
* You can apply an XPath expression to the Tika XHTML to restrict the content that is produced.
|
||||
* Tika will automatically attempt to determine the input document type (e.g. Word, PDF, HTML) and extract the content appropriately.
|
||||
If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
|
||||
See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
|
||||
* Briefly, Tika internally works by synthesizing an XHTML document from the core content of the parsed document which is passed to a configured http://www.saxproject.org/quickstart.html[SAX] ContentHandler provided by Solr Cell.
|
||||
Solr responds to Tika's SAX events to create one or more text fields from the content.
|
||||
Tika exposes document metadata as well (apart from the XHTML).
|
||||
* Tika produces metadata such as Title, Subject, and Author according to specifications such as the DublinCore.
|
||||
The metadata available is highly dependent on the file types and what they in turn contain.
|
||||
Solr Cell supplies some metadata of its own too.
|
||||
* Solr Cell concatenates text from the internal XHTML into a `content` field.
|
||||
You can configure which elements should be included/ignored, and which should map to another field.
|
||||
* Solr Cell maps each piece of metadata onto a field.
|
||||
By default it maps to the same name but several parameters control how this is done.
|
||||
* When Solr Cell finishes creating the internal `SolrInputDocument`, the rest of the Lucene/Solr indexing stack takes over.
|
||||
The next step after any update handler is the <<update-request-processors.adoc#update-request-processors,Update Request Processor>> chain.
|
||||
|
||||
[TIP]
|
||||
[NOTE]
|
||||
====
|
||||
While Apache Tika is quite powerful, it is not perfect and fails on some files. PDF files are particularly problematic, mostly due to the PDF format itself. In case of a failure processing any file, the `ExtractingRequestHandler` does not have a secondary mechanism to try to extract some text from the file; it will throw an exception and fail.
|
||||
====
|
||||
|
||||
== Trying out Tika with the Solr techproducts Example
|
||||
== Trying out Tika
|
||||
|
||||
You can try out the Tika framework using the `techproducts` example included in Solr.
|
||||
You can try out the Tika framework using the `schemaless` example included in Solr.
|
||||
This will simply create a core/collection "gettingstarted" with the default configSet.
|
||||
|
||||
Start the example:
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
bin/solr -e techproducts
|
||||
bin/solr -e schemaless
|
||||
----
|
||||
|
||||
You can now use curl to send a sample PDF file via HTTP POST:
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
curl 'http://localhost:8983/solr/techproducts/update/extract?literal.id=doc1&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
|
||||
curl 'http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc1&uprefix=ignored_&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
|
||||
----
|
||||
|
||||
The URL above calls the Extracting Request Handler, uploads the file `solr-word.pdf` and assigns it the unique ID `doc1`. Here's a closer look at the components of this command:
|
||||
|
||||
* The `literal.id=doc1` parameter provides the necessary unique ID for the document being indexed.
|
||||
* The `literal.id=doc1` parameter provides a unique ID for the document being indexed.
|
||||
There are alternatives to this like mapping a metadata field to the ID, generating a new UUID, and generating an ID from a signature (hash) of the content.
|
||||
|
||||
* The `commit=true` parameter causes Solr to perform a commit after indexing the document, making it immediately searchable. For optimum performance when loading many documents, don't call the commit command until you are done.
|
||||
|
||||
|
@ -68,25 +77,36 @@ The URL above calls the Extracting Request Handler, uploads the file `solr-word.
|
|||
|
||||
* The argument `myfile=@example/exampledocs/solr-word.pdf` names the file to upload; it needs a valid path, which can be absolute or relative.
|
||||
|
||||
You can also use `bin/post` to send a PDF file into Solr (without the params, the `literal.id` parameter would be set to the absolute path to the file):
|
||||
You can also use `bin/post` to send a PDF file into Solr (without the params, the post tool would set `literal.id` to the absolute path to the file):
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=a"
|
||||
bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1"
|
||||
----
|
||||
|
||||
Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/techproducts/select?q=pdf`.
|
||||
Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/gettingstarted/select?q=pdf`.
|
||||
|
||||
You may notice that although the content of the sample document has been indexed and stored, there are not a lot of metadata fields associated with this document. This is because unknown fields are ignored according to the default parameters configured for the `/update/extract` handler in `solrconfig.xml`, and this behavior can be easily changed or overridden. For example, to store and see all metadata and content, execute the following:
|
||||
You may notice there are many metadata fields associated with this document.
|
||||
Solr's configuration is by default in "schemaless" (data driven) mode, and thus all metadata fields extracted get their own field.
|
||||
You might instead want to ignore them generally except for a few you specify.
|
||||
To do that, use the `uprefix` parameter to map unknown (to the schema) metadata field names to a schema field name that is effectively ignored.
|
||||
The dynamic field `ignored_*` is good for this purpose.
|
||||
For the fields you do want to map, explicitly set them using `fmap.IN=OUT` and/or ensure the field is defined in the schema.
|
||||
Here's an example:
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=attr_"
|
||||
bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=ignored_&fmap.last_modified=last_modified_dt"
|
||||
----
|
||||
|
||||
In this command, the `uprefix=attr_` parameter causes all generated fields that aren't defined in the schema to be prefixed with `attr_`, which is a dynamic field that is stored and indexed.
|
||||
|
||||
This command allows you to query the document using an attribute, as in: `\http://localhost:8983/solr/techproducts/select?q=attr_meta:microsoft`.
|
||||
[NOTE]
|
||||
====
|
||||
This won't have the intended effect if you run it at this point in the sequence of this tutorial.
|
||||
Previously we added the document without these parameters; schemaless mode automatically added all fields at that time.
|
||||
"uprefix" only applies to fields that are _undefined_ (hence the 'u' in "uprefix"), so these won't be prefixed now.
|
||||
However you will see the new "last_modified_dt" field.
|
||||
The easiest way to try this properly is to start over by deleting `example/schemaless/` (while Solr is stopped).
|
||||
====
|
||||
|
||||
== Solr Cell Input Parameters
|
||||
|
||||
|
@ -101,9 +121,6 @@ Indexes attributes of the Tika XHTML elements into separate fields, named after
|
|||
`commitWithin`::
|
||||
Add the document within the specified number of milliseconds.
|
||||
|
||||
`date.formats`::
|
||||
Defines the date format patterns to identify in the documents.
|
||||
|
||||
`defaultField`::
|
||||
If the `uprefix` parameter (see below) is not specified and a field cannot be determined, the default field will be used.
|
||||
|
||||
|
@ -144,7 +161,7 @@ Defines a password to use for a password-protected PDF or OOXML file
|
|||
Defines a file path and name to a customized Tika configuration file. This is only required if you have customized your Tika implementation.
|
||||
|
||||
`uprefix`::
|
||||
Prefixes all fields that are not defined in the schema with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika, given that the example schema contains `<dynamicField name="ignored_*" type="ignored"/>`.
|
||||
Prefixes all fields _that are undefined in the schema_ with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika, given that the default schema contains `<dynamicField name="ignored_*" type="ignored"/>`.
|
||||
|
||||
`xpath`::
|
||||
When extracting, only return Tika XHTML content that satisfies the given XPath expression. See http://tika.apache.org/{ivy-tika-version}/ for details on the format of Tika XHTML. See also http://wiki.apache.org/solr/TikaExtractOnlyExampleOutput.
|
||||
|
@ -160,7 +177,7 @@ Here is the order in which the Solr Cell framework, using the Extracting Request
|
|||
|
||||
== Configuring the Solr ExtractingRequestHandler
|
||||
|
||||
If you are not working with the supplied `sample_techproducts_configs` or `_default` <<config-sets.adoc#config-sets,config set>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
|
||||
If you are not working with the supplied <<config-sets.adoc#config-sets,config sets>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
|
||||
|
||||
[source,xml]
|
||||
----
|
||||
|
@ -179,11 +196,6 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
|
|||
</lst>
|
||||
<!--Optional. Specify a path to a tika configuration file. See the Tika docs for details.-->
|
||||
<str name="tika.config">/my/path/to/tika.config</str>
|
||||
<!-- Optional. Specify one or more date formats to parse. See DateUtil.DEFAULT_DATE_FORMATS
|
||||
for default date formats -->
|
||||
<lst name="date.formats">
|
||||
<str>yyyy-MM-dd</str>
|
||||
</lst>
|
||||
<!-- Optional. Specify an external file containing parser-specific properties.
|
||||
This file is located in the same directory as solrconfig.xml by default.-->
|
||||
<str name="parseContext.config">parseContext.xml</str>
|
||||
|
@ -192,17 +204,16 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
|
|||
|
||||
In the defaults section, we are mapping Tika's Last-Modified Metadata attribute to a field named `last_modified`. We are also telling it to ignore undeclared fields. These parameters can all be overridden.
|
||||
|
||||
The `tika.config` entry points to a file containing a Tika configuration. The `date.formats` entry allows you to specify one or more `java.text.SimpleDateFormat` patterns used to transform extracted input into a Date. Solr comes configured with the following date formats (see the `DateUtil` in Solr):
|
||||
The `tika.config` entry points to a file containing a Tika configuration.
|
||||
|
||||
* `yyyy-MM-dd'T'HH:mm:ss'Z'`
|
||||
* `yyyy-MM-dd'T'HH:mm:ss`
|
||||
* `yyyy-MM-dd`
|
||||
* `yyyy-MM-dd hh:mm:ss`
|
||||
* `yyyy-MM-dd HH:mm:ss`
|
||||
* `EEE MMM d hh:mm:ss z yyyy`
|
||||
* `EEE, dd MMM yyyy HH:mm:ss zzz`
|
||||
* `EEEE, dd-MMM-yy HH:mm:ss zzz`
|
||||
* `EEE MMM d HH:mm:ss yyyy`
|
||||
[TIP]
|
||||
====
|
||||
You likely need to have <<update-request-processors.adoc#update-request-processors,Update Request Processors>> (URPs) that parse numbers and dates and do other manipulations on the metadata fields generated by Solr Cell.
|
||||
In Solr's default configuration, "schemaless" (data driven) mode is enabled, which does a variety of such processing already.
|
||||
_If you don't use this mode_, you can still selectively specify the desired URPs.
|
||||
An easy way to do this is to set the `processor` parameter (under `defaults`) to `uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date`.
|
||||
That suggested list was taken right from the `add-unknown-fields-to-the-schema` URP chain, excluding `add-schema-fields`.
|
||||
====
|
||||
|
||||
=== Parser-Specific Properties
|
||||
|
||||
|
@ -270,7 +281,7 @@ The command below captures `<div>` tags separately, and then maps all the instan
|
|||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
|
||||
bin/post -c gettingstarted example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
|
||||
----
|
||||
|
||||
==== Using Literals to Define Your Own Metadata
|
||||
|
@ -279,7 +290,7 @@ To add in your own metadata, pass in the literal parameter along with the file:
|
|||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
|
||||
bin/post -c gettingstarted -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
|
||||
----
|
||||
|
||||
==== XPath Expressions
|
||||
|
@ -288,7 +299,7 @@ The example below passes in an XPath expression to restrict the XHTML returned b
|
|||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
|
||||
bin/post -c gettingstarted -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
|
||||
----
|
||||
|
||||
=== Extracting Data without Indexing It
|
||||
|
@ -299,14 +310,14 @@ The example below sets the `extractOnly=true` parameter to extract data without
|
|||
|
||||
[source,bash]
|
||||
----
|
||||
curl "http://localhost:8983/solr/techproducts/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
|
||||
curl "http://localhost:8983/solr/gettingstarted/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
|
||||
----
|
||||
|
||||
The output includes XML generated by Tika (and further escaped by Solr's XML) using a different output format to make it more readable (`-out yes` instructs the tool to echo Solr's output to the console):
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
bin/post -c techproducts -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
|
||||
bin/post -c gettingstarted -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
|
||||
----
|
||||
|
||||
== Sending Documents to Solr with a POST
|
||||
|
@ -315,10 +326,10 @@ The example below streams the file as the body of the POST, which does not, then
|
|||
|
||||
[source,bash]
|
||||
----
|
||||
curl "http://localhost:8983/solr/techproducts/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
|
||||
curl "http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
|
||||
----
|
||||
|
||||
== Sending Documents to Solr with Solr Cell and SolrJ
|
||||
== Sending Documents to Solr with SolrJ
|
||||
|
||||
SolrJ is a Java client that you can use to add documents to the index, update the index, or query the index. You'll find more information on SolrJ in <<client-apis.adoc#client-apis,Client APIs>>.
|
||||
|
||||
|
|
Loading…
Reference in New Issue