SOLR-12593: remove date parsing from extract contrib

* Added "ignored_*" to the default configSet
* Updated Ref Guide info on Solr Cell to demonstrate usage without using the techproducts configSet

Closes #438
David Smiley 2018-09-28 16:50:11 -04:00
parent 918b11c7ce
commit 964cc88cee
11 changed files with 85 additions and 373 deletions

View File

@@ -52,6 +52,9 @@ Upgrade Notes
expanded from before to subsume those patterns previously handled by the "extract" contrib (Solr Cell / Tika).
(David Smiley, Bar Rotstein)
* SOLR-12593: The "extraction" contrib (Solr Cell) no longer does any date parsing, and thus no longer has the
"date.formats" configuration. To ensure date strings are properly parsed, use ParseDateFieldUpdateProcessorFactory
(a URP) commonly registered with the name "parse-date" in "schemaless mode". (David Smiley, Bar Rotstein)
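For reference, a minimal sketch of registering that URP by hand (the format list below is illustrative; the shipped "schemaless" configuration defines a much richer set of patterns):

[source,xml]
----
<updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
  <arr name="format">
    <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
    <str>yyyy-MM-dd HH:mm:ss</str>
    <str>yyyy-MM-dd</str>
  </arr>
</updateProcessor>
----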
New Features
----------------------
@@ -62,6 +65,8 @@ SOLR-12591: Expand the set of recognized date format patterns of schemaless mode
locale was changed from ROOT to en_US since well-known patterns assume this locale.
(David Smiley, Bar Rotstein)
SOLR-12593: The default configSet now includes an "ignored_*" dynamic field. (David Smiley)
Other Changes
----------------------

View File

@@ -16,15 +16,9 @@
*/
package org.apache.solr.handler.extraction;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@@ -39,31 +33,22 @@ import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
/**
 * Handler for rich documents like PDF or Word, or any other file format that Tika handles, where the text
 * must first be extracted from the document.
*/
public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware, PermissionNameProvider {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
public static final String CONFIG_LOCATION = "tika.config";
public static final String DATE_FORMATS = "date.formats";
protected TikaConfig config;
protected ParseContextConfig parseContextConfig;
protected Collection<String> dateFormats = ExtractionDateUtil.DEFAULT_DATE_FORMATS;
protected SolrContentHandlerFactory factory;
@Override
public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) {
return PermissionNameProvider.Name.READ_PERM;
@@ -99,17 +84,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
if (configDateFormats != null && configDateFormats.size() > 0) {
dateFormats = new HashSet<>();
Iterator<Map.Entry> it = configDateFormats.iterator();
while (it.hasNext()) {
String format = (String) it.next().getValue();
log.info("Adding Date Format: " + format);
dateFormats.add(format);
}
}
}
if (config == null) {
try (InputStream is = core.getResourceLoader().getClassLoader().getResourceAsStream("solr-default-tika-config.xml")){
@@ -125,10 +99,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
}
protected SolrContentHandlerFactory createFactory() {
return new SolrContentHandlerFactory(dateFormats);
return new SolrContentHandlerFactory();
}
@Override
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
@@ -139,6 +112,4 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
public String getDescription() {
return "Add/Update Rich document";
}
}
}

View File

@@ -1,178 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.TimeZone;
/**
* This class has some code from HttpClient DateUtil.
*/
public class ExtractionDateUtil {
//start HttpClient
/**
* Date format pattern used to parse HTTP date headers in RFC 1123 format.
*/
public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz";
/**
* Date format pattern used to parse HTTP date headers in RFC 1036 format.
*/
public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";
/**
* Date format pattern used to parse HTTP date headers in ANSI C
* <code>asctime()</code> format.
*/
public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy";
//These are included for back compat
private static final Collection<String> DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList(
PATTERN_ASCTIME, PATTERN_RFC1036, PATTERN_RFC1123);
private static final Date DEFAULT_TWO_DIGIT_YEAR_START;
static {
Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
calendar.set(2000, Calendar.JANUARY, 1, 0, 0);
DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
}
private static final TimeZone GMT = TimeZone.getTimeZone("GMT");
//end HttpClient
//---------------------------------------------------------------------------------------
/**
 * Differs from {@link DateTimeFormatter#ISO_INSTANT} in that it's lenient.
*/
public static final DateTimeFormatter ISO_8601_PARSER = new DateTimeFormatterBuilder()
.parseCaseInsensitive().parseLenient().appendInstant().toFormatter(Locale.ROOT);
/**
* A suite of default date formats that can be parsed, and thus transformed to the Solr specific format
*/
public static final Collection<String> DEFAULT_DATE_FORMATS = new ArrayList<>();
static {
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss");
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd");
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss");
DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss");
DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy");
DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS);
}
/**
 * Parses a date string using {@link #DEFAULT_DATE_FORMATS}, converting it to
 * the internal {@link java.util.Date} representation.
*
* @param d The input date to parse
* @return The parsed {@link java.util.Date}
* @throws java.text.ParseException If the input can't be parsed
*/
public static Date parseDate(String d) throws ParseException {
return parseDate(d, DEFAULT_DATE_FORMATS);
}
public static Date parseDate(String d, Collection<String> fmts) throws ParseException {
if (d.length() > 0 && d.charAt(d.length() - 1) == 'Z') {
try {
return new Date(ISO_8601_PARSER.parse(d, Instant::from).toEpochMilli());
} catch (Exception e) {
//ignore; perhaps we can parse with one of the formats below...
}
}
return parseDate(d, fmts, null);
}
/**
* Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate
* <p>
* Parses the date value using the given date formats.
*
* @param dateValue the date value to parse
* @param dateFormats the date formats to use
* @param startDate During parsing, two digit years will be placed in the range
* <code>startDate</code> to <code>startDate + 100 years</code>. This value may
* be <code>null</code>. When <code>null</code> is given as a parameter, year
* <code>2000</code> will be used.
* @return the parsed date
 * @throws ParseException if none of the dateFormats could parse the dateValue
*/
public static Date parseDate(
String dateValue,
Collection<String> dateFormats,
Date startDate
) throws ParseException {
if (dateValue == null) {
throw new IllegalArgumentException("dateValue is null");
}
if (dateFormats == null) {
dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS;
}
if (startDate == null) {
startDate = DEFAULT_TWO_DIGIT_YEAR_START;
}
// trim single quotes around date if present
// see issue #5279
if (dateValue.length() > 1
&& dateValue.startsWith("'")
&& dateValue.endsWith("'")
) {
dateValue = dateValue.substring(1, dateValue.length() - 1);
}
//TODO upgrade to Java 8 DateTimeFormatter. But how to deal with the GMT as a default?
SimpleDateFormat dateParser = null;
Iterator formatIter = dateFormats.iterator();
while (formatIter.hasNext()) {
String format = (String) formatIter.next();
if (dateParser == null) {
dateParser = new SimpleDateFormat(format, Locale.ENGLISH);
dateParser.setTimeZone(GMT);
dateParser.set2DigitYearStart(startDate);
} else {
dateParser.applyPattern(format);
}
try {
return dateParser.parse(dateValue);
} catch (ParseException pe) {
// ignore this exception, we will try the next format
}
}
// we were unable to parse the date
throw new ParseException("Unable to parse the date " + dateValue, 0);
}
}

View File

@@ -18,9 +18,7 @@ package org.apache.solr.handler.extraction;
import java.lang.invoke.MethodHandles;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
@@ -32,7 +30,6 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.NumberType;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.slf4j.Logger;
@@ -61,8 +58,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
protected final SolrInputDocument document;
protected final Collection<String> dateFormats;
protected final Metadata metadata;
protected final SolrParams params;
protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
@@ -79,19 +74,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
private final boolean literalsOverride;
private Set<String> literalFieldNames = null;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, ExtractionDateUtil.DEFAULT_DATE_FORMATS);
}
public SolrContentHandler(Metadata metadata, SolrParams params,
IndexSchema schema, Collection<String> dateFormats) {
this.document = new SolrInputDocument();
this.metadata = metadata;
this.params = params;
this.schema = schema;
this.dateFormats = dateFormats;
this.lowerNames = params.getBool(LOWERNAMES, false);
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
@@ -253,12 +242,12 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
}
if (fval != null) {
document.addField(name, transformValue(fval, sf));
document.addField(name, fval);
}
if (vals != null) {
for (String val : vals) {
document.addField(name, transformValue(val, sf));
document.addField(name, val);
}
}
@@ -310,30 +299,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
characters(chars, offset, length);
}
/**
* Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
* <p>
* This implementation only formats dates using the {@link ExtractionDateUtil}.
*
* @param val The value to transform
* @param schFld The {@link org.apache.solr.schema.SchemaField}
* @return The potentially new value.
*/
protected String transformValue(String val, SchemaField schFld) {
String result = val;
if (schFld != null && NumberType.DATE.equals(schFld.getType().getNumberType())) {
//try to transform the date
try {
Date date = ExtractionDateUtil.parseDate(val, dateFormats); // may throw
result = date.toInstant().toString();//ISO format
} catch (Exception e) {
// Let the specific fieldType handle errors
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
}
}
return result;
}
/**
* Get the name mapping
*

View File

@@ -20,22 +20,16 @@ import org.apache.tika.metadata.Metadata;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.IndexSchema;
import java.util.Collection;
/**
 * Factory for {@link SolrContentHandler} instances, which build a
 * {@link org.apache.solr.common.SolrInputDocument} from Tika's XHTML output.
 **/
public class SolrContentHandlerFactory {
protected Collection<String> dateFormats;
public SolrContentHandlerFactory(Collection<String> dateFormats) {
this.dateFormats = dateFormats;
}
public SolrContentHandlerFactory() { }
public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
return new SolrContentHandler(metadata, params, schema,
dateFormats);
return new SolrContentHandler(metadata, params, schema);
}
}

View File

@@ -1,62 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
public class TestExtractionDateUtil extends LuceneTestCase {
public void testISO8601() throws Exception {
// dates with atypical years
assertParseFormatEquals("0001-01-01T01:01:01Z", null);
assertParseFormatEquals("+12021-12-01T03:03:03Z", null);
assertParseFormatEquals("0000-04-04T04:04:04Z", null); // note: 0 AD is also known as 1 BC
// dates with negative years (BC)
assertParseFormatEquals("-0005-05-05T05:05:05Z", null);
assertParseFormatEquals("-2021-12-01T04:04:04Z", null);
assertParseFormatEquals("-12021-12-01T02:02:02Z", null);
// dates that only parse thanks to lenient mode of DateTimeFormatter
assertParseFormatEquals("10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"); // missing '+' 5 digit year
assertParseFormatEquals("995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"); // wasn't 0 padded
}
private static void assertParseFormatEquals(String inputStr, String expectedStr) throws ParseException {
if (expectedStr == null) {
expectedStr = inputStr;
}
Date inputDate = ExtractionDateUtil.parseDate(inputStr);
String resultStr = inputDate.toInstant().toString();
assertEquals("d:" + inputDate.getTime(), expectedStr, resultStr);
}
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12593")
public void testParseDate() throws ParseException {
assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008");
}
private static void assertParsedDate(long ts, String dateStr) throws ParseException {
long parsed = ExtractionDateUtil.parseDate(dateStr).getTime();
assertTrue(String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr), Math.abs(ts - parsed) <= 1000L);
}
}

View File

@@ -140,6 +140,7 @@
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
<dynamicField name="random_*" type="random"/>
<dynamicField name="ignored_*" type="ignored"/>
<!-- Type used for data-driven schema, to add a string copy for each text field -->
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
@@ -148,7 +149,7 @@
<dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
<!-- payloaded dynamic fields -->
<dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
<dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
@@ -207,13 +208,16 @@
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
<fieldType name="plong" class="solr.LongPointField" docValues="true"/>
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime

View File

@@ -797,7 +797,6 @@
class="solr.extraction.ExtractingRequestHandler" >
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">_text_</str>
</lst>
</requestHandler>

View File

@@ -140,6 +140,7 @@
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
<dynamicField name="random_*" type="random"/>
<dynamicField name="ignored_*" type="ignored"/>
<!-- Type used for data-driven schema, to add a string copy for each text field -->
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
@@ -148,7 +149,7 @@
<dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
<!-- payloaded dynamic fields -->
<dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
<dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
@@ -207,13 +208,16 @@
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
<fieldType name="plong" class="solr.LongPointField" docValues="true"/>
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime

View File

@@ -797,7 +797,6 @@
class="solr.extraction.ExtractingRequestHandler" >
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">_text_</str>
</lst>
</requestHandler>

View File

@@ -26,41 +26,50 @@ If you want to supply your own `ContentHandler` for Solr to use, you can extend
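To illustrate that extension point, here is a minimal sketch (the subclass names are hypothetical; `createFactory` and `createSolrContentHandler` are the hooks visible in this commit's diff):

[source,java]
----
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;

public class CustomExtractingRequestHandler extends ExtractingRequestHandler {
  @Override
  protected SolrContentHandlerFactory createFactory() {
    return new SolrContentHandlerFactory() {
      @Override
      public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params,
                                                         IndexSchema schema) {
        // Return your own SolrContentHandler subclass here to customize
        // how Tika's SAX events are turned into a SolrInputDocument.
        return new CustomSolrContentHandler(metadata, params, schema);
      }
    };
  }
}
----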
When using the Solr Cell framework, it is helpful to keep the following in mind:
* Tika will automatically attempt to determine the input document type (Word, PDF, HTML) and extract the content appropriately. If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
* Tika works by producing an XHTML stream that it feeds to a SAX ContentHandler. SAX is a common interface implemented for many different XML parsers. For more information, see http://www.saxproject.org/quickstart.html.
* Solr then responds to Tika's SAX events and creates the fields to index.
* Tika produces metadata such as Title, Subject, and Author according to specifications such as the DublinCore. See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
* Tika adds all the extracted text to the `content` field.
* You can map Tika's metadata fields to Solr fields.
* You can pass in literals for field values. Literals will override Tika-parsed values, including fields in the Tika metadata object, the Tika content field, and any "captured content" fields.
* You can apply an XPath expression to the Tika XHTML to restrict the content that is produced.
* Tika will automatically attempt to determine the input document type (e.g. Word, PDF, HTML) and extract the content appropriately.
If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
* Briefly, Tika works internally by synthesizing an XHTML document from the core content of the parsed document, which it passes to a configured http://www.saxproject.org/quickstart.html[SAX] ContentHandler provided by Solr Cell.
Solr responds to Tika's SAX events to create one or more text fields from the content.
Tika exposes document metadata as well (apart from the XHTML).
* Tika produces metadata such as Title, Subject, and Author according to specifications such as Dublin Core.
The metadata available is highly dependent on the file types and what they in turn contain.
Solr Cell supplies some metadata of its own too.
* Solr Cell concatenates text from the internal XHTML into a `content` field.
You can configure which elements should be included/ignored, and which should map to another field.
* Solr Cell maps each piece of metadata onto a field.
By default it maps to the same name but several parameters control how this is done.
* When Solr Cell finishes creating the internal `SolrInputDocument`, the rest of the Lucene/Solr indexing stack takes over.
The next step after any update handler is the <<update-request-processors.adoc#update-request-processors,Update Request Processor>> chain.
[TIP]
[NOTE]
====
While Apache Tika is quite powerful, it is not perfect and fails on some files. PDF files are particularly problematic, mostly due to the PDF format itself. In case of a failure processing any file, the `ExtractingRequestHandler` does not have a secondary mechanism to try to extract some text from the file; it will throw an exception and fail.
====
== Trying out Tika with the Solr techproducts Example
== Trying out Tika
You can try out the Tika framework using the `techproducts` example included in Solr.
You can try out the Tika framework using the `schemaless` example included in Solr.
This will simply create a core/collection "gettingstarted" with the default configSet.
Start the example:
[source,bash]
----
bin/solr -e techproducts
bin/solr -e schemaless
----
You can now use curl to send a sample PDF file via HTTP POST:
[source,bash]
----
curl 'http://localhost:8983/solr/techproducts/update/extract?literal.id=doc1&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
curl 'http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc1&uprefix=ignored_&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
----
The URL above calls the Extracting Request Handler, uploads the file `solr-word.pdf` and assigns it the unique ID `doc1`. Here's a closer look at the components of this command:
* The `literal.id=doc1` parameter provides the necessary unique ID for the document being indexed.
* The `literal.id=doc1` parameter provides a unique ID for the document being indexed.
There are alternatives to this, such as mapping a metadata field to the ID, generating a new UUID, or generating an ID from a signature (hash) of the content (see the sketch after this list).
* The `commit=true` parameter causes Solr to perform a commit after indexing the document, making it immediately searchable. For optimum performance when loading many documents, don't call the commit command until you are done.
@@ -68,25 +77,36 @@ The URL above calls the Extracting Request Handler, uploads the file `solr-word.
* The argument `myfile=@example/exampledocs/solr-word.pdf` needs a valid path, which can be absolute or relative.
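As one sketch of the signature approach, borrowed from Solr's de-duplication support (the `fields` value here is illustrative):

[source,xml]
----
<updateRequestProcessorChain name="dedupe">
  <processor class="solr.processor.SignatureUpdateProcessorFactory">
    <bool name="enabled">true</bool>
    <str name="signatureField">id</str>
    <bool name="overwriteDupes">false</bool>
    <str name="fields">content</str>
    <str name="signatureClass">solr.processor.Lookup3Signature</str>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
----

You would then point the extract handler at this chain with `update.chain=dedupe`.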
You can also use `bin/post` to send a PDF file into Solr (without the params, the `literal.id` parameter would be set to the absolute path to the file):
You can also use `bin/post` to send a PDF file into Solr (without the params, the post tool would set `literal.id` to the absolute path to the file):
[source,bash]
----
bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=a"
bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1"
----
Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/techproducts/select?q=pdf`.
Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/gettingstarted/select?q=pdf`.
You may notice that although the content of the sample document has been indexed and stored, there are not a lot of metadata fields associated with this document. This is because unknown fields are ignored according to the default parameters configured for the `/update/extract` handler in `solrconfig.xml`, and this behavior can be easily changed or overridden. For example, to store and see all metadata and content, execute the following:
You may notice there are many metadata fields associated with this document.
Solr's configuration is by default in "schemaless" (data driven) mode, and thus every extracted metadata field gets its own Solr field.
You might instead want to ignore them generally except for a few you specify.
To do that, use the `uprefix` parameter to map metadata field names that are unknown to the schema onto a schema field name that is effectively ignored.
The dynamic field `ignored_*` is good for this purpose.
For the fields you do want to map, explicitly set them using `fmap.IN=OUT` and/or ensure the field is defined in the schema.
Here's an example:
[source,bash]
----
bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=attr_"
bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=ignored_&fmap.last_modified=last_modified_dt"
----
In this command, the `uprefix=attr_` parameter causes all generated fields that aren't defined in the schema to be prefixed with `attr_`, which is a dynamic field that is stored and indexed.
This command allows you to query the document using an attribute, as in: `\http://localhost:8983/solr/techproducts/select?q=attr_meta:microsoft`.
[NOTE]
====
This won't have the intended effect if you run it at this point in the sequence of this tutorial.
Previously we added the document without these parameters; schemaless mode automatically added all fields at that time.
"uprefix" only applies to fields that are _undefined_ (hence the 'u' in "uprefix"), so these won't be prefixed now.
However you will see the new "last_modified_dt" field.
The easiest way to try this properly is to start over by deleting `example/schemaless/` while Solr is stopped, as sketched after this note.
====
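For example, assuming the default layout of the `schemaless` example:

[source,bash]
----
bin/solr stop -all
rm -r example/schemaless
bin/solr -e schemaless
----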
== Solr Cell Input Parameters
@@ -101,9 +121,6 @@ Indexes attributes of the Tika XHTML elements into separate fields, named after
`commitWithin`::
Add the document within the specified number of milliseconds.
`date.formats`::
Defines the date format patterns to identify in the documents.
`defaultField`::
If the `uprefix` parameter (see below) is not specified and a field cannot be determined, the default field will be used.
@@ -144,7 +161,7 @@ Defines a password to use for a password-protected PDF or OOXML file
Defines a file path and name to a customized Tika configuration file. This is only required if you have customized your Tika implementation.
`uprefix`::
Prefixes all fields that are not defined in the schema with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika given the example schema contains `<dynamicField name="ignored_*" type="ignored"/>`
Prefixes all fields _that are undefined in the schema_ with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika given the default schema contains `<dynamicField name="ignored_*" type="ignored"/>`
`xpath`::
When extracting, only return Tika XHTML content that satisfies the given XPath expression. See http://tika.apache.org/{ivy-tika-version}/ for details on the format of Tika XHTML. See also http://wiki.apache.org/solr/TikaExtractOnlyExampleOutput.
@@ -160,7 +177,7 @@ Here is the order in which the Solr Cell framework, using the Extracting Request
== Configuring the Solr ExtractingRequestHandler
If you are not working with the supplied `sample_techproducts_configs` or `_default` <<config-sets.adoc#config-sets,config set>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
If you are not working with the supplied <<config-sets.adoc#config-sets,config sets>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
[source,xml]
----
@@ -179,11 +196,6 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
</lst>
<!--Optional. Specify a path to a tika configuration file. See the Tika docs for details.-->
<str name="tika.config">/my/path/to/tika.config</str>
<!-- Optional. Specify one or more date formats to parse. See DateUtil.DEFAULT_DATE_FORMATS
for default date formats -->
<lst name="date.formats">
<str>yyyy-MM-dd</str>
</lst>
<!-- Optional. Specify an external file containing parser-specific properties.
This file is located in the same directory as solrconfig.xml by default.-->
<str name="parseContext.config">parseContext.xml</str>
@@ -192,17 +204,16 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
In the defaults section, we are mapping Tika's Last-Modified metadata attribute to a field named `last_modified`. We are also telling it to ignore undeclared fields. Each of these defaults can be overridden in the request.
The `tika.config` entry points to a file containing a Tika configuration. The `date.formats` allows you to specify various `java.text.SimpleDateFormats` date formats for working with transforming extracted input to a Date. Solr comes configured with the following date formats (see the `DateUtil` in Solr):
The `tika.config` entry points to a file containing a Tika configuration.
* `yyyy-MM-dd'T'HH:mm:ss'Z'`
* `yyyy-MM-dd'T'HH:mm:ss`
* `yyyy-MM-dd`
* `yyyy-MM-dd hh:mm:ss`
* `yyyy-MM-dd HH:mm:ss`
* `EEE MMM d hh:mm:ss z yyyy`
* `EEE, dd MMM yyyy HH:mm:ss zzz`
* `EEEE, dd-MMM-yy HH:mm:ss zzz`
* `EEE MMM d HH:mm:ss yyyy`
[TIP]
====
You likely need to have <<update-request-processors.adoc#update-request-processors,Update Request Processors>> (URPs) that parse numbers and dates and do other manipulations on the metadata fields generated by Solr Cell.
In Solr's default configuration, "schemaless" (data driven) mode is enabled, which does a variety of such processing already.
_If you don't use this mode_, you can still selectively specify the desired URPs.
An easy way to specify this is to configure the parameter `processor` (under `defaults`) to `uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date`.
That suggested list was taken right from the `add-unknown-fields-to-the-schema` URP chain, excluding `add-schema-fields`; a configuration sketch follows this tip.
====
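Putting that together, a sketch of what such a handler configuration could look like (the `fmap.Last-Modified` mapping and the `processor` list are illustrative, not shipped defaults):

[source,xml]
----
<requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
  <lst name="defaults">
    <str name="lowernames">true</str>
    <str name="fmap.Last-Modified">last_modified</str>
    <str name="uprefix">ignored_</str>
    <!-- run the schemaless parsing URPs, minus add-schema-fields -->
    <str name="processor">uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date</str>
  </lst>
</requestHandler>
----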
=== Parser-Specific Properties
@@ -270,7 +281,7 @@ The command below captures `<div>` tags separately, and then maps all the instan
[source,bash]
----
bin/post -c techproducts example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
bin/post -c gettingstarted example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
----
==== Using Literals to Define Your Own Metadata
@@ -279,7 +290,7 @@ To add in your own metadata, pass in the literal parameter along with the file:
[source,bash]
----
bin/post -c techproducts -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
bin/post -c gettingstarted -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
----
==== XPath Expressions
@@ -288,7 +299,7 @@ The example below passes in an XPath expression to restrict the XHTML returned b
[source,bash]
----
bin/post -c techproducts -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
bin/post -c gettingstarted -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
----
=== Extracting Data without Indexing It
@@ -299,14 +310,14 @@ The example below sets the `extractOnly=true` parameter to extract data without
[source,bash]
----
curl "http://localhost:8983/solr/techproducts/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
curl "http://localhost:8983/solr/gettingstarted/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
----
The output includes XML generated by Tika (and further escaped by Solr's XML response). The `bin/post` variant below uses a more readable output format (`wt=ruby&indent=true`); `-out yes` instructs the tool to echo Solr's output to the console:
[source,bash]
----
bin/post -c techproducts -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
bin/post -c gettingstarted -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
----
== Sending Documents to Solr with a POST
@@ -315,10 +326,10 @@ The example below streams the file as the body of the POST, which does not, then
[source,bash]
----
curl "http://localhost:8983/solr/techproducts/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
curl "http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
----
== Sending Documents to Solr with Solr Cell and SolrJ
== Sending Documents to Solr with SolrJ
SolrJ is a Java client that you can use to add documents to the index, update the index, or query the index. You'll find more information on SolrJ in <<client-apis.adoc#client-apis,Client APIs>>.
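As a sketch (not part of this commit), posting a file to `/update/extract` with SolrJ's `ContentStreamUpdateRequest` might look like this; the collection name and parameters mirror the curl examples above:

[source,java]
----
import java.io.File;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;

public class SolrCellExample {
  public static void main(String[] args) throws Exception {
    try (SolrClient client = new HttpSolrClient.Builder(
        "http://localhost:8983/solr/gettingstarted").build()) {
      ContentStreamUpdateRequest req = new ContentStreamUpdateRequest("/update/extract");
      req.addFile(new File("example/exampledocs/solr-word.pdf"), "application/pdf");
      req.setParam("literal.id", "doc1");    // unique ID, as in the curl examples
      req.setParam("uprefix", "ignored_");   // ignore metadata fields unknown to the schema
      req.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
      client.request(req);
    }
  }
}
----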