From 964cc88cee7d62edf03a923e3217809d630af5d5 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 28 Sep 2018 16:50:11 -0400 Subject: [PATCH] SOLR-12593: remove date parsing from extract contrib * added "ignored_*" to the default configSet * Updated Ref Guide info on Solr Cell to demonstrate usage without using the techproducts configSet Closes #438 --- solr/CHANGES.txt | 5 + .../extraction/ExtractingRequestHandler.java | 33 +--- .../extraction/ExtractionDateUtil.java | 178 ------------------ .../extraction/SolrContentHandler.java | 43 +---- .../extraction/SolrContentHandlerFactory.java | 10 +- .../extraction/TestExtractionDateUtil.java | 62 ------ .../configsets/_default/conf/managed-schema | 8 +- .../configsets/_default/conf/solrconfig.xml | 1 - .../configsets/_default/conf/managed-schema | 8 +- .../configsets/_default/conf/solrconfig.xml | 1 - ...data-with-solr-cell-using-apache-tika.adoc | 109 ++++++----- 11 files changed, 85 insertions(+), 373 deletions(-) delete mode 100644 solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractionDateUtil.java delete mode 100644 solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestExtractionDateUtil.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 408ab53d4b9..490bcf2f662 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -52,6 +52,9 @@ Upgrade Notes expanded from before to subsume those patterns previously handled by the "extract" contrib (Solr Cell / Tika). (David Smiley, Bar Rotstein) +* SOLR-12593: The "extraction" contrib (Solr Cell) no longer does any date parsing, and thus no longer has the + "date.formats" configuration. To ensure date strings are properly parsed, use ParseDateFieldUpdateProcessorFactory + (an URP) commonly registered with the name "parse-date" in "schemaless mode". 
(David Smiley, Bar Rotstein) New Features ---------------------- @@ -62,6 +65,8 @@ SOLR-12591: Expand the set of recognized date format patterns of schemaless mode locale was changed from ROOT to en_US since well-known patterns assume this locale. (David Smiley, Bar Rotstein) +SOLR-12593: The default configSet now includes an "ignored_*" dynamic field. (David Smiley) + Other Changes ---------------------- diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 44f7ac4d3d9..3af9b5b5aa8 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -16,15 +16,9 @@ */ package org.apache.solr.handler.extraction; - import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.lang.invoke.MethodHandles; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; @@ -39,31 +33,22 @@ import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; - /** * Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted * first from the document. 
*/ public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware , PermissionNameProvider { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; - public static final String DATE_FORMATS = "date.formats"; protected TikaConfig config; protected ParseContextConfig parseContextConfig; - - protected Collection dateFormats = ExtractionDateUtil.DEFAULT_DATE_FORMATS; protected SolrContentHandlerFactory factory; - @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { return PermissionNameProvider.Name.READ_PERM; @@ -99,17 +84,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement throw new SolrException(ErrorCode.SERVER_ERROR, e); } } - - NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS); - if (configDateFormats != null && configDateFormats.size() > 0) { - dateFormats = new HashSet<>(); - Iterator it = configDateFormats.iterator(); - while (it.hasNext()) { - String format = (String) it.next().getValue(); - log.info("Adding Date Format: " + format); - dateFormats.add(format); - } - } } if (config == null) { try (InputStream is = core.getResourceLoader().getClassLoader().getResourceAsStream("solr-default-tika-config.xml")){ @@ -125,10 +99,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement } protected SolrContentHandlerFactory createFactory() { - return new SolrContentHandlerFactory(dateFormats); + return new SolrContentHandlerFactory(); } - @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); @@ -139,6 +112,4 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement public 
String getDescription() { return "Add/Update Rich document"; } -} - - +} \ No newline at end of file diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractionDateUtil.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractionDateUtil.java deleted file mode 100644 index b7ccf8203da..00000000000 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractionDateUtil.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.handler.extraction; - -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.time.Instant; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Calendar; -import java.util.Collection; -import java.util.Date; -import java.util.Iterator; -import java.util.Locale; -import java.util.TimeZone; - - -/** - * This class has some code from HttpClient DateUtil. - */ -public class ExtractionDateUtil { - //start HttpClient - /** - * Date format pattern used to parse HTTP date headers in RFC 1123 format. 
- */ - public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz"; - - /** - * Date format pattern used to parse HTTP date headers in RFC 1036 format. - */ - public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz"; - - /** - * Date format pattern used to parse HTTP date headers in ANSI C - * asctime() format. - */ - public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy"; - //These are included for back compat - private static final Collection DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList( - PATTERN_ASCTIME, PATTERN_RFC1036, PATTERN_RFC1123); - - private static final Date DEFAULT_TWO_DIGIT_YEAR_START; - - static { - Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); - calendar.set(2000, Calendar.JANUARY, 1, 0, 0); - DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime(); - } - - private static final TimeZone GMT = TimeZone.getTimeZone("GMT"); - - //end HttpClient - - //--------------------------------------------------------------------------------------- - - /** - * Differs by {@link DateTimeFormatter#ISO_INSTANT} in that it's lenient. 
- */ - public static final DateTimeFormatter ISO_8601_PARSER = new DateTimeFormatterBuilder() - .parseCaseInsensitive().parseLenient().appendInstant().toFormatter(Locale.ROOT); - - /** - * A suite of default date formats that can be parsed, and thus transformed to the Solr specific format - */ - public static final Collection DEFAULT_DATE_FORMATS = new ArrayList<>(); - - static { - DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'"); - DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss"); - DEFAULT_DATE_FORMATS.add("yyyy-MM-dd"); - DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss"); - DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss"); - DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy"); - DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS); - } - - /** - * Returns a formatter that can be use by the current thread if needed to - * convert Date objects to the Internal representation. - * - * @param d The input date to parse - * @return The parsed {@link java.util.Date} - * @throws java.text.ParseException If the input can't be parsed - */ - public static Date parseDate(String d) throws ParseException { - return parseDate(d, DEFAULT_DATE_FORMATS); - } - - public static Date parseDate(String d, Collection fmts) throws ParseException { - if (d.length() > 0 && d.charAt(d.length() - 1) == 'Z') { - try { - return new Date(ISO_8601_PARSER.parse(d, Instant::from).toEpochMilli()); - } catch (Exception e) { - //ignore; perhaps we can parse with one of the formats below... - } - } - return parseDate(d, fmts, null); - } - - /** - * Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate - *

- * Parses the date value using the given date formats. - * - * @param dateValue the date value to parse - * @param dateFormats the date formats to use - * @param startDate During parsing, two digit years will be placed in the range - * startDate to startDate + 100 years. This value may - * be null. When null is given as a parameter, year - * 2000 will be used. - * @return the parsed date - * @throws ParseException if none of the dataFormats could parse the dateValue - */ - public static Date parseDate( - String dateValue, - Collection dateFormats, - Date startDate - ) throws ParseException { - - if (dateValue == null) { - throw new IllegalArgumentException("dateValue is null"); - } - if (dateFormats == null) { - dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS; - } - if (startDate == null) { - startDate = DEFAULT_TWO_DIGIT_YEAR_START; - } - // trim single quotes around date if present - // see issue #5279 - if (dateValue.length() > 1 - && dateValue.startsWith("'") - && dateValue.endsWith("'") - ) { - dateValue = dateValue.substring(1, dateValue.length() - 1); - } - - //TODO upgrade to Java 8 DateTimeFormatter. But how to deal with the GMT as a default? 
- SimpleDateFormat dateParser = null; - Iterator formatIter = dateFormats.iterator(); - - while (formatIter.hasNext()) { - String format = (String) formatIter.next(); - if (dateParser == null) { - dateParser = new SimpleDateFormat(format, Locale.ENGLISH); - dateParser.setTimeZone(GMT); - dateParser.set2DigitYearStart(startDate); - } else { - dateParser.applyPattern(format); - } - try { - return dateParser.parse(dateValue); - } catch (ParseException pe) { - // ignore this exception, we will try the next format - } - } - - // we were unable to parse the date - throw new ParseException("Unable to parse the date " + dateValue, 0); - } - -} \ No newline at end of file diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 8c012e58e41..8d871a432e5 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -18,9 +18,7 @@ package org.apache.solr.handler.extraction; import java.lang.invoke.MethodHandles; import java.util.ArrayDeque; -import java.util.Collection; import java.util.Collections; -import java.util.Date; import java.util.Deque; import java.util.HashMap; import java.util.HashSet; @@ -32,7 +30,6 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.solr.schema.NumberType; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; @@ -61,8 +58,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara protected final SolrInputDocument document; - protected final Collection dateFormats; - protected final Metadata metadata; protected final 
SolrParams params; protected final StringBuilder catchAllBuilder = new StringBuilder(2048); @@ -79,19 +74,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara private final boolean literalsOverride; private Set literalFieldNames = null; - + + public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { - this(metadata, params, schema, ExtractionDateUtil.DEFAULT_DATE_FORMATS); - } - - - public SolrContentHandler(Metadata metadata, SolrParams params, - IndexSchema schema, Collection dateFormats) { this.document = new SolrInputDocument(); this.metadata = metadata; this.params = params; this.schema = schema; - this.dateFormats = dateFormats; this.lowerNames = params.getBool(LOWERNAMES, false); this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false); @@ -253,12 +242,12 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara } if (fval != null) { - document.addField(name, transformValue(fval, sf)); + document.addField(name, fval); } if (vals != null) { for (String val : vals) { - document.addField(name, transformValue(val, sf)); + document.addField(name, val); } } @@ -310,30 +299,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara characters(chars, offset, length); } - /** - * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField} - *

- * This implementation only formats dates using the {@link ExtractionDateUtil}. - * - * @param val The value to transform - * @param schFld The {@link org.apache.solr.schema.SchemaField} - * @return The potentially new value. - */ - protected String transformValue(String val, SchemaField schFld) { - String result = val; - if (schFld != null && NumberType.DATE.equals(schFld.getType().getNumberType())) { - //try to transform the date - try { - Date date = ExtractionDateUtil.parseDate(val, dateFormats); // may throw - result = date.toInstant().toString();//ISO format - } catch (Exception e) { - // Let the specific fieldType handle errors - // throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e); - } - } - return result; - } - /** * Get the name mapping * diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index c91dd47306f..f95125e2aee 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -20,22 +20,16 @@ import org.apache.tika.metadata.Metadata; import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; -import java.util.Collection; - /** * * **/ public class SolrContentHandlerFactory { - protected Collection dateFormats; - public SolrContentHandlerFactory(Collection dateFormats) { - this.dateFormats = dateFormats; - } + public SolrContentHandlerFactory() { } public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { - return new SolrContentHandler(metadata, params, schema, - dateFormats); + return new SolrContentHandler(metadata, params, schema); } } diff --git 
a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestExtractionDateUtil.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestExtractionDateUtil.java deleted file mode 100644 index e5816a6a7a7..00000000000 --- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/TestExtractionDateUtil.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.handler.extraction; - -import java.text.ParseException; -import java.util.Date; -import java.util.Locale; - -import org.apache.lucene.util.LuceneTestCase; - -public class TestExtractionDateUtil extends LuceneTestCase { - - public void testISO8601() throws Exception { - // dates with atypical years - assertParseFormatEquals("0001-01-01T01:01:01Z", null); - assertParseFormatEquals("+12021-12-01T03:03:03Z", null); - - assertParseFormatEquals("0000-04-04T04:04:04Z", null); // note: 0 AD is also known as 1 BC - - // dates with negative years (BC) - assertParseFormatEquals("-0005-05-05T05:05:05Z", null); - assertParseFormatEquals("-2021-12-01T04:04:04Z", null); - assertParseFormatEquals("-12021-12-01T02:02:02Z", null); - - // dates that only parse thanks to lenient mode of DateTimeFormatter - assertParseFormatEquals("10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"); // missing '+' 5 digit year - assertParseFormatEquals("995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"); // wasn't 0 padded - } - - private static void assertParseFormatEquals(String inputStr, String expectedStr) throws ParseException { - if (expectedStr == null) { - expectedStr = inputStr; - } - Date inputDate = ExtractionDateUtil.parseDate(inputStr); - String resultStr = inputDate.toInstant().toString(); - assertEquals("d:" + inputDate.getTime(), expectedStr, resultStr); - } - - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12593") - public void testParseDate() throws ParseException { - assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008"); - } - - private static void assertParsedDate(long ts, String dateStr) throws ParseException { - long parsed = ExtractionDateUtil.parseDate(dateStr).getTime(); - assertTrue(String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr), Math.abs(ts - parsed) <= 1000L); - } -} diff --git a/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema 
b/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema index 95c0c369b08..b2e32727740 100644 --- a/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema +++ b/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema @@ -140,6 +140,7 @@ + @@ -148,7 +149,7 @@ - + @@ -207,13 +208,16 @@ - + + + @@ -148,7 +149,7 @@ - + @@ -207,13 +208,16 @@ - + + + /my/path/to/tika.config - - - yyyy-MM-dd - parseContext.xml @@ -192,17 +204,16 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`. In the defaults section, we are mapping Tika's Last-Modified Metadata attribute to a field named `last_modified`. We are also telling it to ignore undeclared fields. These are all overridden parameters. -The `tika.config` entry points to a file containing a Tika configuration. The `date.formats` allows you to specify various `java.text.SimpleDateFormats` date formats for working with transforming extracted input to a Date. Solr comes configured with the following date formats (see the `DateUtil` in Solr): +The `tika.config` entry points to a file containing a Tika configuration. -* `yyyy-MM-dd'T'HH:mm:ss'Z'` -* `yyyy-MM-dd'T'HH:mm:ss` -* `yyyy-MM-dd` -* `yyyy-MM-dd hh:mm:ss` -* `yyyy-MM-dd HH:mm:ss` -* `EEE MMM d hh:mm:ss z yyyy` -* `EEE, dd MMM yyyy HH:mm:ss zzz` -* `EEEE, dd-MMM-yy HH:mm:ss zzz` -* `EEE MMM d HH:mm:ss yyyy` +[TIP] +==== +You likely need to have <<update-request-processors.adoc#update-request-processors,Update Request Processors>> (URPs) that parse numbers and dates and do other manipulations on the metadata fields generated by Solr Cell. +In Solr's default configuration, "schemaless" (data driven) mode is enabled, which does a variety of such processing already. +_If you don't use this mode_, you can still selectively specify the desired URPs. +An easy way to specify this is to configure the parameter `processor` (under `defaults`) to `uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date`. 
+That suggested list was taken right from the `add-unknown-fields-to-the-schema` URP chain, excluding `add-schema-fields`. +==== === Parser-Specific Properties @@ -270,7 +281,7 @@ The command below captures `

` tags separately, and then maps all the instan [source,bash] ---- -bin/post -c techproducts example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div" +bin/post -c gettingstarted example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div" ---- ==== Using Literals to Define Your Own Metadata @@ -279,7 +290,7 @@ To add in your own metadata, pass in the literal parameter along with the file: [source,bash] ---- -bin/post -c techproducts -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html +bin/post -c gettingstarted -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html ---- ==== XPath Expressions @@ -288,7 +299,7 @@ The example below passes in an XPath expression to restrict the XHTML returned b [source,bash] ---- -bin/post -c techproducts -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html +bin/post -c gettingstarted -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html ---- === Extracting Data without Indexing It @@ -299,14 +310,14 @@ The example below sets the `extractOnly=true` parameter to extract data without [source,bash] ---- -curl "http://localhost:8983/solr/techproducts/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html' +curl "http://localhost:8983/solr/gettingstarted/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html' ---- The output includes XML generated by Tika (and further escaped by Solr's XML) using a different output format 
to make it more readable (`-out yes` instructs the tool to echo Solr's output to the console): [source,bash] ---- -bin/post -c techproducts -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html +bin/post -c gettingstarted -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html ---- == Sending Documents to Solr with a POST @@ -315,10 +326,10 @@ The example below streams the file as the body of the POST, which does not, then [source,bash] ---- -curl "http://localhost:8983/solr/techproducts/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html' +curl "http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html' ---- -== Sending Documents to Solr with Solr Cell and SolrJ +== Sending Documents to Solr with SolrJ SolrJ is a Java client that you can use to add documents to the index, update the index, or query the index. You'll find more information on SolrJ in <<using-solrj.adoc#using-solrj,Using SolrJ>>.