mirror of https://github.com/apache/lucene.git

SOLR-12593: remove date parsing from extract contrib

* added "ignored_*" to the default configSet
* Updated Ref Guide info on Solr Cell to demonstrate usage without using the techproducts configSet

Closes #438

parent 918b11c7ce
commit 964cc88cee
@@ -52,6 +52,9 @@ Upgrade Notes
   expanded from before to subsume those patterns previously handled by the "extract" contrib (Solr Cell / Tika).
   (David Smiley, Bar Rotstein)
 
+* SOLR-12593: The "extraction" contrib (Solr Cell) no longer does any date parsing, and thus no longer has the
+  "date.formats" configuration. To ensure date strings are properly parsed, use ParseDateFieldUpdateProcessorFactory
+  (a URP) commonly registered with the name "parse-date" in "schemaless mode". (David Smiley, Bar Rotstein)
 
 New Features
 ----------------------
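As a migration aid, here is a minimal sketch of registering such a URP in `solrconfig.xml`. The element and class names match the default configSet's `parse-date` definition; the two patterns are illustrative only, and the real definition carries a much longer list:

[source,xml]
----
<!-- Hypothetical, abbreviated registration of the parse-date URP.
     The default configSet defines this with many more format patterns. -->
<updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
  <arr name="format">
    <str>yyyy-MM-dd'T'HH:mm:ss.SSS'Z'</str>
    <str>yyyy-MM-dd</str>
  </arr>
</updateProcessor>
----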
@@ -62,6 +65,8 @@ SOLR-12591: Expand the set of recognized date format patterns of schemaless mode
   locale was changed from ROOT to en_US since well-known patterns assume this locale.
   (David Smiley, Bar Rotstein)
 
+* SOLR-12593: The default configSet now includes an "ignored_*" dynamic field. (David Smiley)
+
 Other Changes
 ----------------------
@@ -16,15 +16,9 @@
  */
 package org.apache.solr.handler.extraction;
 
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.lang.invoke.MethodHandles;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
 
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -39,31 +33,22 @@ import org.apache.solr.update.processor.UpdateRequestProcessor;
 import org.apache.solr.util.plugin.SolrCoreAware;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 /**
  * Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
  * first from the document.
  */
 public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware, PermissionNameProvider {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
   public static final String CONFIG_LOCATION = "tika.config";
-  public static final String DATE_FORMATS = "date.formats";
 
   protected TikaConfig config;
   protected ParseContextConfig parseContextConfig;
 
-  protected Collection<String> dateFormats = ExtractionDateUtil.DEFAULT_DATE_FORMATS;
   protected SolrContentHandlerFactory factory;
 
   @Override
   public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) {
     return PermissionNameProvider.Name.READ_PERM;
@@ -99,17 +84,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
       }
-
-      NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
-      if (configDateFormats != null && configDateFormats.size() > 0) {
-        dateFormats = new HashSet<>();
-        Iterator<Map.Entry> it = configDateFormats.iterator();
-        while (it.hasNext()) {
-          String format = (String) it.next().getValue();
-          log.info("Adding Date Format: " + format);
-          dateFormats.add(format);
-        }
-      }
     }
     if (config == null) {
       try (InputStream is = core.getResourceLoader().getClassLoader().getResourceAsStream("solr-default-tika-config.xml")) {
@@ -125,10 +99,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
   }
 
   protected SolrContentHandlerFactory createFactory() {
-    return new SolrContentHandlerFactory(dateFormats);
+    return new SolrContentHandlerFactory();
   }
 
   @Override
   protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
     return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
@@ -139,6 +112,4 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
   public String getDescription() {
     return "Add/Update Rich document";
   }
-
-
 }
@@ -1,178 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.time.Instant;
-import java.time.format.DateTimeFormatter;
-import java.time.format.DateTimeFormatterBuilder;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collection;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.TimeZone;
-
-/**
- * This class has some code from HttpClient DateUtil.
- */
-public class ExtractionDateUtil {
-  //start HttpClient
-  /**
-   * Date format pattern used to parse HTTP date headers in RFC 1123 format.
-   */
-  public static final String PATTERN_RFC1123 = "EEE, dd MMM yyyy HH:mm:ss zzz";
-
-  /**
-   * Date format pattern used to parse HTTP date headers in RFC 1036 format.
-   */
-  public static final String PATTERN_RFC1036 = "EEEE, dd-MMM-yy HH:mm:ss zzz";
-
-  /**
-   * Date format pattern used to parse HTTP date headers in ANSI C
-   * <code>asctime()</code> format.
-   */
-  public static final String PATTERN_ASCTIME = "EEE MMM d HH:mm:ss yyyy";
-  //These are included for back compat
-  private static final Collection<String> DEFAULT_HTTP_CLIENT_PATTERNS = Arrays.asList(
-      PATTERN_ASCTIME, PATTERN_RFC1036, PATTERN_RFC1123);
-
-  private static final Date DEFAULT_TWO_DIGIT_YEAR_START;
-
-  static {
-    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
-    calendar.set(2000, Calendar.JANUARY, 1, 0, 0);
-    DEFAULT_TWO_DIGIT_YEAR_START = calendar.getTime();
-  }
-
-  private static final TimeZone GMT = TimeZone.getTimeZone("GMT");
-
-  //end HttpClient
-
-  //---------------------------------------------------------------------------------------
-
-  /**
-   * Differs from {@link DateTimeFormatter#ISO_INSTANT} in that it's lenient.
-   */
-  public static final DateTimeFormatter ISO_8601_PARSER = new DateTimeFormatterBuilder()
-      .parseCaseInsensitive().parseLenient().appendInstant().toFormatter(Locale.ROOT);
-
-  /**
-   * A suite of default date formats that can be parsed, and thus transformed to the Solr-specific format
-   */
-  public static final Collection<String> DEFAULT_DATE_FORMATS = new ArrayList<>();
-
-  static {
-    DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss'Z'");
-    DEFAULT_DATE_FORMATS.add("yyyy-MM-dd'T'HH:mm:ss");
-    DEFAULT_DATE_FORMATS.add("yyyy-MM-dd");
-    DEFAULT_DATE_FORMATS.add("yyyy-MM-dd hh:mm:ss");
-    DEFAULT_DATE_FORMATS.add("yyyy-MM-dd HH:mm:ss");
-    DEFAULT_DATE_FORMATS.add("EEE MMM d hh:mm:ss z yyyy");
-    DEFAULT_DATE_FORMATS.addAll(DEFAULT_HTTP_CLIENT_PATTERNS);
-  }
-
-  /**
-   * Parses the given date string against the default date formats.
-   *
-   * @param d The input date to parse
-   * @return The parsed {@link java.util.Date}
-   * @throws java.text.ParseException If the input can't be parsed
-   */
-  public static Date parseDate(String d) throws ParseException {
-    return parseDate(d, DEFAULT_DATE_FORMATS);
-  }
-
-  public static Date parseDate(String d, Collection<String> fmts) throws ParseException {
-    if (d.length() > 0 && d.charAt(d.length() - 1) == 'Z') {
-      try {
-        return new Date(ISO_8601_PARSER.parse(d, Instant::from).toEpochMilli());
-      } catch (Exception e) {
-        //ignore; perhaps we can parse with one of the formats below...
-      }
-    }
-    return parseDate(d, fmts, null);
-  }
-
-  /**
-   * Slightly modified from org.apache.commons.httpclient.util.DateUtil.parseDate
-   * <p>
-   * Parses the date value using the given date formats.
-   *
-   * @param dateValue   the date value to parse
-   * @param dateFormats the date formats to use
-   * @param startDate   During parsing, two digit years will be placed in the range
-   *                    <code>startDate</code> to <code>startDate + 100 years</code>. This value may
-   *                    be <code>null</code>. When <code>null</code> is given as a parameter, year
-   *                    <code>2000</code> will be used.
-   * @return the parsed date
-   * @throws ParseException if none of the dateFormats could parse the dateValue
-   */
-  public static Date parseDate(
-      String dateValue,
-      Collection<String> dateFormats,
-      Date startDate
-  ) throws ParseException {
-
-    if (dateValue == null) {
-      throw new IllegalArgumentException("dateValue is null");
-    }
-    if (dateFormats == null) {
-      dateFormats = DEFAULT_HTTP_CLIENT_PATTERNS;
-    }
-    if (startDate == null) {
-      startDate = DEFAULT_TWO_DIGIT_YEAR_START;
-    }
-    // trim single quotes around date if present
-    // see issue #5279
-    if (dateValue.length() > 1
-        && dateValue.startsWith("'")
-        && dateValue.endsWith("'")
-    ) {
-      dateValue = dateValue.substring(1, dateValue.length() - 1);
-    }
-
-    //TODO upgrade to Java 8 DateTimeFormatter. But how to deal with the GMT as a default?
-    SimpleDateFormat dateParser = null;
-    Iterator formatIter = dateFormats.iterator();
-
-    while (formatIter.hasNext()) {
-      String format = (String) formatIter.next();
-      if (dateParser == null) {
-        dateParser = new SimpleDateFormat(format, Locale.ENGLISH);
-        dateParser.setTimeZone(GMT);
-        dateParser.set2DigitYearStart(startDate);
-      } else {
-        dateParser.applyPattern(format);
-      }
-      try {
-        return dateParser.parse(dateValue);
-      } catch (ParseException pe) {
-        // ignore this exception, we will try the next format
-      }
-    }
-
-    // we were unable to parse the date
-    throw new ParseException("Unable to parse the date " + dateValue, 0);
-  }
-
-}
@@ -18,9 +18,7 @@ package org.apache.solr.handler.extraction;
 
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayDeque;
-import java.util.Collection;
 import java.util.Collections;
-import java.util.Date;
 import java.util.Deque;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -32,7 +30,6 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
-import org.apache.solr.schema.NumberType;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.slf4j.Logger;
@@ -61,8 +58,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
 
   protected final SolrInputDocument document;
 
-  protected final Collection<String> dateFormats;
-
   protected final Metadata metadata;
   protected final SolrParams params;
   protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
@@ -79,19 +74,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
   private final boolean literalsOverride;
 
   private Set<String> literalFieldNames = null;
 
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
-    this(metadata, params, schema, ExtractionDateUtil.DEFAULT_DATE_FORMATS);
-  }
-
-  public SolrContentHandler(Metadata metadata, SolrParams params,
-                            IndexSchema schema, Collection<String> dateFormats) {
     this.document = new SolrInputDocument();
     this.metadata = metadata;
     this.params = params;
     this.schema = schema;
-    this.dateFormats = dateFormats;
 
     this.lowerNames = params.getBool(LOWERNAMES, false);
     this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
@@ -253,12 +242,12 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
     }
 
     if (fval != null) {
-      document.addField(name, transformValue(fval, sf));
+      document.addField(name, fval);
     }
 
     if (vals != null) {
       for (String val : vals) {
-        document.addField(name, transformValue(val, sf));
+        document.addField(name, val);
       }
     }
@@ -310,30 +299,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
     characters(chars, offset, length);
   }
 
-  /**
-   * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
-   * <p>
-   * This implementation only formats dates using the {@link ExtractionDateUtil}.
-   *
-   * @param val    The value to transform
-   * @param schFld The {@link org.apache.solr.schema.SchemaField}
-   * @return The potentially new value.
-   */
-  protected String transformValue(String val, SchemaField schFld) {
-    String result = val;
-    if (schFld != null && NumberType.DATE.equals(schFld.getType().getNumberType())) {
-      //try to transform the date
-      try {
-        Date date = ExtractionDateUtil.parseDate(val, dateFormats); // may throw
-        result = date.toInstant().toString();//ISO format
-      } catch (Exception e) {
-        // Let the specific fieldType handle errors
-        // throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
-      }
-    }
-    return result;
-  }
-
   /**
    * Get the name mapping
    *
@@ -20,22 +20,16 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.schema.IndexSchema;
 
-import java.util.Collection;
-
 /**
  *
  *
 **/
 public class SolrContentHandlerFactory {
-  protected Collection<String> dateFormats;
 
-  public SolrContentHandlerFactory(Collection<String> dateFormats) {
-    this.dateFormats = dateFormats;
-  }
+  public SolrContentHandlerFactory() { }
 
   public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
-    return new SolrContentHandler(metadata, params, schema,
-        dateFormats);
+    return new SolrContentHandler(metadata, params, schema);
   }
 }
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.extraction;
-
-import java.text.ParseException;
-import java.util.Date;
-import java.util.Locale;
-
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestExtractionDateUtil extends LuceneTestCase {
-
-  public void testISO8601() throws Exception {
-    // dates with atypical years
-    assertParseFormatEquals("0001-01-01T01:01:01Z", null);
-    assertParseFormatEquals("+12021-12-01T03:03:03Z", null);
-
-    assertParseFormatEquals("0000-04-04T04:04:04Z", null); // note: 0 AD is also known as 1 BC
-
-    // dates with negative years (BC)
-    assertParseFormatEquals("-0005-05-05T05:05:05Z", null);
-    assertParseFormatEquals("-2021-12-01T04:04:04Z", null);
-    assertParseFormatEquals("-12021-12-01T02:02:02Z", null);
-
-    // dates that only parse thanks to lenient mode of DateTimeFormatter
-    assertParseFormatEquals("10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z"); // missing '+' 5 digit year
-    assertParseFormatEquals("995-1-2T3:4:5Z", "0995-01-02T03:04:05Z"); // wasn't 0 padded
-  }
-
-  private static void assertParseFormatEquals(String inputStr, String expectedStr) throws ParseException {
-    if (expectedStr == null) {
-      expectedStr = inputStr;
-    }
-    Date inputDate = ExtractionDateUtil.parseDate(inputStr);
-    String resultStr = inputDate.toInstant().toString();
-    assertEquals("d:" + inputDate.getTime(), expectedStr, resultStr);
-  }
-
-  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12593")
-  public void testParseDate() throws ParseException {
-    assertParsedDate(1226583351000L, "Thu Nov 13 04:35:51 AKST 2008");
-  }
-
-  private static void assertParsedDate(long ts, String dateStr) throws ParseException {
-    long parsed = ExtractionDateUtil.parseDate(dateStr).getTime();
-    assertTrue(String.format(Locale.ENGLISH, "Incorrect parsed timestamp: %d != %d (%s)", ts, parsed, dateStr), Math.abs(ts - parsed) <= 1000L);
-  }
-}
@@ -140,6 +140,7 @@
     <dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
     <dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
     <dynamicField name="random_*" type="random"/>
+    <dynamicField name="ignored_*" type="ignored"/>
 
     <!-- Type used for data-driven schema, to add a string copy for each text field -->
     <dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
@@ -148,7 +149,7 @@
     <dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
     <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
     <dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
 
     <!-- payloaded dynamic fields -->
     <dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
     <dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
@@ -207,13 +208,16 @@
     <fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
     <fieldType name="plong" class="solr.LongPointField" docValues="true"/>
     <fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
 
     <fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
     <fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
     <fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
     <fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
     <fieldType name="random" class="solr.RandomSortField" indexed="true"/>
 
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright. -->
+    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
 
     <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
          is a more restricted form of the canonical representation of dateTime
@@ -797,7 +797,6 @@
                   class="solr.extraction.ExtractingRequestHandler" >
     <lst name="defaults">
       <str name="lowernames">true</str>
-      <str name="fmap.meta">ignored_</str>
       <str name="fmap.content">_text_</str>
     </lst>
   </requestHandler>
@@ -140,6 +140,7 @@
    <dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
    <dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
    <dynamicField name="random_*" type="random"/>
+    <dynamicField name="ignored_*" type="ignored"/>
 
    <!-- Type used for data-driven schema, to add a string copy for each text field -->
    <dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false"/>
@@ -148,7 +149,7 @@
    <dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
    <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
    <dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
 
    <!-- payloaded dynamic fields -->
    <dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
    <dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
@@ -207,13 +208,16 @@
    <fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
    <fieldType name="plong" class="solr.LongPointField" docValues="true"/>
    <fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
 
    <fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
    <fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
    <fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
    <fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true"/>
 
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright. -->
+    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
 
    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
         is a more restricted form of the canonical representation of dateTime
@@ -797,7 +797,6 @@
                  class="solr.extraction.ExtractingRequestHandler" >
    <lst name="defaults">
      <str name="lowernames">true</str>
-      <str name="fmap.meta">ignored_</str>
      <str name="fmap.content">_text_</str>
    </lst>
  </requestHandler>
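With `fmap.meta` removed from these defaults, unknown Tika metadata fields are no longer routed into `ignored_` by the handler configuration itself. A request can opt back into that behavior with `uprefix`, or the defaults block could carry it; a sketch of the latter (my assumption, not part of this commit):

[source,xml]
----
<lst name="defaults">
  <str name="lowernames">true</str>
  <str name="fmap.content">_text_</str>
  <!-- Assumed addition: route fields unknown to the schema into the ignored_* dynamic field -->
  <str name="uprefix">ignored_</str>
</lst>
----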
@@ -26,41 +26,50 @@ If you want to supply your own `ContentHandler` for Solr to use, you can extend
 
 When using the Solr Cell framework, it is helpful to keep the following in mind:
 
-* Tika will automatically attempt to determine the input document type (Word, PDF, HTML) and extract the content appropriately. If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
-* Tika works by producing an XHTML stream that it feeds to a SAX ContentHandler. SAX is a common interface implemented for many different XML parsers. For more information, see http://www.saxproject.org/quickstart.html.
-* Solr then responds to Tika's SAX events and creates the fields to index.
-* Tika produces metadata such as Title, Subject, and Author according to specifications such as the DublinCore. See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
-* Tika adds all the extracted text to the `content` field.
-* You can map Tika's metadata fields to Solr fields.
-* You can pass in literals for field values. Literals will override Tika-parsed values, including fields in the Tika metadata object, the Tika content field, and any "captured content" fields.
-* You can apply an XPath expression to the Tika XHTML to restrict the content that is produced.
+* Tika will automatically attempt to determine the input document type (e.g., Word, PDF, HTML) and extract the content appropriately.
+If you like, you can explicitly specify a MIME type for Tika with the `stream.type` parameter.
+See http://tika.apache.org/{ivy-tika-version}/formats.html for the file types supported.
+* Briefly, Tika internally works by synthesizing an XHTML document from the core content of the parsed document, which is passed to a configured http://www.saxproject.org/quickstart.html[SAX] ContentHandler provided by Solr Cell.
+Solr responds to Tika's SAX events to create one or more text fields from the content.
+Tika exposes document metadata as well (apart from the XHTML).
+* Tika produces metadata such as Title, Subject, and Author according to specifications such as Dublin Core.
+The metadata available is highly dependent on the file types and what they in turn contain.
+Solr Cell supplies some metadata of its own too.
+* Solr Cell concatenates text from the internal XHTML into a `content` field.
+You can configure which elements should be included/ignored, and which should map to another field.
+* Solr Cell maps each piece of metadata onto a field.
+By default it maps to the same name, but several parameters control how this is done.
+* When Solr Cell finishes creating the internal `SolrInputDocument`, the rest of the Lucene/Solr indexing stack takes over.
+The next step after any update handler is the <<update-request-processors.adoc#update-request-processors,Update Request Processor>> chain.
 
-[TIP]
+[NOTE]
 ====
 While Apache Tika is quite powerful, it is not perfect and fails on some files. PDF files are particularly problematic, mostly due to the PDF format itself. In case of a failure processing any file, the `ExtractingRequestHandler` does not have a secondary mechanism to try to extract some text from the file; it will throw an exception and fail.
 ====
 
-== Trying out Tika with the Solr techproducts Example
+== Trying out Tika
 
-You can try out the Tika framework using the `techproducts` example included in Solr.
+You can try out the Tika framework using the `schemaless` example included in Solr.
+This will simply create a core/collection "gettingstarted" with the default configSet.
 
 Start the example:
 
 [source,bash]
 ----
-bin/solr -e techproducts
+bin/solr -e schemaless
 ----
 
 You can now use curl to send a sample PDF file via HTTP POST:
 
 [source,bash]
 ----
-curl 'http://localhost:8983/solr/techproducts/update/extract?literal.id=doc1&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
+curl 'http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc1&uprefix=ignored_&commit=true' -F "myfile=@example/exampledocs/solr-word.pdf"
 ----
 
 The URL above calls the Extracting Request Handler, uploads the file `solr-word.pdf` and assigns it the unique ID `doc1`. Here's a closer look at the components of this command:
 
-* The `literal.id=doc1` parameter provides the necessary unique ID for the document being indexed.
+* The `literal.id=doc1` parameter provides a unique ID for the document being indexed.
+There are alternatives to this, like mapping a metadata field to the ID, generating a new UUID, or generating an ID from a signature (hash) of the content.
 
 * The `commit=true` parameter causes Solr to perform a commit after indexing the document, making it immediately searchable. For optimum performance when loading many documents, don't call the commit command until you are done.
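For orientation, a minimal sketch of the handler registration the walkthrough above relies on, mirroring the solrconfig.xml hunks earlier in this commit (the `name` and `startup` attributes are assumptions; they do not appear in those hunks):

[source,xml]
----
<!-- Sketch of the /update/extract registration in the default configSet;
     name and startup are assumed, the defaults mirror the hunk above. -->
<requestHandler name="/update/extract" startup="lazy"
                class="solr.extraction.ExtractingRequestHandler">
  <lst name="defaults">
    <str name="lowernames">true</str>
    <str name="fmap.content">_text_</str>
  </lst>
</requestHandler>
----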
@@ -68,25 +77,36 @@ The URL above calls the Extracting Request Handler, uploads the file `solr-word.
 
 * The argument `myfile=@tutorial.html` needs a valid path, which can be absolute or relative.
 
-You can also use `bin/post` to send a PDF file into Solr (without the params, the `literal.id` parameter would be set to the absolute path to the file):
+You can also use `bin/post` to send a PDF file into Solr (without the params, the post tool would set `literal.id` to the absolute path to the file):
 
 [source,bash]
 ----
-bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=a"
+bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1"
 ----
 
-Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/techproducts/select?q=pdf`.
+Now you should be able to execute a query and find that document. You can make a request like `\http://localhost:8983/solr/gettingstarted/select?q=pdf`.
 
-You may notice that although the content of the sample document has been indexed and stored, there are not a lot of metadata fields associated with this document. This is because unknown fields are ignored according to the default parameters configured for the `/update/extract` handler in `solrconfig.xml`, and this behavior can be easily changed or overridden. For example, to store and see all metadata and content, execute the following:
+You may notice there are many metadata fields associated with this document.
+Solr's configuration is by default in "schemaless" (data driven) mode, and thus all extracted metadata fields get their own field.
+You might instead want to ignore them generally, except for a few you specify.
+To do that, use the `uprefix` parameter to map unknown (to the schema) metadata field names to a schema field name that is effectively ignored.
+The dynamic field `ignored_*` is good for this purpose.
+For the fields you do want to map, explicitly set them using `fmap.IN=OUT` and/or ensure the field is defined in the schema.
+Here's an example:
 
 [source,bash]
 ----
-bin/post -c techproducts example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=attr_"
+bin/post -c gettingstarted example/exampledocs/solr-word.pdf -params "literal.id=doc1&uprefix=ignored_&fmap.last_modified=last_modified_dt"
 ----
 
-In this command, the `uprefix=attr_` parameter causes all generated fields that aren't defined in the schema to be prefixed with `attr_`, which is a dynamic field that is stored and indexed.
-
-This command allows you to query the document using an attribute, as in: `\http://localhost:8983/solr/techproducts/select?q=attr_meta:microsoft`.
+[NOTE]
+====
+This won't have the intended effect if you run it at this point in the sequence of this tutorial.
+Previously we added the document without these parameters; schemaless mode automatically added all fields at that time.
+"uprefix" only applies to fields that are _undefined_ (hence the 'u' in "uprefix"), so these won't be prefixed now.
+However, you will see the new "last_modified_dt" field.
+The easiest way to try this properly is to start over by deleting `example/schemaless/` (while Solr is stopped).
+====
@@ -101,9 +121,6 @@ Indexes attributes of the Tika XHTML elements into separate fields, named after
 
 `commitWithin`::
 Add the document within the specified number of milliseconds.
 
-`date.formats`::
-Defines the date format patterns to identify in the documents.
-
 `defaultField`::
 If the `uprefix` parameter (see below) is not specified and a field cannot be determined, the default field will be used.
@@ -144,7 +161,7 @@ Defines a password to use for a password-protected PDF or OOXML file
 Defines a file path and name to a customized Tika configuration file. This is only required if you have customized your Tika implementation.
 
 `uprefix`::
-Prefixes all fields that are not defined in the schema with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika, given the example schema contains `<dynamicField name="ignored_*" type="ignored"/>`.
+Prefixes all fields _that are undefined in the schema_ with the given prefix. This is very useful when combined with dynamic field definitions. Example: `uprefix=ignored_` would effectively ignore all unknown fields generated by Tika, given the default schema contains `<dynamicField name="ignored_*" type="ignored"/>`.
 
 `xpath`::
 When extracting, only return Tika XHTML content that satisfies the given XPath expression. See http://tika.apache.org/{ivy-tika-version}/ for details on the format of Tika XHTML. See also http://wiki.apache.org/solr/TikaExtractOnlyExampleOutput.
@@ -160,7 +177,7 @@ Here is the order in which the Solr Cell framework, using the Extracting Request
 
 == Configuring the Solr ExtractingRequestHandler
 
-If you are not working with the supplied `sample_techproducts_configs` or `_default` <<config-sets.adoc#config-sets,config set>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
+If you are not working with the supplied <<config-sets.adoc#config-sets,config sets>>, you must configure your own `solrconfig.xml` to know about the JARs containing the `ExtractingRequestHandler` and its dependencies:
 
 [source,xml]
 ----
@@ -179,11 +196,6 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
     </lst>
     <!--Optional. Specify a path to a tika configuration file. See the Tika docs for details.-->
     <str name="tika.config">/my/path/to/tika.config</str>
-    <!-- Optional. Specify one or more date formats to parse. See DateUtil.DEFAULT_DATE_FORMATS
-         for default date formats -->
-    <lst name="date.formats">
-      <str>yyyy-MM-dd</str>
-    </lst>
     <!-- Optional. Specify an external file containing parser-specific properties.
          This file is located in the same directory as solrconfig.xml by default.-->
     <str name="parseContext.config">parseContext.xml</str>
@@ -192,17 +204,16 @@ You can then configure the `ExtractingRequestHandler` in `solrconfig.xml`.
 
 In the defaults section, we are mapping Tika's Last-Modified Metadata attribute to a field named `last_modified`. We are also telling it to ignore undeclared fields. These defaults can all be overridden at request time.
 
-The `tika.config` entry points to a file containing a Tika configuration. The `date.formats` allows you to specify various `java.text.SimpleDateFormat` date formats for working with transforming extracted input to a Date. Solr comes configured with the following date formats (see the `DateUtil` in Solr):
+The `tika.config` entry points to a file containing a Tika configuration.
 
-* `yyyy-MM-dd'T'HH:mm:ss'Z'`
-* `yyyy-MM-dd'T'HH:mm:ss`
-* `yyyy-MM-dd`
-* `yyyy-MM-dd hh:mm:ss`
-* `yyyy-MM-dd HH:mm:ss`
-* `EEE MMM d hh:mm:ss z yyyy`
-* `EEE, dd MMM yyyy HH:mm:ss zzz`
-* `EEEE, dd-MMM-yy HH:mm:ss zzz`
-* `EEE MMM d HH:mm:ss yyyy`
+[TIP]
+====
+You likely need to have <<update-request-processors.adoc#update-request-processors,Update Request Processors>> (URPs) that parse numbers and dates and do other manipulations on the metadata fields generated by Solr Cell.
+In Solr's default configuration, "schemaless" (data driven) mode is enabled, which does a variety of such processing already.
+_If you don't use this mode_, you can still selectively specify the desired URPs.
+An easy way to specify this is to configure the parameter `processor` (under `defaults`) to `uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date`.
+That suggested list was taken right from the `add-unknown-fields-to-the-schema` URP chain, excluding `add-schema-fields`.
+====
 
 === Parser-Specific Properties
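A sketch of the `processor` parameter suggested in the tip above, placed in the handler's defaults (assuming the named URPs are registered, as they are in the default configSet; the handler attributes are assumed as before):

[source,xml]
----
<requestHandler name="/update/extract" startup="lazy"
                class="solr.extraction.ExtractingRequestHandler">
  <lst name="defaults">
    <str name="lowernames">true</str>
    <str name="fmap.content">_text_</str>
    <!-- Run the schemaless parsing URPs, minus add-schema-fields, on extracted fields -->
    <str name="processor">uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date</str>
  </lst>
</requestHandler>
----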
@@ -270,7 +281,7 @@ The command below captures `<div>` tags separately, and then maps all the instan
 
 [source,bash]
 ----
-bin/post -c techproducts example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
+bin/post -c gettingstarted example/exampledocs/sample.html -params "literal.id=doc2&captureAttr=true&defaultField=_text_&fmap.div=foo_t&capture=div"
 ----
 
 ==== Using Literals to Define Your Own Metadata
@@ -279,7 +290,7 @@ To add in your own metadata, pass in the literal parameter along with the file:
 
 [source,bash]
 ----
-bin/post -c techproducts -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
+bin/post -c gettingstarted -params "literal.id=doc4&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&literal.blah_s=Bah" example/exampledocs/sample.html
 ----
 
 ==== XPath Expressions
@@ -288,7 +299,7 @@ The example below passes in an XPath expression to restrict the XHTML returned b
 
 [source,bash]
 ----
-bin/post -c techproducts -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
+bin/post -c gettingstarted -params "literal.id=doc5&captureAttr=true&defaultField=text&capture=div&fmap.div=foo_t&xpath=/xhtml:html/xhtml:body/xhtml:div//node()" example/exampledocs/sample.html
 ----
 
 === Extracting Data without Indexing It
@@ -299,14 +310,14 @@ The example below sets the `extractOnly=true` parameter to extract data without
 
 [source,bash]
 ----
-curl "http://localhost:8983/solr/techproducts/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
+curl "http://localhost:8983/solr/gettingstarted/update/extract?&extractOnly=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
 ----
 
 The output includes XML generated by Tika (and further escaped by Solr's XML) using a different output format to make it more readable (`-out yes` instructs the tool to echo Solr's output to the console):
 
 [source,bash]
 ----
-bin/post -c techproducts -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
+bin/post -c gettingstarted -params "extractOnly=true&wt=ruby&indent=true" -out yes example/exampledocs/sample.html
 ----
 
 == Sending Documents to Solr with a POST
@@ -315,10 +326,10 @@ The example below streams the file as the body of the POST, which does not, then
 
 [source,bash]
 ----
-curl "http://localhost:8983/solr/techproducts/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
+curl "http://localhost:8983/solr/gettingstarted/update/extract?literal.id=doc6&defaultField=text&commit=true" --data-binary @example/exampledocs/sample.html -H 'Content-type:text/html'
 ----
 
-== Sending Documents to Solr with Solr Cell and SolrJ
+== Sending Documents to Solr with SolrJ
 
 SolrJ is a Java client that you can use to add documents to the index, update the index, or query the index. You'll find more information on SolrJ in <<client-apis.adoc#client-apis,Client APIs>>.