SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1662457 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2015-02-26 14:26:38 +00:00
parent 46ede7622e
commit 09a0dcb6c6
2 changed files with 40 additions and 33 deletions

View File

@ -156,6 +156,9 @@ Bug Fixes
* SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase. * SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase.
(Pablo Queixalos, shalin) (Pablo Queixalos, shalin)
* SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events.
(Chris Mattman, Uwe Schindler)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -17,6 +17,18 @@
package org.apache.solr.handler.extraction; package org.apache.solr.handler.extraction;
import java.text.DateFormat;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil; import org.apache.solr.common.util.DateUtil;
@ -31,14 +43,13 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.DefaultHandler;
import java.text.DateFormat;
import java.util.*;
/** /**
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s. * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
* <B>This class is not thread-safe.</B> * <B>This class is not thread-safe.</B>
* <p> * <p>
* This class cannot be reused; you have to create a new instance per document!
* <p>
* Users may wish to override this class to provide their own functionality. * Users may wish to override this class to provide their own functionality.
* *
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
@ -46,27 +57,30 @@ import java.util.*;
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
*/ */
public class SolrContentHandler extends DefaultHandler implements ExtractingParams { public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class); private transient static final Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
protected SolrInputDocument document;
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS; public static final String contentFieldName = "content";
protected Metadata metadata; protected final SolrInputDocument document;
protected SolrParams params;
protected StringBuilder catchAllBuilder = new StringBuilder(2048);
protected IndexSchema schema;
protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
private LinkedList<StringBuilder> bldrStack = new LinkedList<>();
protected boolean captureAttribs; protected final Collection<String> dateFormats;
protected boolean lowerNames;
protected String contentFieldName = "content";
protected String unknownFieldPrefix = ""; protected final Metadata metadata;
protected String defaultField = ""; protected final SolrParams params;
protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
protected final IndexSchema schema;
protected final Map<String, StringBuilder> fieldBuilders;
private final Deque<StringBuilder> bldrStack = new ArrayDeque<>();
private boolean literalsOverride; protected final boolean captureAttribs;
private Set<String> literalFieldNames; protected final boolean lowerNames;
protected final String unknownFieldPrefix;
protected final String defaultField;
private final boolean literalsOverride;
private Set<String> literalFieldNames = null;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS); this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@ -75,7 +89,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
public SolrContentHandler(Metadata metadata, SolrParams params, public SolrContentHandler(Metadata metadata, SolrParams params,
IndexSchema schema, Collection<String> dateFormats) { IndexSchema schema, Collection<String> dateFormats) {
document = new SolrInputDocument(); this.document = new SolrInputDocument();
this.metadata = metadata; this.metadata = metadata;
this.params = params; this.params = params;
this.schema = schema; this.schema = schema;
@ -86,12 +100,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true); this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, ""); this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
this.defaultField = params.get(DEFAULT_FIELD, ""); this.defaultField = params.get(DEFAULT_FIELD, "");
String[] captureFields = params.getParams(CAPTURE_ELEMENTS); String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
if (captureFields != null && captureFields.length > 0) { if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<>(); fieldBuilders = new HashMap<>();
for (int i = 0; i < captureFields.length; i++) { for (int i = 0; i < captureFields.length; i++) {
fieldBuilders.put(captureFields[i], new StringBuilder()); fieldBuilders.put(captureFields[i], new StringBuilder());
} }
} else {
fieldBuilders = Collections.emptyMap();
} }
bldrStack.add(catchAllBuilder); bldrStack.add(catchAllBuilder);
} }
@ -253,19 +270,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value "); // if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
} }
@Override
public void startDocument() throws SAXException {
document.clear();
catchAllBuilder.setLength(0);
for (StringBuilder builder : fieldBuilders.values()) {
builder.setLength(0);
}
bldrStack.clear();
bldrStack.add(catchAllBuilder);
}
@Override @Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
StringBuilder theBldr = fieldBuilders.get(localName); StringBuilder theBldr = fieldBuilders.get(localName);