mirror of https://github.com/apache/lucene.git
SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1662457 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
46ede7622e
commit
09a0dcb6c6
|
@ -156,6 +156,9 @@ Bug Fixes
|
||||||
* SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase.
|
* SOLR-7128: Two phase distributed search is fetching extra fields in GET_TOP_IDS phase.
|
||||||
(Pablo Queixalos, shalin)
|
(Pablo Queixalos, shalin)
|
||||||
|
|
||||||
|
* SOLR-7139: Fix SolrContentHandler for TIKA to ignore multiple startDocument events.
|
||||||
|
(Chris Mattmann, Uwe Schindler)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,18 @@
|
||||||
|
|
||||||
package org.apache.solr.handler.extraction;
|
package org.apache.solr.handler.extraction;
|
||||||
|
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.solr.common.SolrInputDocument;
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
import org.apache.solr.common.params.SolrParams;
|
import org.apache.solr.common.params.SolrParams;
|
||||||
import org.apache.solr.common.util.DateUtil;
|
import org.apache.solr.common.util.DateUtil;
|
||||||
|
@ -31,14 +43,13 @@ import org.xml.sax.Attributes;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
import org.xml.sax.helpers.DefaultHandler;
|
import org.xml.sax.helpers.DefaultHandler;
|
||||||
|
|
||||||
import java.text.DateFormat;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
|
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
|
||||||
* <B>This class is not thread-safe.</B>
|
* <B>This class is not thread-safe.</B>
|
||||||
* <p>
|
* <p>
|
||||||
|
* This class cannot be reused; you have to create a new instance per document!
|
||||||
|
* <p>
|
||||||
* Users may wish to override this class to provide their own functionality.
|
* Users may wish to override this class to provide their own functionality.
|
||||||
*
|
*
|
||||||
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
|
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
|
||||||
|
@ -46,27 +57,30 @@ import java.util.*;
|
||||||
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
|
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
|
||||||
*/
|
*/
|
||||||
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
|
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
|
||||||
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
|
private transient static final Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
|
||||||
protected SolrInputDocument document;
|
|
||||||
|
|
||||||
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
public static final String contentFieldName = "content";
|
||||||
|
|
||||||
protected Metadata metadata;
|
protected final SolrInputDocument document;
|
||||||
protected SolrParams params;
|
|
||||||
protected StringBuilder catchAllBuilder = new StringBuilder(2048);
|
|
||||||
protected IndexSchema schema;
|
|
||||||
protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
|
|
||||||
private LinkedList<StringBuilder> bldrStack = new LinkedList<>();
|
|
||||||
|
|
||||||
protected boolean captureAttribs;
|
protected final Collection<String> dateFormats;
|
||||||
protected boolean lowerNames;
|
|
||||||
protected String contentFieldName = "content";
|
|
||||||
|
|
||||||
protected String unknownFieldPrefix = "";
|
protected final Metadata metadata;
|
||||||
protected String defaultField = "";
|
protected final SolrParams params;
|
||||||
|
protected final StringBuilder catchAllBuilder = new StringBuilder(2048);
|
||||||
|
protected final IndexSchema schema;
|
||||||
|
protected final Map<String, StringBuilder> fieldBuilders;
|
||||||
|
private final Deque<StringBuilder> bldrStack = new ArrayDeque<>();
|
||||||
|
|
||||||
private boolean literalsOverride;
|
protected final boolean captureAttribs;
|
||||||
private Set<String> literalFieldNames;
|
protected final boolean lowerNames;
|
||||||
|
|
||||||
|
protected final String unknownFieldPrefix;
|
||||||
|
protected final String defaultField;
|
||||||
|
|
||||||
|
private final boolean literalsOverride;
|
||||||
|
|
||||||
|
private Set<String> literalFieldNames = null;
|
||||||
|
|
||||||
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
||||||
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
|
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
|
||||||
|
@ -75,7 +89,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||||
|
|
||||||
public SolrContentHandler(Metadata metadata, SolrParams params,
|
public SolrContentHandler(Metadata metadata, SolrParams params,
|
||||||
IndexSchema schema, Collection<String> dateFormats) {
|
IndexSchema schema, Collection<String> dateFormats) {
|
||||||
document = new SolrInputDocument();
|
this.document = new SolrInputDocument();
|
||||||
this.metadata = metadata;
|
this.metadata = metadata;
|
||||||
this.params = params;
|
this.params = params;
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
|
@ -86,12 +100,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||||
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
|
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
|
||||||
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
|
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
|
||||||
this.defaultField = params.get(DEFAULT_FIELD, "");
|
this.defaultField = params.get(DEFAULT_FIELD, "");
|
||||||
|
|
||||||
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
|
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
|
||||||
if (captureFields != null && captureFields.length > 0) {
|
if (captureFields != null && captureFields.length > 0) {
|
||||||
fieldBuilders = new HashMap<>();
|
fieldBuilders = new HashMap<>();
|
||||||
for (int i = 0; i < captureFields.length; i++) {
|
for (int i = 0; i < captureFields.length; i++) {
|
||||||
fieldBuilders.put(captureFields[i], new StringBuilder());
|
fieldBuilders.put(captureFields[i], new StringBuilder());
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
fieldBuilders = Collections.emptyMap();
|
||||||
}
|
}
|
||||||
bldrStack.add(catchAllBuilder);
|
bldrStack.add(catchAllBuilder);
|
||||||
}
|
}
|
||||||
|
@ -253,19 +270,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||||
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
|
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void startDocument() throws SAXException {
|
|
||||||
document.clear();
|
|
||||||
catchAllBuilder.setLength(0);
|
|
||||||
for (StringBuilder builder : fieldBuilders.values()) {
|
|
||||||
builder.setLength(0);
|
|
||||||
}
|
|
||||||
bldrStack.clear();
|
|
||||||
bldrStack.add(catchAllBuilder);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||||
StringBuilder theBldr = fieldBuilders.get(localName);
|
StringBuilder theBldr = fieldBuilders.get(localName);
|
||||||
|
|
Loading…
Reference in New Issue