From ed573a067baea8a9f695e870daf94e32a6e78491 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Fri, 13 May 2011 13:34:18 +0000 Subject: [PATCH] SOLR-2511: slight refactoring to make it easier to override git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102718 13f79535-47bb-0310-9956-ffa450edef68 --- .../extraction/SolrContentHandler.java | 96 +++++++++++++------ 1 file changed, 66 insertions(+), 30 deletions(-) diff --git a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 80fa6c964c4..552577c397e 100644 --- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -47,23 +47,23 @@ import java.util.*; */ public class SolrContentHandler extends DefaultHandler implements ExtractingParams { private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class); - private SolrInputDocument document; + protected SolrInputDocument document; - private Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; + protected Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; - private Metadata metadata; - private SolrParams params; - private StringBuilder catchAllBuilder = new StringBuilder(2048); - private IndexSchema schema; - private Map fieldBuilders = Collections.emptyMap(); + protected Metadata metadata; + protected SolrParams params; + protected StringBuilder catchAllBuilder = new StringBuilder(2048); + protected IndexSchema schema; + protected Map fieldBuilders = Collections.emptyMap(); private LinkedList bldrStack = new LinkedList(); - private boolean captureAttribs; - private boolean lowerNames; - private String contentFieldName = "content"; + protected boolean captureAttribs; + protected boolean lowerNames; + protected String contentFieldName = "content"; - private String unknownFieldPrefix = ""; - private String defaultField = ""; + protected String unknownFieldPrefix = ""; + protected String defaultField = ""; public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS); @@ -99,16 +99,57 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara * The base implementation adds the metadata as fields, allowing for potential remapping. * * @return The {@link org.apache.solr.common.SolrInputDocument}. + * + * @see #addMetadata() + * @see #addCapturedContent() + * @see #addContent() + * @see #addLiterals() */ public SolrInputDocument newDocument() { float boost = 1.0f; //handle the metadata extracted from the document - for (String name : metadata.names()) { - String[] vals = metadata.getValues(name); - addField(name, null, vals); - } + addMetadata(); //handle the literals from the params + addLiterals(); + + + //add in the content + addContent(); + + //add in the captured content + addCapturedContent(); + + if (log.isDebugEnabled()) { + log.debug("Doc: {}", document); + } + return document; + } + + /** + * Add the per field captured content to the Solr Document. Default implementation uses the + * {@link #fieldBuilders} info + */ + protected void addCapturedContent() { + for (Map.Entry entry : fieldBuilders.entrySet()) { + if (entry.getValue().length() > 0) { + addField(entry.getKey(), entry.getValue().toString(), null); + } + } + } + + /** + * Add in the catch all content to the field. Default impl. uses the {@link #contentFieldName} + * and the {@link #catchAllBuilder} + */ + protected void addContent() { + addField(contentFieldName, catchAllBuilder.toString(), null); + } + + /** + * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. + */ + protected void addLiterals() { Iterator paramNames = params.getParameterNamesIterator(); while (paramNames.hasNext()) { String pname = paramNames.next(); @@ -117,28 +158,23 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara String name = pname.substring(LITERALS_PREFIX.length()); addField(name, null, params.getParams(pname)); } + } - - //add in the content - addField(contentFieldName, catchAllBuilder.toString(), null); - - //add in the captured content - for (Map.Entry entry : fieldBuilders.entrySet()) { - if (entry.getValue().length() > 0) { - addField(entry.getKey(), entry.getValue().toString(), null); - } + /** + * Add in any metadata using {@link #metadata} as the source. + */ + protected void addMetadata() { + for (String name : metadata.names()) { + String[] vals = metadata.getValues(name); + addField(name, null, vals); } - if (log.isDebugEnabled()) { - log.debug("Doc: " + document); - } - return document; } // Naming rules: // 1) optionally map names to nicenames (lowercase+underscores) // 2) execute "map" commands // 3) if resulting field is unknown, map it to a common prefix - private void addField(String fname, String fval, String[] vals) { + protected void addField(String fname, String fval, String[] vals) { if (lowerNames) { StringBuilder sb = new StringBuilder(); for (int i=0; i