From 381a30b26ca1737123b65aefc685367d1aa038b9 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 28 Jan 2019 13:25:06 -0500 Subject: [PATCH] SOLR-12768: added _nest_path_ to the default schema (thereby enabling nested docs) * new NestPathField encapsulating details for how _nest_path_ is indexed ** tweaked the analysis to index 1 token instead of a variable number of tokens * TokenizerChain has new CustomAnalyzer copy-constructor --- solr/CHANGES.txt | 7 ++ .../apache/solr/analysis/TokenizerChain.java | 13 ++++ .../transform/ChildDocTransformerFactory.java | 15 +++-- .../org/apache/solr/schema/FieldType.java | 5 +- .../org/apache/solr/schema/NestPathField.java | 66 +++++++++++++++++++ .../NestedUpdateProcessorFactory.java | 10 +-- .../solr/collection1/conf/schema-nest.xml | 20 ++---- .../configsets/_default/conf/managed-schema | 6 ++ .../TestChildDocTransformerHierarchy.java | 45 ++++++++++--- .../update/TestNestedUpdateProcessor.java | 37 +++++++---- .../configsets/_default/conf/managed-schema | 6 ++ 11 files changed, 183 insertions(+), 47 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/schema/NestPathField.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 169081d25f3..9fae7ad05d4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -149,6 +149,13 @@ New Features * SOLR-12639: Umbrella JIRA for adding support HTTP/2 (Cao Manh Dat) +* SOLR-12768: Improved nested document support, and enabled in the default schema with the presence of _nest_path_. + When this field is present, certain things happen automatically. An internal URP is automatically used to populate + it. The [child] (doc transformer) will return a hierarchy with relationships; no params needed. The relationship + path is indexed for use in queries (can be disabled if not needed). Also, child documents needn't provide a uniqueKey + value as Solr will supply one automatically by concatenating a path to that of the parent document's key. + (David Smiley, Moshe Bla).
+ Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java index 95755e13b75..993804544be 100644 --- a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java +++ b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; @@ -30,6 +31,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory; * An analyzer that uses a tokenizer and a list of token filters to * create a TokenStream. * + * It should probably be replaced with {@link CustomAnalyzer}. * @since 3.1 */ public final class TokenizerChain extends SolrAnalyzer { @@ -40,6 +42,17 @@ public final class TokenizerChain extends SolrAnalyzer { final private TokenizerFactory tokenizer; final private TokenFilterFactory[] filters; + /** Copy from CustomAnalyzer. */ + public TokenizerChain(CustomAnalyzer customAnalyzer) { + this( + customAnalyzer.getCharFilterFactories().toArray(new CharFilterFactory[0]), + customAnalyzer.getTokenizerFactory(), + customAnalyzer.getTokenFilterFactories().toArray(new TokenFilterFactory[0])); + setPositionIncrementGap(customAnalyzer.getPositionIncrementGap(null)); + setVersion(customAnalyzer.getVersion()); + assert customAnalyzer.getOffsetGap(null) == 1; // note: we don't support setting the offset gap + } + /** * Creates a new TokenizerChain w/o any CharFilterFactories. 
* diff --git a/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java b/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java index 82be49dfa33..b38565e0be4 100644 --- a/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java +++ b/solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformerFactory.java @@ -150,19 +150,24 @@ public class ChildDocTransformerFactory extends TransformerFactory { // NOTE: THIS FEATURE IS PRESENTLY EXPERIMENTAL; WAIT TO SEE IT IN THE REF GUIDE. FINAL SYNTAX IS TBD. protected static String processPathHierarchyQueryString(String queryString) { // if the filter includes a path string, build a lucene query string to match those specific child documents. - // e.g. toppings/ingredients/name_s:cocoa -> +_nest_path_:"toppings/ingredients/" +(name_s:cocoa) + // e.g. /toppings/ingredients/name_s:cocoa -> +_nest_path_:/toppings/ingredients +(name_s:cocoa) + // ingredients/name_s:cocoa -> +_nest_path_:*/ingredients +(name_s:cocoa) int indexOfFirstColon = queryString.indexOf(':'); if (indexOfFirstColon <= 0) { return queryString;// give up } int indexOfLastPathSepChar = queryString.lastIndexOf(PATH_SEP_CHAR, indexOfFirstColon); if (indexOfLastPathSepChar < 0) { - return queryString; + // regular filter, not hierarchy based. 
+ return ClientUtils.escapeQueryChars(queryString.substring(0, indexOfFirstColon)) + + ":" + ClientUtils.escapeQueryChars(queryString.substring(indexOfFirstColon + 1)); } - String path = queryString.substring(0, indexOfLastPathSepChar + 1); - String remaining = queryString.substring(indexOfLastPathSepChar + 1); + final boolean isAbsolutePath = queryString.charAt(0) == PATH_SEP_CHAR; + String path = ClientUtils.escapeQueryChars(queryString.substring(0, indexOfLastPathSepChar)); + String remaining = queryString.substring(indexOfLastPathSepChar + 1); // last part of path hierarchy + return - "+" + NEST_PATH_FIELD_NAME + ":" + ClientUtils.escapeQueryChars(path) + "+" + NEST_PATH_FIELD_NAME + (isAbsolutePath? ":": ":*\\/") + path + " +(" + remaining + ")"; } } diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 8bcf839aaf3..f960d8eb404 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -164,7 +164,10 @@ public abstract class FieldType extends FieldProperties { return false; } - // Handle additional arguments... + /** + * Initializes the field type. Subclasses should usually override {@link #init(IndexSchema, Map)} + * which is called by this method. + */ protected void setArgs(IndexSchema schema, Map args) { // default to STORED, INDEXED, OMIT_TF_POSITIONS and MULTIVALUED depending on schema version properties = (STORED | INDEXED); diff --git a/solr/core/src/java/org/apache/solr/schema/NestPathField.java b/solr/core/src/java/org/apache/solr/schema/NestPathField.java new file mode 100644 index 00000000000..926aa7e13a1 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/schema/NestPathField.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.schema; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory; +import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.common.SolrException; + +/** + * To be used for field {@link IndexSchema#NEST_PATH_FIELD_NAME} for enhanced + * nested doc information. By defining a field type, we can encapsulate the configuration + * here so that the schema is free of it. Alternatively, some notion of "implicit field types" + * would be cool and a more general way of accomplishing this. 
+ * + * @see org.apache.solr.update.processor.NestedUpdateProcessorFactory + * @since 8.0 + */ +public class NestPathField extends SortableTextField { + + @Override + public void setArgs(IndexSchema schema, Map args) { + args.putIfAbsent("stored", "false"); + args.putIfAbsent("omitTermFreqAndPositions", "true"); + args.putIfAbsent("omitNorms", "true"); + args.putIfAbsent("maxCharsForDocValues", "-1"); + super.setArgs(schema, args); + + // CustomAnalyzer is easy to use + CustomAnalyzer customAnalyzer; + try { + customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader()) + .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion()) + .withTokenizer(KeywordTokenizerFactory.class) + .addTokenFilter(PatternReplaceFilterFactory.class, + "pattern", "#\\d*", + "replace", "all") + .build(); + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);//impossible? + } + // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead + setIndexAnalyzer(new TokenizerChain(customAnalyzer)); + // leave queryAnalyzer as literal + } + +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/NestedUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/NestedUpdateProcessorFactory.java index aa459bd7094..af109f77a0b 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/NestedUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/NestedUpdateProcessorFactory.java @@ -45,7 +45,7 @@ public class NestedUpdateProcessorFactory extends UpdateRequestProcessorFactory if(!(storeParent || storePath)) { return next; } - return new NestedUpdateProcessor(req, shouldStoreDocParent(req.getSchema()), shouldStoreDocPath(req.getSchema()), next); + return new NestedUpdateProcessor(req, storeParent, storePath, next); } private static boolean shouldStoreDocParent(IndexSchema schema) { @@ -100,10 +100,10 @@ public class NestedUpdateProcessorFactory extends 
UpdateRequestProcessorFactory String parentDocId = doc.getField(uniqueKeyFieldName).getFirstValue().toString(); cDoc.setField(uniqueKeyFieldName, generateChildUniqueId(parentDocId, fieldName, sChildNum)); } - final String lastKeyPath = fieldName + NUM_SEP_CHAR + sChildNum; - // concat of all paths children.grandChild => children#1/grandChild# - final String childDocPath = fullPath == null ? lastKeyPath : fullPath + PATH_SEP_CHAR + lastKeyPath; - processChildDoc((SolrInputDocument) val, doc, childDocPath); + final String lastKeyPath = PATH_SEP_CHAR + fieldName + NUM_SEP_CHAR + sChildNum; + // concat of all paths children.grandChild => /children#1/grandChild# + final String childDocPath = fullPath == null ? lastKeyPath : fullPath + lastKeyPath; + processChildDoc(cDoc, doc, childDocPath); ++childNum; } } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-nest.xml b/solr/core/src/test-files/solr/collection1/conf/schema-nest.xml index 313e58654b4..d20d734c1f5 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-nest.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-nest.xml @@ -29,9 +29,9 @@ - + - + @@ -39,6 +39,8 @@ + + @@ -46,20 +48,6 @@ - - - - - - - - - - - - - - id diff --git a/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema b/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema index b2e32727740..5b9b9bfa8d4 100644 --- a/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema +++ b/solr/core/src/test-files/solr/configsets/_default/conf/managed-schema @@ -113,7 +113,13 @@ + + + + + + + + + + + +