From 1bca06b8a938bfe89b288877399942d217c8453d Mon Sep 17 00:00:00 2001 From: "Md. Abdulla-Al-Sun" Date: Thu, 24 Aug 2017 18:05:22 +0600 Subject: [PATCH 01/44] LUCENE-7490: Added bengali language analyzer --- lucene/NOTICE.txt | 5 +- .../lucene/analysis/bn/BengaliAnalyzer.java | 132 +++++++++++++ .../bn/BengaliNormalizationFilter.java | 59 ++++++ .../bn/BengaliNormalizationFilterFactory.java | 55 ++++++ .../lucene/analysis/bn/BengaliNormalizer.java | 155 +++++++++++++++ .../lucene/analysis/bn/BengaliStemFilter.java | 49 +++++ .../analysis/bn/BengaliStemFilterFactory.java | 48 +++++ .../lucene/analysis/bn/BengaliStemmer.java | 183 ++++++++++++++++++ .../lucene/analysis/bn/package-info.java | 21 ++ ...he.lucene.analysis.util.TokenFilterFactory | 2 + .../apache/lucene/analysis/bn/stopwords.txt | 121 ++++++++++++ .../analysis/bn/TestBengaliAnalyzer.java | 55 ++++++ .../analysis/bn/TestBengaliFilters.java | 80 ++++++++ .../analysis/bn/TestBengaliNormalizer.java | 93 +++++++++ .../analysis/bn/TestBengaliStemmer.java | 79 ++++++++ 15 files changed, 1135 insertions(+), 2 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java create mode 100644 lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java diff --git a/lucene/NOTICE.txt b/lucene/NOTICE.txt index 1903adc743d..7e0c54e2995 100644 --- a/lucene/NOTICE.txt +++ b/lucene/NOTICE.txt @@ -54,13 +54,14 @@ The KStem stemmer in was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) under the BSD-license. -The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt See http://members.unine.ch/jacques.savoy/clef/index.html. The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java new file mode 100644 index 00000000000..912c4dd125c --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.core.DecimalDigitFilter; +import org.apache.lucene.analysis.in.IndicNormalizationFilter; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Analyzer for Bengali. + */ +public final class BengaliAnalyzer extends StopwordAnalyzerBase { + private final CharArraySet stemExclusionSet; + + /** + * File containing default Bengali stopwords. + * + * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt + * The stopword list is BSD-Licensed. + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + private static final String STOPWORDS_COMMENT = "#"; + + /** + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. 
+ */ + public static CharArraySet getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /** + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + * accesses the static final set the first time.; + */ + private static class DefaultSetHolder { + static final CharArraySet DEFAULT_STOP_SET; + + static { + try { + DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); + } catch (IOException ex) { + throw new RuntimeException("Unable to load default stopword set"); + } + } + } + + /** + * Builds an analyzer with the given stop words + * + * @param stopwords a stopword set + * @param stemExclusionSet a stemming exclusion set + */ + public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { + super(stopwords); + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); + } + + /** + * Builds an analyzer with the given stop words + * + * @param stopwords a stopword set + */ + public BengaliAnalyzer(CharArraySet stopwords) { + this(stopwords, CharArraySet.EMPTY_SET); + } + + /** + * Builds an analyzer with the default stop words: + * {@link #DEFAULT_STOPWORD_FILE}. + */ + public BengaliAnalyzer() { + this(DefaultSetHolder.DEFAULT_STOP_SET); + } + + /** + * Creates + * {@link TokenStreamComponents} + * used to tokenize all the text in the provided {@link Reader}. + * + * @return {@link TokenStreamComponents} + * built from a {@link StandardTokenizer} filtered with + * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter}, + * {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter} + * if a stem exclusion set is provided, {@link BengaliStemFilter}, and + * Bengali Stop words + */ + @Override + protected TokenStreamComponents createComponents(String fieldName) { + final Tokenizer source = new StandardTokenizer(); + TokenStream result = new LowerCaseFilter(source); + result = new DecimalDigitFilter(result); + if (!stemExclusionSet.isEmpty()) + result = new SetKeywordMarkerFilter(result, stemExclusionSet); + result = new IndicNormalizationFilter(result); + result = new BengaliNormalizationFilter(result); + result = new StopFilter(result, stopwords); + result = new BengaliStemFilter(result); + return new TokenStreamComponents(source, result); + } + + @Override + protected TokenStream normalize(String fieldName, TokenStream in) { + TokenStream result = new StandardFilter(in); + result = new LowerCaseFilter(result); + result = new DecimalDigitFilter(result); + result = new IndicNormalizationFilter(result); + result = new BengaliNormalizationFilter(result); + return result; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java new file mode 100644 index 00000000000..46874b5b588 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +/** + * A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the + * orthography. + *
+ * In some cases the normalization may cause unrelated terms to conflate, so + * to prevent terms from being normalized use an instance of + * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ * @see BengaliNormalizer + */ +public final class BengaliNormalizationFilter extends TokenFilter { + + private final BengaliNormalizer normalizer = new BengaliNormalizer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + + public BengaliNormalizationFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAtt.isKeyword()) + termAtt.setLength(normalizer.normalize(termAtt.buffer(), + termAtt.length())); + return true; + } + return false; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java new file mode 100644 index 00000000000..43618d6dbb3 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link BengaliNormalizationFilter}. + *
+ * <fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.BengaliNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + public BengaliNormalizationFilterFactory(Map args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new BengaliNormalizationFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java new file mode 100644 index 00000000000..b416d1a365c --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import static org.apache.lucene.analysis.util.StemmerUtil.delete; + +/** + * Normalizer for Bengali. + *
+ * Implements the Bengali-language specific algorithm specified in: + * A Double Metaphone encoding for Bangla and its application in spelling checker + * Naushad UzZaman and Mumit Khan. + * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf + *
+ */ +public class BengaliNormalizer { + /** + * Normalize an input buffer of Bengali text + * + * @param s input buffer + * @param len length of input buffer + * @return length of input buffer after normalization + */ + public int normalize(char s[], int len) { + + for (int i = 0; i < len; i++) { + switch (s[i]) { + // delete Chandrabindu + case '\u0981': + len = delete(s, i, len); + i--; + break; + + // DirghoI kar -> RosshoI kar + case '\u09C0': + s[i] = '\u09BF'; + break; + + // DirghoU kar -> RosshoU kar + case '\u09C2': + s[i] = '\u09C1'; + break; + + // Khio (Ka + Hoshonto + Murdorno Sh) + case '\u0995': + if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') { + if (i == 0) { + s[i] = '\u0996'; + len = delete(s, i + 2, len); + len = delete(s, i + 1, len); + } else { + s[i+1] = '\u0996'; + len = delete(s, i + 2, len); + } + } + break; + + // Nga to Anusvara + case '\u0999': + s[i] = '\u0982'; + break; + + // Ja Phala + case '\u09AF': + if(i - 2 == 0 && s[i-1] == '\u09CD') { + s[i - 1] = '\u09C7'; + + if(s[i+1] == '\u09BE') { + len = delete(s, i+1, len); + } + len = delete(s, i, len); + i --; + } else { + len = delete(s, i, len); + len = delete(s, i-1, len); + i -=2; + } + break; + + // Ba Phalaa + case '\u09AC': + if((i >= 1 && s[i-1] != '\u09CD') || i == 0) + break; + if(i - 2 == 0) { + len = delete(s, i, len); + len = delete(s, i - 1, len); + i -= 2; + } else if(i - 5 >= 0 && s[i - 3] == '\u09CD') { + len = delete(s, i, len); + len = delete(s, i-1, len); + i -=2; + } else { + s[i - 1] = s[i - 2]; + len = delete(s, i, len); + i --; + } + break; + + // Visarga + case '\u0983': + if(i == len -1) { + if(len <= 3) { + s[i] = '\u09B9'; + } else { + len = delete(s, i, len); + } + } else { + s[i] = s[i+1]; + } + break; + + //All sh + case '\u09B6': + case '\u09B7': + s[i] = '\u09B8'; + break; + + //check na + case '\u09A3': + s[i] = '\u09A8'; + break; + + //check ra + case '\u09DC': + case '\u09DD': + s[i] = '\u09B0'; + break; + + case '\u09CE': + s[i] = '\u09A4'; + break; + + default: + break; + } + } + + return len; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java new file mode 100644 index 00000000000..97870272136 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +/** + * A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words. + */ +public final class BengaliStemFilter extends TokenFilter { + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); + private final BengaliStemmer bengaliStemmer = new BengaliStemmer(); + + public BengaliStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttribute.isKeyword()) + termAttribute.setLength(bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length())); + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java new file mode 100644 index 00000000000..b082d9e5b77 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link BengaliStemFilter}. + *
+ * <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.BengaliStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class BengaliStemFilterFactory extends TokenFilterFactory { + + public BengaliStemFilterFactory(Map args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new BengaliStemFilter(input); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java new file mode 100644 index 00000000000..8bc555a440d --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import static org.apache.lucene.analysis.util.StemmerUtil.endsWith; + +/** + * Stemmer for Bengali. + *
+ * The algorithm is based on the report in: + * Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis + * P Sengupta and B B Chaudhuri + *
+ * + *
+ * Few Stemmer criteria are taken from: + * http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt + *
+ */ +public class BengaliStemmer { + public int stem(char buffer[], int len) { + + // 8 + if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম") + || endsWith(buffer, len, "িতেছিলাম") + || endsWith(buffer, len, "িতেছিলেন") + || endsWith(buffer, len, "ইতেছিলেন") + || endsWith(buffer, len, "িয়াছিলেন") + || endsWith(buffer, len, "ইয়াছিলেন") + )) + return len - 8; + + // 7 + if ((len > 8) && (endsWith(buffer, len, "িতেছিলি") + || endsWith(buffer, len, "িতেছিলে") + || endsWith(buffer, len, "িয়াছিলা") + || endsWith(buffer, len, "িয়াছিলে") + || endsWith(buffer, len, "িতেছিলা") + || endsWith(buffer, len, "িয়াছিলি") + + || endsWith(buffer, len, "য়েদেরকে") + )) + return len - 7; + + // 6 + if ((len > 7) && (endsWith(buffer, len, "িতেছিস") + || endsWith(buffer, len, "িতেছেন") + || endsWith(buffer, len, "িয়াছিস") + || endsWith(buffer, len, "িয়াছেন") + || endsWith(buffer, len, "েছিলাম") + || endsWith(buffer, len, "েছিলেন") + + || endsWith(buffer, len, "েদেরকে") + )) + return len - 6; + + // 5 + if ((len > 6) && (endsWith(buffer, len, "িতেছি") + || endsWith(buffer, len, "িতেছা") + || endsWith(buffer, len, "িতেছে") + || endsWith(buffer, len, "ছিলাম") + || endsWith(buffer, len, "ছিলেন") + || endsWith(buffer, len, "িয়াছি") + || endsWith(buffer, len, "িয়াছা") + || endsWith(buffer, len, "িয়াছে") + || endsWith(buffer, len, "েছিলে") + || endsWith(buffer, len, "েছিলা") + + || endsWith(buffer, len, "য়েদের") + || endsWith(buffer, len, "দেরকে") + )) + return len - 5; + + // 4 + if ((len > 5) && (endsWith(buffer, len, "িলাম") + || endsWith(buffer, len, "িলেন") + || endsWith(buffer, len, "িতাম") + || endsWith(buffer, len, "িতেন") + || endsWith(buffer, len, "িবেন") + || endsWith(buffer, len, "ছিলি") + || endsWith(buffer, len, "ছিলে") + || endsWith(buffer, len, "ছিলা") + || endsWith(buffer, len, "তেছে") + || endsWith(buffer, len, "িতেছ") + + || endsWith(buffer, len, "খানা") + || endsWith(buffer, len, "খানি") + || endsWith(buffer, len, "গুলো") + || endsWith(buffer, len, "গুলি") + || endsWith(buffer, len, "য়েরা") + || endsWith(buffer, len, "েদের") + )) + return len - 4; + + // 3 + if ((len > 4) && (endsWith(buffer, len, "লাম") + || endsWith(buffer, len, "িলি") + || endsWith(buffer, len, "ইলি") + || endsWith(buffer, len, "িলে") + || endsWith(buffer, len, "ইলে") + || endsWith(buffer, len, "লেন") + || endsWith(buffer, len, "িলা") + || endsWith(buffer, len, "ইলা") + || endsWith(buffer, len, "তাম") + || endsWith(buffer, len, "িতি") + || endsWith(buffer, len, "ইতি") + || endsWith(buffer, len, "িতে") + || endsWith(buffer, len, "ইতে") + || endsWith(buffer, len, "তেন") + || endsWith(buffer, len, "িতা") + || endsWith(buffer, len, "িবা") + || endsWith(buffer, len, "ইবা") + || endsWith(buffer, len, "িবি") + || endsWith(buffer, len, "ইবি") + || endsWith(buffer, len, "বেন") + || endsWith(buffer, len, "িবে") + || endsWith(buffer, len, "ইবে") + || endsWith(buffer, len, "ছেন") + + || endsWith(buffer, len, "য়োন") + || endsWith(buffer, len, "য়ের") + || endsWith(buffer, len, "েরা") + || endsWith(buffer, len, "দের") + )) + return len - 3; + + // 2 + if ((len > 3) && (endsWith(buffer, len, "িস") + || endsWith(buffer, len, "েন") + || endsWith(buffer, len, "লি") + || endsWith(buffer, len, "লে") + || endsWith(buffer, len, "লা") + || endsWith(buffer, len, "তি") + || endsWith(buffer, len, "তে") + || endsWith(buffer, len, "তা") + || endsWith(buffer, len, "বি") + || endsWith(buffer, len, "বে") + || endsWith(buffer, len, "বা") + || endsWith(buffer, len, "ছি") + || endsWith(buffer, len, "ছা") + || endsWith(buffer, len, "ছে") + || 
endsWith(buffer, len, "ুন") + || endsWith(buffer, len, "ুক") + + || endsWith(buffer, len, "টা") + || endsWith(buffer, len, "টি") + || endsWith(buffer, len, "নি") + || endsWith(buffer, len, "ের") + || endsWith(buffer, len, "তে") + || endsWith(buffer, len, "রা") + || endsWith(buffer, len, "কে") + )) + return len - 2; + + // 1 + if ((len > 2) && (endsWith(buffer, len, "ি") + || endsWith(buffer, len, "ী") + || endsWith(buffer, len, "া") + || endsWith(buffer, len, "ো") + || endsWith(buffer, len, "ে") + || endsWith(buffer, len, "ব") + || endsWith(buffer, len, "ত") + )) + return len - 1; + + return len; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java new file mode 100644 index 00000000000..eea39a9fdfb --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Analyzer for Bengali Language. + */ +package org.apache.lucene.analysis.bn; diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index bc19c4ac320..d871ad649d1 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory org.apache.lucene.analysis.ar.ArabicStemFilterFactory org.apache.lucene.analysis.bg.BulgarianStemFilterFactory +org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory +org.apache.lucene.analysis.bn.BengaliStemFilterFactory org.apache.lucene.analysis.br.BrazilianStemFilterFactory org.apache.lucene.analysis.cjk.CJKBigramFilterFactory org.apache.lucene.analysis.cjk.CJKWidthFilterFactory diff --git a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt new file mode 100644 index 00000000000..84d1d2ad732 --- /dev/null +++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt @@ -0,0 +1,121 @@ +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# This file was created by Jacques Savoy and is distributed under the BSD license +এই +ও +থেকে +করে +এ +না +ওই +এক্ +নিয়ে +করা +বলেন +সঙ্গে +যে +এব +তা +আর +কোনো +বলে +সেই +দিন +হয় +কি +দু +পরে +সব +দেওয়া +মধ্যে +এর +সি +শুরু +কাজ +কিছু +কাছে +সে +তবে +বা +বন +আগে +জ্নজন +পি +পর +তো +ছিল +এখন +আমরা +প্রায় +দুই +আমাদের +তাই +অন্য +গিয়ে +প্রযন্ত +মনে +নতুন +মতো +কেখা +প্রথম +আজ +টি +ধামার +অনেক +বিভিন্ন +র +হাজার +জানা +নয় +অবশ্য +বেশি +এস +করে +কে +হতে +বি +কয়েক +সহ +বেশ +এমন +এমনি +কেন +কেউ +নেওয়া +চেষ্টা +লক্ষ +বলা +কারণ +আছে +শুধু +তখন +যা +এসে +চার +ছিল +যদি +আবার +কোটি +উত্তর +সামনে +উপর +বক্তব্য +এত +প্রাথমিক +উপরে +আছে +প্রতি +কাজে +যখন +খুব +বহু +গেল +পেয়্র্ +চালু +ই +নাগাদ +থাকা +পাচ +যাওয়া +রকম +সাধারণ +কমনে \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java new file mode 100644 index 00000000000..898480a73cc --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharArraySet; + + +/** + * Tests the BengaliAnalyzer + */ +public class TestBengaliAnalyzer extends BaseTokenStreamTestCase { + + public void testResourcesAvailable() { + new BengaliAnalyzer().close(); + } + + public void testBasics() throws Exception { + Analyzer a = new BengaliAnalyzer(); + + checkOneTerm(a, "বাড়ী", "বার"); + checkOneTerm(a, "বারী", "বার"); + a.close(); + } + /** + * test Digits + */ + public void testDigits() throws Exception { + BengaliAnalyzer a = new BengaliAnalyzer(); + checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890"); + a.close(); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + Analyzer analyzer = new BengaliAnalyzer(); + checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); + analyzer.close(); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java new file mode 100644 index 00000000000..3ed1a07e14f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; + +/** + * Test Bengali Filter Factory + */ +public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase { + /** + * Test IndicNormalizationFilterFactory + */ + public void testIndicNormalizer() throws Exception { + Reader reader = new StringReader("ত্‍ আমি"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("IndicNormalization").create(stream); + assertTokenStreamContents(stream, new String[] { "ৎ", "আমি" }); + } + + /** + * Test BengaliNormalizationFilterFactory + */ + public void testBengaliNormalizer() throws Exception { + Reader reader = new StringReader("বাড়ী"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("IndicNormalization").create(stream); + stream = tokenFilterFactory("BengaliNormalization").create(stream); + assertTokenStreamContents(stream, new String[] {"বারি"}); + } + + /** + * Test BengaliStemFilterFactory + */ + public void testStemmer() throws Exception { + Reader reader = new StringReader("বাড়ী"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("IndicNormalization").create(stream); + stream = tokenFilterFactory("BengaliNormalization").create(stream); + stream = tokenFilterFactory("BengaliStem").create(stream); + assertTokenStreamContents(stream, new String[] {"বার"}); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue"); + }); + assertTrue(expected.getMessage().contains("Unknown parameters")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue"); + }); + assertTrue(expected.getMessage().contains("Unknown parameters")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue"); + }); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java new file mode 100644 index 00000000000..ecd11ae4ba2 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +import java.io.IOException; + +/** + * Test BengaliNormalizer + */ +public class TestBengaliNormalizer extends BaseTokenStreamTestCase { + /** + * Test some basic normalization, with an example from the paper. + */ + public void testChndrobindu() throws IOException { + check("চাঁদ", "চাদ"); + } + + public void testRosshoIKar() throws IOException { + check("বাড়ী", "বারি"); + check("তীর", "তির"); + } + + public void testRosshoUKar() throws IOException { + check("ভূল", "ভুল"); + check("অনূপ", "অনুপ"); + } + + public void testNga() throws IOException { + check("বাঙলা", "বাংলা"); + } + + public void testJaPhaala() throws IOException { + check("ব্যাক্তি", "বেক্তি"); + check( "সন্ধ্যা", "সন্ধা"); + } + + public void testBaPhalaa() throws IOException { + check("স্বদেশ", "সদেস"); + check("তত্ত্ব", "তত্ত"); + check("বিশ্ব", "বিসস"); + } + + public void testVisarga() throws IOException { + check("দুঃখ", "দুখখ"); + check("উঃ", "উহ"); + check("পুনঃ", "পুন"); + } + + public void testBasics() throws IOException { + check("কণা", "কনা"); + check("শরীর", "সরির"); + check("বাড়ি", "বারি"); + } + + private void check(String input, String output) throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(input); + TokenFilter tf = new BengaliNormalizationFilter(tokenizer); + assertTokenStreamContents(tf, new String[] { output }); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + a.close(); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java new file mode 100644 index 00000000000..4f7617236f3 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +import java.io.IOException; + +/** + * Test Codes for BengaliStemmer + */ +public class TestBengaliStemmer extends BaseTokenStreamTestCase { + + /** + * Testing few verbal words + */ + public void testVerbsInShadhuForm() throws IOException { + check("করেছিলাম", "কর"); + check("করিতেছিলে", "কর"); + check("খাইতাম", "খাই"); + check("যাইবে", "যা"); + } + + public void testVerbsInCholitoForm() throws IOException { + check("করছিলাম", "কর"); + check("করছিলে", "কর"); + check("করতাম", "কর"); + check("যাব", "যা"); + check("যাবে", "যা"); + check("করি", "কর"); + check("করো", "কর"); + } + + public void testNouns() throws IOException { + check("মেয়েরা", "মে"); + check("মেয়েদেরকে", "মে"); + check("মেয়েদের", "মে"); + + check("একটি", "এক"); + check("মানুষগুলি", "মানুষ"); + } + + private void check(String input, String output) throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(input); + TokenFilter tf = new BengaliStemFilter(tokenizer); + assertTokenStreamContents(tf, new String[] { output }); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + a.close(); + } +} From 3197d5af71dbbbc9a3186bf17ee5ae6e15fec600 Mon Sep 17 00:00:00 2001 From: "Md. 
Abdulla-Al-Sun" Date: Thu, 31 Aug 2017 12:11:53 +0600 Subject: [PATCH 02/44] LUCENE-7940: Updated for boundary value of Ja Phaala --- .../java/org/apache/lucene/analysis/bn/BengaliNormalizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java index b416d1a365c..057fbb5ee85 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java @@ -80,12 +80,12 @@ public class BengaliNormalizer { if(i - 2 == 0 && s[i-1] == '\u09CD') { s[i - 1] = '\u09C7'; - if(s[i+1] == '\u09BE') { + if(i + 1 < len && s[i+1] == '\u09BE') { len = delete(s, i+1, len); } len = delete(s, i, len); i --; - } else { + } else if(i - 1 >= 0 && s[i-1] == '\u09CD' ){ len = delete(s, i, len); len = delete(s, i-1, len); i -=2; From b71e8032104c2d334e2dedd3ad4139a1e030c56c Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Thu, 31 Aug 2017 10:29:36 -0500 Subject: [PATCH 03/44] SOLR-11253: Solr 7 upgrade info --- solr/solr-ref-guide/src/index.adoc | 2 +- .../src/major-changes-in-solr-7.adoc | 187 ++++++++++++++++++ .../src/solr-upgrade-notes.adoc | 56 +----- 3 files changed, 193 insertions(+), 52 deletions(-) create mode 100644 solr/solr-ref-guide/src/major-changes-in-solr-7.adoc diff --git a/solr/solr-ref-guide/src/index.adoc b/solr/solr-ref-guide/src/index.adoc index 13cad02844a..a80b1229cbf 100644 --- a/solr/solr-ref-guide/src/index.adoc +++ b/solr/solr-ref-guide/src/index.adoc @@ -1,7 +1,7 @@ = Apache Solr Reference Guide :page-shortname: index :page-permalink: index.html -:page-children: about-this-guide, solr-tutorial, getting-started, solr-control-script-reference, using-the-solr-administration-user-interface, documents-fields-and-schema-design, understanding-analyzers-tokenizers-and-filters, indexing-and-basic-data-operations, searching, the-well-configured-solr-instance, managing-solr, solrcloud, legacy-scaling-and-distribution, client-apis, major-changes-from-solr-5-to-solr-6, further-assistance, solr-glossary, errata, how-to-contribute +:page-children: about-this-guide, solr-tutorial, getting-started, solr-control-script-reference, using-the-solr-administration-user-interface, documents-fields-and-schema-design, understanding-analyzers-tokenizers-and-filters, indexing-and-basic-data-operations, searching, the-well-configured-solr-instance, managing-solr, solrcloud, legacy-scaling-and-distribution, client-apis, further-assistance, solr-glossary, errata, how-to-contribute // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information diff --git a/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc b/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc new file mode 100644 index 00000000000..6261ef98420 --- /dev/null +++ b/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc @@ -0,0 +1,187 @@ += Major Changes in Solr 7 +:page-shortname: major-changes-in-solr-7 +:page-permalink: major-changes-in-solr-7.html +:page-tocclass: right +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +Solr 7 is a major new release of Solr which introduces new features and a number of other changes that may impact your existing installation. + +== Upgrade Planning +There are major changes in Solr 7 to consider before starting to migrate your configurations and indexes. This page is designed to highlight the biggest changes - new features you may want to be aware of, but also changes in default behavior, and deprecated features that have been removed. + +There are many hundreds of changes in Solr 7, however, so a thorough review of the <> as well as the {solr-javadocs}/changes/Changes.html[CHANGES.txt] file in your Solr instance will help you plan your migration to Solr 7. This section attempts to highlight some of the major changes you should be aware of. + +You should also consider all changes that have been made to Solr in any version you have not upgraded to already. For example, if you are currently using Solr 6.2, you should review changes made in all subsequent 6.x releases in addition to changes for 7.0. + +Re-indexing your data is considered the best practice and you should try to do so if possible. However, if re-indexing is not feasible, keep in mind you can only upgrade one major version at a time. Thus, Solr 6.x indexes will be compatible with Solr 7 but Solr 5.x indexes will not be. + +If you do not re-index now, keep in mind that you will need to either re-index your data or upgrade your indexes before you will be able to move to Solr 8 when it is released in the future. See the section <> for more details on how to upgrade your indexes. + +See also the section <> for details on how to upgrade a SolrCloud cluster. + +== New Features & Enhancements + +=== Replication Modes +Until Solr 7, the SolrCloud model for replicas has been to allow any replica to become a leader when a leader is lost. This is highly effective for most users, providing reliable failover in case of issues in the cluster. However, it comes at a cost in large clusters because all replicas must be in sync at all times. + +To provide additional flexibility, two new types of replicas have been added, named TLOG & PULL. These new types provide options to have replicas which only sync with the leader by copying index segments from the leader. The TLOG type has an additional benefit of maintaining a transaction log (the "tlog" of its name), which would allow it to recover and become a leader if necessary; the PULL type does not maintain a transaction log, so cannot become a leader. + +As part of this change, the traditional type of replica is now named NRT. If you do not explicitly define a number of TLOG or PULL replicas, Solr defaults to creating NRT replicas. If this model is working for you, you will not have to change anything. + +See the section <> for more details on the new replica modes, and how define the replica type in your cluster. 
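+
+As a quick sketch (the collection name, shard count, and replica counts below are placeholders), a collection with one NRT, one TLOG, and one PULL replica per shard could be created with a Collections API call such as:
+
+[source,bash]
+curl "http://host:8983/solr/admin/collections?action=CREATE&name=mycollection&numShards=1&nrtReplicas=1&tlogReplicas=1&pullReplicas=1"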
+ +=== Autoscaling +Solr autoscaling is a new suite of features in Solr to make managing a SolrCloud cluster easier and more automated. + +At its core, Solr autoscaling provides users with a rule syntax to define preferences and policies for how to distribute nodes and shards in a cluster, with the goal of maintaining a balance in the cluster. As of Solr 7, Solr will take any policy or preference rules into account when determining where to place new shards and replicas created or moved with various Collections API commands. + +See the section <> for details on the options available in 7.0. Expect more features to be released in subsequent 7.x releases in this area. + +=== Other Features & Enhancements + +* The Analytics Component has been refactored. + +* There were several other new features released in earlier 6.x releases, which you may have missed: +** <> +** <> +** <>. See also information about related deprecations in the section <> below. +** <> +** <> +** <> +** <> + +== Configuration and Default Changes + +=== New Default ConfigSet +Several changes have been made to configSets that ship with Solr; not only their content but how Solr behaves in regard to them: + +* The `data_driven_configset` and `basic_configset` have been removed, and replaced by the `_default` configset. The `sample_techproducts_configset` also remains, and is designed for use with the example documents shipped with Solr in the `example/exampledocs` directory. +* When creating a new collection, if you do not specify a configSet, the `_default` will be used. +** If you use SolrCloud, the `_default` configSet will be automatically uploaded to ZooKeeper. +** If you use standalone mode, the instanceDir will be created automatically, using the `_default` configSet as it's basis. + +=== Schemaless Improvements + +To improve the functionality of Schemaless Mode, Solr now behaves differently when it detects that data in an incoming field should have a text-based field type. + +* Incoming fields will be indexed as `text_general` by default (you can change this). The name of the field will be the same as the field name defined in the document. +* A copy field rule will be inserted into your schema to copy the new `text_general` field to a new field with the name `_str`. This field's type will be a `strings` field (to allow for multiple values). The first 256 characters of the text field will be inserted to the new `strings` field. + +This behavior can be customized if you wish to remove the copy field rule, or to change the number of characters inserted to the string field, or the field type used. See the section <> for details. + +TIP: Because copy field rules can slow indexing and increase index size, it's recommended you only use copy fields when you need to. If you do not need to sort or facet on a field, you should remove the automatically-generated copy field rule. + +Automatic field creation can be disabled with the `update.autoCreateFields` property. To do this, you can use the Config API with a command such as: + +[source,bash] +curl http://host:8983/solr/mycollection/config -d '{"set-user-property": {"update.autoCreateFields":"false"}}' + +=== Changes to Default Behaviors +* JSON is now the default response format. If you rely on XML responses, you must now define `wt=xml` in your request. In addition, line indentation is enabled by default (`indent=on`). +* The `sow` parameter (short for "Split on Whitespace") now defaults to `false`, which allows support for multi-word synonyms out of the box. 
This parameter is used with the eDismax and standard/"lucene" query parsers. If this parameter is not explicitly specified as `true`, query text will not be split on whitespace before analysis. +* The `legacyCloud` parameter now defaults to `false`. If an entry for a replica does not exist in `state.json`, that replica will not get registered. ++ +This may affect users who bring up replicas and they are automatically registered as a part of a shard. It is possible to fall back to the old behavior by setting the property `legacyCloud=true`, in the cluster properties using the following command: ++ +`./server/scripts/cloud-scripts/zkcli.sh -zkhost 127.0.0.1:2181 -cmd clusterprop -name legacyCloud -val true` +* The eDismax query parser parameter `lowercaseOperators` now defaults to `false` if the `luceneMatchVersion` in `solrconfig.xml` is 7.0.0 or above. Behavior for `luceneMatchVersion` lower than 7.0.0 is unchanged (so, `true`). This means that clients must sent boolean operators (such as AND, OR and NOT) in upper case in order to be recognized, or you must explicitly set this parameter to `true`. +* The `handleSelect` parameter in `solrconfig.xml` now defaults to `false` if the `luceneMatchVersion` is 7.0.0 or above. This causes Solr to ignore the `qt` parameter if it is present in a request. If you have request handlers without a leading '/', you can set `handleSelect="true"` or consider migrating your configuration. ++ +The `qt` parameter is still used as a SolrJ special parameter that specifies the request handler (tail URL path) to use. +* The lucenePlusSort query parser (aka the "Old Lucene Query Parser") has been deprecated and is no longer implicitly defined. If you wish to continue using this parser until Solr 8 (when it will be removed), you must register it in your `solrconfig.xml`, as in: ``. +* The name of `TemplateUpdateRequestProcessorFactory` is changed to `template` from `Template` and the name of `AtomicUpdateProcessorFactory` is changed to `atomic` from `Atomic` +** Also, `TemplateUpdateRequestProcessorFactory` now uses `{}` instead of `${}` for `template`. + + +== Deprecations and Removed Features + +=== Point Fields Are Default Numeric Types +Solr has implemented \*PointField types across the board, to replace Trie* based numeric fields. All Trie* fields are now considered deprecated, and will be removed in Solr 8. + +If you are using Trie* fields in your schema, you should consider moving to PointFields as soon as feasible. Changing to the new PointField types will require you to re-index your data. + +=== Spatial Fields + +The following spatial-related fields have been deprecated: + +* `LatLonType` +* `GeoHashField` +* `SpatialVectorFieldType` +* `SpatialTermQueryPrefixTreeFieldType` + +Choose one of these field types instead: + +* `LatLonSpatialField` +* `SpatialRecursivePrefixTreeField` +* `RptWithGeometrySpatialField` + +See the section <> for more information. + +=== JMX Support and MBeans +* The `` element in `solrconfig.xml` has been removed in favor of `` elements defined in `solr.xml`. ++ +Limited back-compatibility is offered by automatically adding a default instance of `SolrJmxReporter` if it's missing, AND when a local MBean server is found (which can be activated either via `ENABLE_REMOTE_JMX_OPTS` in `solr.in.sh` or via system properties, e.g., `-Dcom.sun.management.jmxremote`). This default instance exports all Solr metrics from all registries as hierarchical MBeans. 
++ +This behavior can be also disabled by specifying a `SolrJmxReporter` configuration with a boolean init argument `enabled` set to `false`. For a more fine-grained control users should explicitly specify at least one `SolrJmxReporter` configuration. ++ +See also the section < Element>>, which describes how to set up Metrics Reporters in `solr.xml`. + +* MBean names and attributes now follow the hierarchical names used in metrics. This is reflected also in `/admin/mbeans` and `/admin/plugins` output, and can be observed in the UI Plugins tab, because now all these APIs get their data from the metrics API. The old (mostly flat) JMX view has been removed. + +=== SolrJ +The following changes were made in SolrJ. + +* `HttpClientInterceptorPlugin` is now `HttpClientBuilderPlugin` and must work with a `SolrHttpClientBuilder` rather than an `HttpClientConfigurer`. +* `HttpClientUtil` now allows configuring `HttpClient` instances via `SolrHttpClientBuilder` rather than an `HttpClientConfigurer`. Use of env variable `SOLR_AUTHENTICATION_CLIENT_CONFIGURER` no longer works, please use `SOLR_AUTHENTICATION_CLIENT_BUILDER` +* `SolrClient` implementations now use their own internal configuration for socket timeouts, connect timeouts, and allowing redirects rather than what is set as the default when building the `HttpClient` instance. Use the appropriate setters on the `SolrClient` instance. +* `HttpSolrClient#setAllowCompression` has been removed and compression must be enabled as a constructor param. +* `HttpSolrClient#setDefaultMaxConnectionsPerHost` and `HttpSolrClient#setMaxTotalConnections` have been removed. These now default very high and can only be changed via param when creating an HttpClient instance. + +=== Other Deprecations and Removals +* The `defaultOperator` parameter in the schema is no longer supported. Use the `q.op` parameter instead. This option had been deprecated for several releases. See the section <> for more information. +* The `defaultSearchField` parameter in the schema is no longer supported. Use the `df` parameter instead. This option had been deprecated for several releases. See the section <> for more information. +* The `mergePolicy`, `mergeFactor` and `maxMergeDocs` parameters have been removed and are no longer supported. You should define a `mergePolicyFactory` instead. See the section <> for more information. +* The PostingsSolrHighlighter has been deprecated. It's recommended that you move to using the UnifiedHighlighter instead. See the section <> for more information about this highlighter. +* Index-time boosts have been removed from Lucene, and are no longer available from Solr. If any boosts are provided, they will be ignored by the indexing chain. As a replacement, index-time scoring factors should be indexed in a separate field and combined with the query score using a function query. See the section <> for more information. +* The `StandardRequestHandler` is deprecated. Use `SearchHandler` instead. +* To improve parameter consistency in the Collections API, the parameter names `fromNode` for the MOVEREPLICA command and `source`, `target` for the REPLACENODE command have been deprecated and replaced with `sourceNode` and `targetNode` instead. The old names will continue to work for back-compatibility but they will be removed in Solr 8. +* The unused `valType` option has been removed from ExternalFileField, if you have this in your schema you can safely remove it. 
+ +== Major Changes in Earlier 6.x Versions +The following summary of changes in earlier 6.x releases highlights significant changes released between Solr 6.0 and 6.6 that were listed in earlier versions of this Guide. Mentions of deprecations are likely superseded by removal in Solr 7, as noted in the above sections. + +* The Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed. +* JSON Facet API now uses hyper-log-log for numBuckets cardinality calculation and calculates cardinality before filtering buckets by any `mincount` greater than 1. +* If you use historical dates, specifically on or before the year 1582, you should re-index for better date handling. +* If you use the JSON Facet API (json.facet) with `method=stream`, you must now set `sort='index asc'` to get the streaming behavior; otherwise it won't stream. Reminder: `method` is a hint that doesn't change defaults of other parameters. +* If you use the JSON Facet API (json.facet) to facet on a numeric field and if you use `mincount=0` or if you set the prefix, you will now get an error as these options are incompatible with numeric faceting. +* Solr's logging verbosity at the INFO level has been greatly reduced, and you may need to update the log configs to use the DEBUG level to see all the logging messages you used to see at INFO level before. +* We are no longer backing up `solr.log` and `solr_gc.log` files in date-stamped copies forever. If you relied on the `solr_log_` or `solr_gc_log_` being in the logs folder that will no longer be the case. See the section <> for details on how log rotation works as of Solr 6.3. +* The create/deleteCollection methods on `MiniSolrCloudCluster` have been deprecated. Clients should instead use the `CollectionAdminRequest` API. In addition, `MiniSolrCloudCluster#uploadConfigDir(File, String)` has been deprecated in favour of `#uploadConfigSet(Path, String)`. +* The `bin/solr.in.sh` (`bin/solr.in.cmd` on Windows) is now completely commented by default. Previously, this wasn't so, which had the effect of masking existing environment variables. +* The `\_version_` field is no longer indexed and is now defined with `indexed=false` by default, because the field has DocValues enabled. +* The `/export` handler has been changed so it no longer returns zero (0) for numeric fields that are not in the original document. One consequence of this change is that you must be aware that some tuples will not have values if there were none in the original document. +* Metrics-related classes in `org.apache.solr.util.stats` have been removed in favor of the http://metrics.dropwizard.io/3.1.0/[Dropwizard metrics library]. Any custom plugins using these classes should be changed to use the equivalent classes from the metrics library. As part of this, the following changes were made to the output of Overseer Status API: +** The "totalTime" metric has been removed because it is no longer supported. +** The metrics "75thPctlRequestTime", "95thPctlRequestTime", "99thPctlRequestTime" and "999thPctlRequestTime" in Overseer Status API have been renamed to "75thPcRequestTime", "95thPcRequestTime" and so on for consistency with stats output in other parts of Solr. +** The metrics "avgRequestsPerMinute", "5minRateRequestsPerMinute" and "15minRateRequestsPerMinute" have been replaced by corresponding per-second rates viz. "avgRequestsPerSecond", "5minRateRequestsPerSecond" and "15minRateRequestsPerSecond" for consistency with stats output in other parts of Solr. 
+* A new highlighter named UnifiedHighlighter has been added. You are encouraged to try out the UnifiedHighlighter by setting `hl.method=unified` and report feedback. It's more efficient/faster than the other highlighters, especially compared to the original Highlighter. See `HighlightParams.java` for a listing of highlight parameters annotated with which highlighters use them. `hl.useFastVectorHighlighter` is now considered deprecated in lieu of `hl.method=fastVector`. +* The <> now defaults to 1, and more importantly commits will now block if this limit is exceeded instead of throwing an exception (a good thing). Consequently there is no longer a risk in overlapping commits. Nonetheless users should continue to avoid excessive committing. Users are advised to remove any pre-existing `maxWarmingSearchers` entries from their `solrconfig.xml` files. +* The <> now supports leading wildcards. Beware of its possible heaviness, users are encouraged to use ReversedWildcardFilter in index time analysis. +* The JMX metric "avgTimePerRequest" (and the corresponding metric in the metrics API for each handler) used to be a simple non-decaying average based on total cumulative time and the number of requests. The Codahale Metrics implementation applies exponential decay to this value, which heavily biases the average towards the last 5 minutes. +* Parallel SQL now uses Apache Calcite as its SQL framework. As part of this change the default aggregation mode has been changed to `facet` rather than `map_reduce`. There have also been changes to the SQL aggregate response and some SQL syntax changes. Consult the <> documentation for full details. diff --git a/solr/solr-ref-guide/src/solr-upgrade-notes.adoc b/solr/solr-ref-guide/src/solr-upgrade-notes.adoc index 86ea1b9fee3..39cec2fa143 100644 --- a/solr/solr-ref-guide/src/solr-upgrade-notes.adoc +++ b/solr/solr-ref-guide/src/solr-upgrade-notes.adoc @@ -1,6 +1,7 @@ = Solr Upgrade Notes :page-shortname: solr-upgrade-notes :page-permalink: solr-upgrade-notes.html +:page-children: major-changes-in-solr-7, major-changes-from-solr-5-to-solr-6 // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -22,60 +23,13 @@ The following notes describe changes to Solr in recent releases that you should These notes are meant to highlight the biggest changes that may impact the largest number of implementations. It is not a comprehensive list of all changes to Solr in any release. -When planning your Solr upgrade, consider the customizations you have made to your system and review the {solr-javadocs}/changes/Changes.html[`CHANGES.txt`] file found in your Solr package. That file includes all of the changes and updates that may effect your existing implementation. Detailed steps for upgrading a Solr cluster can be found in the appendix: <>. +When planning your Solr upgrade, consider the customizations you have made to your system and review the {solr-javadocs}/changes/Changes.html[`CHANGES.txt`] file found in your Solr package. That file includes all of the changes and updates that may effect your existing implementation. -== Upgrading from 6.5.x +Detailed steps for upgrading a Solr cluster can be found in the section <>. -If you are already using Solr 6.5, Solr 6.6 should not present any major problems. +== Upgrading from Any 6.x Release -* Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed. 
- -* JSON Facet API now uses hyper-log-log for numBuckets cardinality calculation and calculates cardinality before filtering buckets by any mincount greater than 1. - -* ZooKeeper dependency has been upgraded from 3.4.6 to 3.4.10. - -== Upgrading from earlier 6.x versions - -* If you use historical dates, specifically on or before the year 1582, you should re-index after upgrading to this version. -* If you use the JSON Facet API (json.facet) with `method=stream`, you must now set `sort='index asc'` to get the streaming behavior; otherwise it won't stream. Reminder: "method" is a hint that doesn't change defaults of other parameters. -* If you use the JSON Facet API (json.facet) to facet on a numeric field and if you use `mincount=0` or if you set the prefix, then you will now get an error as these options are incompatible with numeric faceting. -* Solr's logging verbosity at the INFO level has been greatly reduced, and you may need to update the log configs to use the DEBUG level to see all the logging messages you used to see at INFO level before. -* We are no longer backing up `solr.log` and `solr_gc.log` files in date-stamped copies forever. If you relied on the `solr_log_` or `solr_gc_log_` being in the logs folder that will no longer be the case. See the section <> for details on how log rotation works as of Solr 6.3. -* The create/deleteCollection methods on MiniSolrCloudCluster have been deprecated. Clients should instead use the CollectionAdminRequest API. In addition, `MiniSolrCloudCluster#uploadConfigDir(File, String)` has been deprecated in favour of `#uploadConfigSet(Path, String)`. -* The http://solr.in[`bin/solr.in.sh`] (http://solr.in[`bin/solr.in.cmd`] on Windows) is now completely commented by default. Previously, this wasn't so, which had the effect of masking existing environment variables. -* The `\_version_` field is no longer indexed and is now defined with `indexed=false` by default, because the field has DocValues enabled. -* The `/export` handler has been changed so it no longer returns zero (0) for numeric fields that are not in the original document. One consequence of this change is that you must be aware that some tuples will not have values if there were none in the original document. -* Metrics-related classes in `org.apache.solr.util.stats` have been removed in favor of the http://metrics.dropwizard.io/3.1.0/[Dropwizard metrics library]. Any custom plugins using these classes should be changed to use the equivalent classes from the metrics library. As part of this, the following changes were made to the output of Overseer Status API: -** The "totalTime" metric has been removed because it is no longer supported. -** The metrics "75thPctlRequestTime", "95thPctlRequestTime", "99thPctlRequestTime" and "999thPctlRequestTime" in Overseer Status API have been renamed to "75thPcRequestTime", "95thPcRequestTime" and so on for consistency with stats output in other parts of Solr. -** The metrics "avgRequestsPerMinute", "5minRateRequestsPerMinute" and "15minRateRequestsPerMinute" have been replaced by corresponding per-second rates viz. "avgRequestsPerSecond", "5minRateRequestsPerSecond" and "15minRateRequestsPerSecond" for consistency with stats output in other parts of Solr. -* A new highlighter named UnifiedHighlighter has been added. You are encouraged to try out the UnifiedHighlighter by setting `hl.method=unified` and report feedback. It might become the default in 7.0. 
It's more efficient/faster than the other highlighters, especially compared to the original Highlighter. That said, some options aren't supported yet. It will get more features in time, especially with your input. See HighlightParams.java for a listing of highlight parameters annotated with which highlighters use them. `hl.useFastVectorHighlighter` is now considered deprecated in lieu of `hl.method=fastVector`. -* The <> now defaults to 1, and more importantly commits will now block if this limit is exceeded instead of throwing an exception (a good thing). Consequently there is no longer a risk in overlapping commits. Nonetheless users should continue to avoid excessive committing. Users are advised to remove any pre-existing maxWarmingSearchers entries from their solrconfig.xml files. -* The <> now supports leading wildcards. Beware of its possible heaviness, users are encouraged to use ReversedWildcardFilter in index time analysis. -* The JMX metric "avgTimePerRequest" (and the corresponding metric in the metrics API for each handler) used to be a simple non-decaying average based on total cumulative time and the number of requests. New Codahale Metrics implementation applies exponential decay to this value, which heavily biases the average towards the last 5 minutes. -* Index-time boosts are now deprecated. As a replacement, index-time scoring factors should be indexed in a separate field and combined with the query score using a function query. These boosts will be removed in Solr 7.0. -* Parallel SQL now uses Apache Calcite as its SQL framework. As part of this change the default aggregation mode has been changed to facet rather than map_reduce. There have also been changes to the SQL aggregate response and some SQL syntax changes. Consult the <> documentation for full details. - -== Upgrading from 5.5.x - -* The deprecated `SolrServer` and subclasses have been removed, use <> instead. -* The deprecated `` configuration in <> has been removed. Please remove it from `solrconfig.xml`. -* `SolrClient.shutdown()` has been removed, use {solr-javadocs}/solr-solrj/org/apache/solr/client/solrj/SolrClient.html[`SolrClient.close()`] instead. -* The deprecated `zkCredientialsProvider` element in `solrcloud` section of `solr.xml` is now removed. Use the correct spelling (<>) instead. -* Internal/expert - `ResultContext` was significantly changed and expanded to allow for multiple full query results (`DocLists`) per Solr request. `TransformContext` was rendered redundant and was removed. See https://issues.apache.org/jira/browse/SOLR-7957[SOLR-7957] for details. -* Several changes have been made regarding the "<>" used in Solr, in order to provide better default behavior for new users. There are 3 key impacts of these changes on existing users who upgrade: -** `DefaultSimilarityFactory` has been removed. If you currently have `DefaultSimilarityFactory` explicitly referenced in your `schema.xml`, edit your config to use the functionally identical `ClassicSimilarityFactory`. See https://issues.apache.org/jira/browse/SOLR-8239[SOLR-8239] for more details. -** The implicit default Similarity used when no `` is configured in `schema.xml` has been changed to `SchemaSimilarityFactory`. Users who wish to preserve back-compatible behavior should either explicitly configure `ClassicSimilarityFactory`, or ensure that the `luceneMatchVersion` for the collection is less then 6.0. See https://issues.apache.org/jira/browse/SOLR-8270[SOLR-8270] + http://SOLR-8271[SOLR-8271] for details. 
-** `SchemaSimilarityFactory` has been modified to use `BM25Similarity` as the default for `fieldTypes` that do not explicitly declare a Similarity. The legacy behavior of using `ClassicSimilarity` as the default will occur if the `luceneMatchVersion` for the collection is less then 6.0, or the `'defaultSimFromFieldType'` configuration option may be used to specify any default of your choosing. See https://issues.apache.org/jira/browse/SOLR-8261[SOLR-8261] + https://issues.apache.org/jira/browse/SOLR-8329[SOLR-8329] for more details. -* If your `solrconfig.xml` file doesn't explicitly mention the `schemaFactory` to use then Solr will choose the `ManagedIndexSchemaFactory` by default. Previously it would have chosen `ClassicIndexSchemaFactory`. This means that the Schema APIs (`//schema`) are enabled and the schema is mutable. When Solr starts your `schema.xml` file will be renamed to `managed-schema`. If you want to retain the old behaviour then please ensure that the `solrconfig.xml` explicitly uses the `ClassicIndexSchemaFactory` or your `luceneMatchVersion` in the `solrconfig.xml` is less than 6.0. See the <> section for more details -* `SolrIndexSearcher.QueryCommand` and `QueryResult` were moved to their own classes. If you reference them in your code, you should import them under o.a.s.search (or use your IDE's "Organize Imports"). -* The '<>' attribute specified in request handler cannot be overridden from request params. See https://issues.apache.org/jira/browse/SOLR-8698[SOLR-8698] for more details. -* When requesting stats in date fields, "sum" is now returned as a double value instead of a date. See https://issues.apache.org/jira/browse/SOLR-8671[SOLR-8671] for more details. -* The deprecated GET methods for schema are now accessible through the <>. These methods now accept fewer request parameters, and output less information. See https://issues.apache.org/jira/browse/SOLR-8736[SOLR-8736] for more details. Some of the removed functionality will likely be restored in a future version of Solr - see https://issues.apache.org/jira/browse/SOLR-8992[SOLR-8992]. -* In the past, Solr guaranteed that retrieval of multi-valued fields would preserve the order of values. Because values may now be retrieved from column-stored fields (`docValues="true"`), in conjunction with the fact that <> do not currently preserve order, means that users should set <> to prevent future optimizations from using the column-stored values over the row-stored values when fields have both `stored="true"` and `docValues="true"`. -* <> have some differences. If the year is more than 4 digits, there is a leading '+'. When there is a non-zero number of milliseconds, it is padded with zeros to 3 digits. Negative year (BC) dates are now possible. Parsing: It is now an error to supply a portion of the date out of its, range, like 67 seconds. -* <> no longer includes `DateUtil`. If for some reason you need to format or parse dates, simply use `Instant.format()` and `Instant.parse()`. -* If you are using spatial4j, please upgrade to 0.6 and <> to replace `com.spatial4j.core` with `org.locationtech.spatial4j` . +The upgrade from Solr 6.x to Solr 7 introduces several *major* changes that you should be aware of before upgrading. Please do a thorough review of the section <> before starting your upgrade. 
== Upgrading from Older Versions of Solr From 42bb0d046448c7729c329944d5238de62305f623 Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Thu, 31 Aug 2017 11:45:57 -0500 Subject: [PATCH 04/44] Ref Guide: fix build errors and warnings --- .../src/cross-data-center-replication-cdcr.adoc | 1 - solr/solr-ref-guide/src/how-to-contribute.adoc | 3 ++- .../src/solrcloud-autoscaling-policy-preferences.adoc | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/solr/solr-ref-guide/src/cross-data-center-replication-cdcr.adoc b/solr/solr-ref-guide/src/cross-data-center-replication-cdcr.adoc index 44a63de2cf5..962d82cb662 100644 --- a/solr/solr-ref-guide/src/cross-data-center-replication-cdcr.adoc +++ b/solr/solr-ref-guide/src/cross-data-center-replication-cdcr.adoc @@ -44,7 +44,6 @@ CDCR is configured to replicate from collections in the Source cluster to collec CDCR can be configured to replicate from one collection to a second collection _within the same cluster_. That is a specialized scenario not covered in this document. -[glossary] == CDCR Glossary Terms used in this document include: diff --git a/solr/solr-ref-guide/src/how-to-contribute.adoc b/solr/solr-ref-guide/src/how-to-contribute.adoc index a590b3a5b56..c6d504338d7 100644 --- a/solr/solr-ref-guide/src/how-to-contribute.adoc +++ b/solr/solr-ref-guide/src/how-to-contribute.adoc @@ -41,7 +41,8 @@ include::meta-docs/asciidoc-syntax.adoc[leveloffset=+2] include::meta-docs/editing-tools.adoc[leveloffset=+2] == Modifying the Output Formats -The Solr Reference Guide is published in two formats, HTML and PDF. Different tools are used for each. +The Solr Reference Guide is published in two formats: HTML and PDF. Different tools are used for each. + include::meta-docs/jekyll.adoc[leveloffset=+2] include::meta-docs/pdf.adoc[leveloffset=+2] diff --git a/solr/solr-ref-guide/src/solrcloud-autoscaling-policy-preferences.adoc b/solr/solr-ref-guide/src/solrcloud-autoscaling-policy-preferences.adoc index 9fafd69ce6a..6638e96f977 100644 --- a/solr/solr-ref-guide/src/solrcloud-autoscaling-policy-preferences.adoc +++ b/solr/solr-ref-guide/src/solrcloud-autoscaling-policy-preferences.adoc @@ -61,7 +61,9 @@ See the section <>. [source,json] -[{"minimize":"cores"}] +[ + {"minimize":"cores"} +] ==== Minimize Cores; Maximize Free Disk In this example, we want to minimize the number of Solr cores and in case of a tie, maximize the amount of free disk space on each node. From d0a99af7efe81f2090663f23823e8f4104b837bf Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Thu, 31 Aug 2017 13:43:28 -0500 Subject: [PATCH 05/44] Ref Guide: Doc updates for SOLR-10046, SOLR-10929, SOLR-11021 --- .../src/indexconfig-in-solrconfig.adoc | 2 +- .../src/the-query-elevation-component.adoc | 12 +++++++----- .../working-with-external-files-and-processes.adoc | 6 ++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/solr/solr-ref-guide/src/indexconfig-in-solrconfig.adoc b/solr/solr-ref-guide/src/indexconfig-in-solrconfig.adoc index d81936fb199..69e6b6ab41e 100644 --- a/solr/solr-ref-guide/src/indexconfig-in-solrconfig.adoc +++ b/solr/solr-ref-guide/src/indexconfig-in-solrconfig.adoc @@ -66,7 +66,7 @@ Defines how merging segments is done. The default in Solr is to use a `TieredMergePolicy`, which merges segments of approximately equal size, subject to an allowed number of segments per tier. -Other policies available are the `LogByteSizeMergePolicy` and `LogDocMergePolicy`. 
For more information on these policies, please see {lucene-javadocs}/core/org/apache/lucene/index/MergePolicy.html[the MergePolicy javadocs]. +Other policies available are the `LogByteSizeMergePolicy`, `LogDocMergePolicy`, and `UninvertDocValuesMergePolicy`. For more information on these policies, please see {lucene-javadocs}/core/org/apache/lucene/index/MergePolicy.html[the MergePolicy javadocs]. [source,xml] ---- diff --git a/solr/solr-ref-guide/src/the-query-elevation-component.adoc b/solr/solr-ref-guide/src/the-query-elevation-component.adoc index 638aa8163f0..3a44975f0bf 100644 --- a/solr/solr-ref-guide/src/the-query-elevation-component.adoc +++ b/solr/solr-ref-guide/src/the-query-elevation-component.adoc @@ -18,11 +18,11 @@ // specific language governing permissions and limitations // under the License. -The https://wiki.apache.org/solr/QueryElevationComponent[Query Elevation Component] lets you configure the top results for a given query regardless of the normal Lucene scoring. +The Query Elevation Component lets you configure the top results for a given query regardless of the normal Lucene scoring. -This is sometimes called "sponsored search," "editorial boosting," or "best bets." This component matches the user query text to a configured map of top results. The text can be any string or non-string IDs, as long as it's indexed. Although this component will work with any QueryParser, it makes the most sense to use with <> or <>. +This is sometimes called "sponsored search", "editorial boosting", or "best bets." This component matches the user query text to a configured map of top results. The text can be any string or non-string IDs, as long as it's indexed. Although this component will work with any QueryParser, it makes the most sense to use with <> or <>. -The https://wiki.apache.org/solr/QueryElevationComponent[Query Elevation Component] is supported by distributed searching. +The Query Elevation Component also supports distributed searching. All of the sample configuration and queries used in this section assume you are running Solr's "```techproducts```" example: @@ -71,9 +71,9 @@ Path to the file that defines query elevation. This file must exist in `>. + == Using the Query Elevation Component === The enableElevation Parameter diff --git a/solr/solr-ref-guide/src/working-with-external-files-and-processes.adoc b/solr/solr-ref-guide/src/working-with-external-files-and-processes.adoc index ac42636ca62..eb0c4c0e6d4 100644 --- a/solr/solr-ref-guide/src/working-with-external-files-and-processes.adoc +++ b/solr/solr-ref-guide/src/working-with-external-files-and-processes.adoc @@ -33,16 +33,14 @@ In `schema.xml`, the definition of this field type might look like this: [source,xml] ---- - + ---- The `keyField` attribute defines the key that will be defined in the external file. It is usually the unique key for the index, but it doesn't need to be as long as the `keyField` can be used to identify documents in the index. A `defVal` defines a default value that will be used if there is no entry in the external file for a particular document. -The `valType` attribute specifies the actual type of values that will be found in the file. The type specified must be either a float field type, so valid values for this attribute are `pfloat`, `float` or `tfloat`. This attribute can be omitted. - === Format of the External File -The file itself is located in Solr's index directory, which by default is `$SOLR_HOME/data`. 
The name of the file should be `external___fieldname__` or `external___fieldname__.*`. For the example above, then, the file could be named `external_entryRankFile` or `external_entryRankFile.txt`. +The file itself is located in Solr's index directory, which by default is `$SOLR_HOME/data`. The name of the file should be `external_fieldname_` or `external_fieldname_.*`. For the example above, then, the file could be named `external_entryRankFile` or `external_entryRankFile.txt`. [TIP] ==== From eca049e13f7ccaa5ba2993512f607980284c9357 Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Thu, 31 Aug 2017 15:07:22 -0500 Subject: [PATCH 06/44] SOLR-11305: Update field type description page to mark deprecated types; change some Trie*Field refs to *PointFields --- ...field-type-definitions-and-properties.adoc | 2 +- .../src/field-types-included-with-solr.adoc | 79 +++++++++++++------ .../src/major-changes-in-solr-7.adoc | 2 +- solr/solr-ref-guide/src/other-parsers.adoc | 2 +- .../src/putting-the-pieces-together.adoc | 4 +- solr/solr-ref-guide/src/schema-api.adoc | 3 +- .../src/working-with-dates.adoc | 2 +- 7 files changed, 63 insertions(+), 31 deletions(-) diff --git a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc index 205aa83b547..da855b9e5b4 100644 --- a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc +++ b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc @@ -62,7 +62,7 @@ The field type `class` determines most of the behavior of a field type, but opti [source,xml] ---- - ---- diff --git a/solr/solr-ref-guide/src/field-types-included-with-solr.adoc b/solr/solr-ref-guide/src/field-types-included-with-solr.adoc index 7508ef2528d..5463552d2b5 100644 --- a/solr/solr-ref-guide/src/field-types-included-with-solr.adoc +++ b/solr/solr-ref-guide/src/field-types-included-with-solr.adoc @@ -26,33 +26,66 @@ The following table lists the field types that are available in Solr. The `org.a |=== |Class |Description |BinaryField |Binary data. -|BoolField |Contains either true or false. Values of "1", "t", or "T" in the first character are interpreted as true. Any other values in the first character are interpreted as false. -|CollationField |Supports Unicode collation for sorting and range queries. ICUCollationField is a better choice if you can use ICU4J. See the section <>. -|CurrencyField |Deprecated in favor of CurrencyFieldType. -|CurrencyFieldType |Supports currencies and exchange rates. See the section <>. + +|BoolField |Contains either true or false. Values of `1`, `t`, or `T` in the first character are interpreted as `true`. Any other values in the first character are interpreted as `false`. + +|CollationField |Supports Unicode collation for sorting and range queries. The ICUCollationField is a better choice if you can use ICU4J. See the section <> for more information. + +|CurrencyField |*Deprecated*. Use CurrencyFieldType instead. + +|CurrencyFieldType |Supports currencies and exchange rates. See the section <> for more information. + |DateRangeField |Supports indexing date ranges, to include point in time date instances as well (single-millisecond durations). See the section <> for more detail on using this field type. Consider using this field type even if it's just for date instances, particularly when the queries typically fall on UTC year/month/day/hour, etc., boundaries. -|ExternalFileField |Pulls values from a file on disk. See the section <>. 
-|EnumField |Deprecated in favor of EnumFieldType -|EnumFieldType |Allows defining an enumerated set of values which may not be easily sorted by either alphabetic or numeric order (such as a list of severities, for example). This field type takes a configuration file, which lists the proper order of the field values. See the section <> for more information. -|ICUCollationField |Supports Unicode collation for sorting and range queries. See the section <>. -|LatLonPointSpatialField |<>: a latitude/longitude coordinate pair; possibly multi-valued for multiple points. Usually it's specified as "lat,lon" order with a comma. -|LatLonType |(deprecated) <>: a single-valued latitude/longitude coordinate pair. Usually it's specified as "lat,lon" order with a comma. -|PointType |<>: A single-valued n-dimensional point. It's both for sorting spatial data that is _not_ lat-lon, and for some more rare use-cases. (NOTE: this is _not_ related to the "Point" based numeric fields) -|PreAnalyzedField |Provides a way to send to Solr serialized token streams, optionally with independent stored values of a field, and have this information stored and indexed without any additional text processing. Configuration and usage of PreAnalyzedField is documented on the <> page. -|RandomSortField |Does not contain a value. Queries that sort on this field type will return results in random order. Use a dynamic field to use this feature. -|SpatialRecursivePrefixTreeFieldType |(RPT for short) <>: Accepts latitude comma longitude strings or other shapes in WKT format. -|StrField |String (UTF-8 encoded string or Unicode). Strings are intended for small fields and are _not_ tokenized or analyzed in any way. They have a hard limit of slightly less than 32K. -|TextField |Text, usually multiple words or tokens. -|TrieDateField |Date field. Represents a point in time with millisecond precision. See the section <>. `precisionStep="0"` minimizes index size; `precisionStep="8"` (the default) enables more efficient range queries. For single valued fields, use `docValues="true"` for efficient sorting. -|TrieDoubleField |Double field (64-bit IEEE floating point). `precisionStep="0"` minimizes index size; `precisionStep="8"` (the default) enables more efficient range queries. For single valued fields, use `docValues="true"` for efficient sorting. -|TrieFloatField |Floating point field (32-bit IEEE floating point) . `precisionStep="0"` enables efficient numeric sorting and minimizes index size; `precisionStep="8"` (the default) enables efficient range queries. Use `docValues="true"` for efficient sorting. For single valued fields, use `docValues="true"` for efficient sorting. -|TrieIntField |Integer field (32-bit signed integer). `precisionStep="0"` enables efficient numeric sorting and minimizes index size; `precisionStep="8"` (the default) enables efficient range queries. For single valued fields, use `docValues="true"` for efficient sorting. -|TrieLongField |Long field (64-bit signed integer). `precisionStep="0"` minimizes index size; `precisionStep="8"` (the default) enables more efficient range queries. For single valued fields, use `docValues="true"` for efficient sorting. -|TrieField |If this field type is used, a "type" attribute must also be specified, valid values are: `integer`, `long`, `float`, `double`, `date`. Using this field is the same as using any of the Trie fields mentioned above + |DatePointField |Date field. Represents a point in time with millisecond precision. See the section <>. 
This class functions similarly to TrieDateField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. + |DoublePointField |Double field (64-bit IEEE floating point). This class functions similarly to TrieDoubleField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. + +|ExternalFileField |Pulls values from a file on disk. See the section <> for more information. + +|EnumField |*Deprecated*. Use EnumFieldType instead. + +|EnumFieldType |Allows defining an enumerated set of values which may not be easily sorted by either alphabetic or numeric order (such as a list of severities, for example). This field type takes a configuration file, which lists the proper order of the field values. See the section <> for more information. + |FloatPointField |Floating point field (32-bit IEEE floating point). This class functions similarly to TrieFloatField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. + +|ICUCollationField |Supports Unicode collation for sorting and range queries. See the section <> for more information. + |IntPointField |Integer field (32-bit signed integer). This class functions similarly to TrieIntField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. + +|LatLonPointSpatialField |A latitude/longitude coordinate pair; possibly multi-valued for multiple points. Usually it's specified as "lat,lon" order with a comma. See the section <> for more information. + +|LatLonType |*Deprecated*. Consider using the LatLonPointSpatialField instead. A single-valued latitude/longitude coordinate pair. Usually it's specified as "lat,lon" order with a comma. See the section <> for more information. + |LongPointField |Long field (64-bit signed integer). This class functions similarly to TrieLongField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. -|UUIDField |Universally Unique Identifier (UUID). Pass in a value of "NEW" and Solr will create a new UUID. *Note*: configuring a UUIDField instance with a default value of "NEW" is not advisable for most users when using SolrCloud (and not possible if the UUID value is configured as the unique key field) since the result will be that each replica of each document will get a unique UUID value. Using UUIDUpdateProcessorFactory to generate UUID values when documents are added is recommended instead. + +|PointType |A single-valued n-dimensional point. It's both for sorting spatial data that is _not_ lat-lon, and for some more rare use-cases. (NOTE: this is _not_ related to the "Point" based numeric fields). See <> for more information. + +|PreAnalyzedField |Provides a way to send to Solr serialized token streams, optionally with independent stored values of a field, and have this information stored and indexed without any additional text processing. 
+ +Configuration and usage of PreAnalyzedField is documented in the section <>. + +|RandomSortField |Does not contain a value. Queries that sort on this field type will return results in random order. Use a dynamic field to use this feature. + +|SpatialRecursivePrefixTreeFieldType |(RPT for short) Accepts latitude comma longitude strings or other shapes in WKT format. See <> for more information. + +|StrField |String (UTF-8 encoded string or Unicode). Strings are intended for small fields and are _not_ tokenized or analyzed in any way. They have a hard limit of slightly less than 32K. + +|TextField |Text, usually multiple words or tokens. + +|TrieDateField |*Deprecated*. Use DatePointField instead. + +|TrieDoubleField |*Deprecated*. Use DoublePointField instead. + +|TrieFloatField |*Deprecated*. Use FloatPointField instead. + +|TrieIntField |*Deprecated*. Use IntPointField instead. + +|TrieLongField |*Deprecated*. Use LongPointField instead. + +|TrieField |*Deprecated*. This field takes a `type` parameter to define the specific class of Trie* field to use; Use an appropriate Point Field type instead. + +|UUIDField |Universally Unique Identifier (UUID). Pass in a value of `NEW` and Solr will create a new UUID. + +*Note*: configuring a UUIDField instance with a default value of `NEW` is not advisable for most users when using SolrCloud (and not possible if the UUID value is configured as the unique key field) since the result will be that each replica of each document will get a unique UUID value. Using UUIDUpdateProcessorFactory to generate UUID values when documents are added is recommended instead. |=== diff --git a/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc b/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc index 6261ef98420..68c3f129c0e 100644 --- a/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc +++ b/solr/solr-ref-guide/src/major-changes-in-solr-7.adoc @@ -126,7 +126,7 @@ The following spatial-related fields have been deprecated: Choose one of these field types instead: -* `LatLonSpatialField` +* `LatLonPointSpatialField` * `SpatialRecursivePrefixTreeField` * `RptWithGeometrySpatialField` diff --git a/solr/solr-ref-guide/src/other-parsers.adoc b/solr/solr-ref-guide/src/other-parsers.adoc index 72ea05c82e8..c339135fa8e 100644 --- a/solr/solr-ref-guide/src/other-parsers.adoc +++ b/solr/solr-ref-guide/src/other-parsers.adoc @@ -478,7 +478,7 @@ q = {!join from=id to=manu_id_s}compName_s:Belkin fq = price:[* TO 12] ---- -The join operation is done on a term basis, so the "from" and "to" fields must use compatible field types. For example: joining between a `StrField` and a `TrieIntField` will not work, likewise joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field. +The join operation is done on a term basis, so the "from" and "to" fields must use compatible field types. For example: joining between a `StrField` and a `IntPointField` will not work, likewise joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field. 
=== Join Parser Scoring diff --git a/solr/solr-ref-guide/src/putting-the-pieces-together.adoc b/solr/solr-ref-guide/src/putting-the-pieces-together.adoc index 062681407d5..c4835b51ddb 100644 --- a/solr/solr-ref-guide/src/putting-the-pieces-together.adoc +++ b/solr/solr-ref-guide/src/putting-the-pieces-together.adoc @@ -46,9 +46,9 @@ Note that the `types` and `fields` sections are optional, meaning you are free t == Choosing Appropriate Numeric Types -For general numeric needs, consider using one of the` IntPointField`, `LongPointField`, `FloatPointField`, or `DoublePointField` classes, depending on the specific values you expect. These "Dimensional Point" based numeric classes use specially encoded data structures to support efficient range queries regardless of the size of the ranges used. Enable <> on these fields as needed for sorting and/or faceting. +For general numeric needs, consider using one of the `IntPointField`, `LongPointField`, `FloatPointField`, or `DoublePointField` classes, depending on the specific values you expect. These "Dimensional Point" based numeric classes use specially encoded data structures to support efficient range queries regardless of the size of the ranges used. Enable <> on these fields as needed for sorting and/or faceting. -Some Solr features may not yet work with "Dimensional Points", in which case you may want to consider the equivalent `TrieIntField`, `TrieLongField`, `TrieFloatField`, and `TrieDoubleField` classes. Configure a `precisionStep="0"` if you wish to minimize index size, but if you expect users to make frequent range queries on numeric types, use the default `precisionStep` (by not specifying it) or specify it as `precisionStep="8"` (which is the default). This offers faster speed for range queries at the expense of increasing index size. +Some Solr features may not yet work with "Dimensional Points", in which case you may want to consider the equivalent `TrieIntField`, `TrieLongField`, `TrieFloatField`, and `TrieDoubleField` classes. These field types are deprecated and are likely to be removed in a future major Solr release, but they can still be used if necessary. Configure a `precisionStep="0"` if you wish to minimize index size, but if you expect users to make frequent range queries on numeric types, use the default `precisionStep` (by not specifying it) or specify it as `precisionStep="8"` (which is the default). This offers faster speed for range queries at the expense of increasing index size. 
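To make the numeric-type guidance above concrete, here is a minimal, hypothetical SolrJ sketch that indexes a document containing a numeric value and runs a range query against it. The range query syntax is the same whether the underlying field is one of the Point classes or a deprecated Trie class; the collection name and the `price` field are assumptions borrowed from the techproducts example and may differ in your schema.

[source,java]
----
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;

public class NumericFieldSketch {
  public static void main(String[] args) throws Exception {
    try (HttpSolrClient client =
             new HttpSolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
      // Index a document with a numeric value. Whether "price" is backed by a
      // FloatPointField or a TrieFloatField is decided entirely by the schema.
      SolrInputDocument doc = new SolrInputDocument();
      doc.addField("id", "point-field-example-1");
      doc.addField("price", 12.5f);
      client.add(doc);
      client.commit();

      // The range query below is written the same way for Point and Trie fields.
      SolrQuery query = new SolrQuery("price:[10 TO 20]");
      long hits = client.query(query).getResults().getNumFound();
      System.out.println("matching documents: " + hits);
    }
  }
}
----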
== Working With Text diff --git a/solr/solr-ref-guide/src/schema-api.adoc b/solr/solr-ref-guide/src/schema-api.adoc index c120e0a93ea..5c5a8e20559 100644 --- a/solr/solr-ref-guide/src/schema-api.adoc +++ b/solr/solr-ref-guide/src/schema-api.adoc @@ -803,7 +803,7 @@ The sample output below has been truncated to show a few different field types f "sortMissingLast": true }, { - "class": "solr.TrieFloatField", + "class": "solr.FloatPointField", "dynamicFields": [ "*_fs", "*_f" @@ -814,7 +814,6 @@ The sample output below has been truncated to show a few different field types f ], "name": "float", "positionIncrementGap": "0", - "precisionStep": "0" }] } ---- diff --git a/solr/solr-ref-guide/src/working-with-dates.adoc b/solr/solr-ref-guide/src/working-with-dates.adoc index 5f28f61e23a..4fb12d7b149 100644 --- a/solr/solr-ref-guide/src/working-with-dates.adoc +++ b/solr/solr-ref-guide/src/working-with-dates.adoc @@ -20,7 +20,7 @@ == Date Formatting -Solr's date fields (`TrieDateField`, `DatePointField` and `DateRangeField`) represent "dates" as a point in time with millisecond precision. The format used is a restricted form of the canonical representation of dateTime in the http://www.w3.org/TR/xmlschema-2/#dateTime[XML Schema specification] – a restricted subset of https://en.wikipedia.org/wiki/ISO_8601[ISO-8601]. For those familiar with Java 8, Solr uses https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT[DateTimeFormatter.ISO_INSTANT] for formatting, and parsing too with "leniency". +Solr's date fields (`DatePointField`, `DateRangeField` and the deprecated `TrieDateField`) represent "dates" as a point in time with millisecond precision. The format used is a restricted form of the canonical representation of dateTime in the http://www.w3.org/TR/xmlschema-2/#dateTime[XML Schema specification] – a restricted subset of https://en.wikipedia.org/wiki/ISO_8601[ISO-8601]. For those familiar with Java 8, Solr uses https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT[DateTimeFormatter.ISO_INSTANT] for formatting, and parsing too with "leniency". `YYYY-MM-DDThh:mm:ssZ` From efe9003043331efd1de7759e00eee69674b32a0a Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Thu, 31 Aug 2017 23:32:20 +0200 Subject: [PATCH 07/44] LUCENE-7948: Upgrade randomizedtesting to 2.5.3 (minor fixes in test filtering for IDEs). 
--- lucene/CHANGES.txt | 5 +++++ lucene/ivy-versions.properties | 2 +- lucene/licenses/randomizedtesting-runner-2.5.2.jar.sha1 | 1 - lucene/licenses/randomizedtesting-runner-2.5.3.jar.sha1 | 1 + solr/licenses/junit4-ant-2.5.2.jar.sha1 | 1 - solr/licenses/junit4-ant-2.5.3.jar.sha1 | 1 + solr/licenses/randomizedtesting-runner-2.5.2.jar.sha1 | 1 - solr/licenses/randomizedtesting-runner-2.5.3.jar.sha1 | 1 + 8 files changed, 9 insertions(+), 4 deletions(-) delete mode 100644 lucene/licenses/randomizedtesting-runner-2.5.2.jar.sha1 create mode 100644 lucene/licenses/randomizedtesting-runner-2.5.3.jar.sha1 delete mode 100644 solr/licenses/junit4-ant-2.5.2.jar.sha1 create mode 100644 solr/licenses/junit4-ant-2.5.3.jar.sha1 delete mode 100644 solr/licenses/randomizedtesting-runner-2.5.2.jar.sha1 create mode 100644 solr/licenses/randomizedtesting-runner-2.5.3.jar.sha1 diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9c4890d2c7b..577c7f4e2b7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -58,6 +58,11 @@ Build instead of locally installing first, to workaround a double repository push of *-sources.jar and *-javadoc.jar files. (Lynn Monson via Steve Rowe) +Other + +* LUCENE-7948, LUCENE-7937: Upgrade randomizedtesting to 2.5.3 (minor fixes + in test filtering for IDEs). (Mike Sokolov, Dawid Weiss) + ======================= Lucene 7.0.0 ======================= New Features diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 2026df34af8..66b7c89f9f8 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -5,7 +5,7 @@ /antlr/antlr = 2.7.7 /com.adobe.xmp/xmpcore = 5.1.2 -com.carrotsearch.randomizedtesting.version = 2.5.2 +com.carrotsearch.randomizedtesting.version = 2.5.3 /com.carrotsearch.randomizedtesting/junit4-ant = ${com.carrotsearch.randomizedtesting.version} /com.carrotsearch.randomizedtesting/randomizedtesting-runner = ${com.carrotsearch.randomizedtesting.version} diff --git a/lucene/licenses/randomizedtesting-runner-2.5.2.jar.sha1 b/lucene/licenses/randomizedtesting-runner-2.5.2.jar.sha1 deleted file mode 100644 index bc306100ad2..00000000000 --- a/lucene/licenses/randomizedtesting-runner-2.5.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -91f3284993b44dcb2f003b5f28617abba13971d2 diff --git a/lucene/licenses/randomizedtesting-runner-2.5.3.jar.sha1 b/lucene/licenses/randomizedtesting-runner-2.5.3.jar.sha1 new file mode 100644 index 00000000000..8b4a92f53fa --- /dev/null +++ b/lucene/licenses/randomizedtesting-runner-2.5.3.jar.sha1 @@ -0,0 +1 @@ +053da66a10597283d48266d1f09d572f8608ae3f diff --git a/solr/licenses/junit4-ant-2.5.2.jar.sha1 b/solr/licenses/junit4-ant-2.5.2.jar.sha1 deleted file mode 100644 index a7c360c0a32..00000000000 --- a/solr/licenses/junit4-ant-2.5.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -b8f91682cfeb8f9196aad56ace9c9a13330acef6 diff --git a/solr/licenses/junit4-ant-2.5.3.jar.sha1 b/solr/licenses/junit4-ant-2.5.3.jar.sha1 new file mode 100644 index 00000000000..f62922a62e7 --- /dev/null +++ b/solr/licenses/junit4-ant-2.5.3.jar.sha1 @@ -0,0 +1 @@ +c3809c2c2bc135109a7e1e231463da303299b5dd diff --git a/solr/licenses/randomizedtesting-runner-2.5.2.jar.sha1 b/solr/licenses/randomizedtesting-runner-2.5.2.jar.sha1 deleted file mode 100644 index bc306100ad2..00000000000 --- a/solr/licenses/randomizedtesting-runner-2.5.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -91f3284993b44dcb2f003b5f28617abba13971d2 diff --git a/solr/licenses/randomizedtesting-runner-2.5.3.jar.sha1 
b/solr/licenses/randomizedtesting-runner-2.5.3.jar.sha1 new file mode 100644 index 00000000000..8b4a92f53fa --- /dev/null +++ b/solr/licenses/randomizedtesting-runner-2.5.3.jar.sha1 @@ -0,0 +1 @@ +053da66a10597283d48266d1f09d572f8608ae3f From ded726ad82391b84d06c325d3880c78d4eed99b4 Mon Sep 17 00:00:00 2001 From: markrmiller Date: Thu, 31 Aug 2017 20:10:50 -0500 Subject: [PATCH 08/44] SOLR-11209: Try and fix sha1 file. --- solr/licenses/httpmime-4.5.3.jar.sha1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/licenses/httpmime-4.5.3.jar.sha1 b/solr/licenses/httpmime-4.5.3.jar.sha1 index 72cd5c4abb2..754ef5471cb 100644 --- a/solr/licenses/httpmime-4.5.3.jar.sha1 +++ b/solr/licenses/httpmime-4.5.3.jar.sha1 @@ -1 +1 @@ -889fd6d061bb63b99dd5c6aba35a555ae863de52 \ No newline at end of file +889fd6d061bb63b99dd5c6aba35a555ae863de52 From d7379f1dd2baee50d7dfd2fb8767c968c982c001 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 31 Aug 2017 22:29:38 -0400 Subject: [PATCH 09/44] LUCENE-7946: Fix CharTermAttribute.setLength's bounds check --- .../tokenattributes/CharTermAttributeImpl.java | 3 +++ .../tokenattributes/TestCharTermAttributeImpl.java | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java index 9a5b9fa29bb..355f4176097 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java @@ -71,6 +71,9 @@ public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttr @Override public final CharTermAttribute setLength(int length) { + if (length < 0) { + throw new IllegalArgumentException("length " + length + " must not be negative"); + } if (length > termBuffer.length) throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); termLength = length; diff --git a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java index 81dd576be08..30821291fc8 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestCharTermAttributeImpl.java @@ -42,6 +42,16 @@ public class TestCharTermAttributeImpl extends LuceneTestCase { } } + public void testSetLength() { + CharTermAttributeImpl t = new CharTermAttributeImpl(); + char[] content = "hello".toCharArray(); + t.copyBuffer(content, 0, content.length); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + t.setLength(-1); + }); + assertTrue(expected.getMessage().contains("must not be negative")); + } + public void testGrow() { CharTermAttributeImpl t = new CharTermAttributeImpl(); StringBuilder buf = new StringBuilder("ab"); From ea76351419095e156b8adc78de6dfd1e35094ee2 Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Fri, 1 Sep 2017 10:46:13 +0700 Subject: [PATCH 10/44] SOLR-11054: Revert the last commit since we found a test failure --- .../src/test/org/apache/solr/update/SoftAutoCommitTest.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/update/SoftAutoCommitTest.java 
b/solr/core/src/test/org/apache/solr/update/SoftAutoCommitTest.java index cb2affabed6..84ec63d6473 100644 --- a/solr/core/src/test/org/apache/solr/update/SoftAutoCommitTest.java +++ b/solr/core/src/test/org/apache/solr/update/SoftAutoCommitTest.java @@ -152,12 +152,6 @@ public class SoftAutoCommitTest extends AbstractSolrTestCase { minHardCommitNanos + "ns", minHardCommitNanos < firstHardNanos); - final Long firstSearcherNanos = monitor.searcher.poll(5000, MILLISECONDS); - assertNotNull("didn't get a single new searcher", firstSearcherNanos); - for (int i = 0; i <= softCommitMaxDocs; i++) { - assertQ("should find one", req("id:"+(8000 + i)) ,"//result[@numFound=1]" ); - } - // wait a bit, w/o other action we shouldn't see any new hard/soft commits assertNull("Got a hard commit we weren't expecting", monitor.hard.poll(1000, MILLISECONDS)); From 755f6cc6a80f6060e240b715b9f22b480f70d8e1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 31 Aug 2017 23:49:03 -0400 Subject: [PATCH 11/44] LUCENE-7940: removed unused import and javadocs fix so that ant precommit succeeds --- .../java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java | 4 ++-- .../org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java index 912c4dd125c..4f8ec06742d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java @@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase { /** * Creates - * {@link TokenStreamComponents} + * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. 
* - * @return {@link TokenStreamComponents} + * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter}, * {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java index 898480a73cc..e04f209746e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java @@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CharArraySet; - /** * Tests the BengaliAnalyzer From 1fbb400e6f02c1443cd84b186c9d9169c2d17e53 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 1 Sep 2017 01:22:52 -0400 Subject: [PATCH 12/44] LUCENE-7940: add more efficient (failing) test for BengaliNormalizer --- .../analysis/bn/TestBengaliNormalizer.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java index ecd11ae4ba2..b8073c9dda4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.util.TestUtil; import java.io.IOException; @@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase { check("বাড়ি", "বারি"); } + /** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */ + public void testRandom() throws IOException { + BengaliNormalizer normalizer = new BengaliNormalizer(); + for (int i = 0; i < 100000; i++) { + String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7); + try { + int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length()); + assertTrue(newLen >= 0); // should not return negative length + assertTrue(newLen <= randomBengali.length()); // should not increase length of string + } catch (Exception e) { + System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")"); + throw e; + } + } + } + private void check(String input, String output) throws IOException { Tokenizer tokenizer = whitespaceMockTokenizer(input); TokenFilter tf = new BengaliNormalizationFilter(tokenizer); From 63a0c8d92f9311823c3788ea18528fb042d9eaab Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 1 Sep 2017 09:15:20 -0400 Subject: [PATCH 13/44] LUCENE-7933: validate numBits parameter to LongBitSet ctor --- lucene/CHANGES.txt | 3 +++ .../org/apache/lucene/util/LongBitSet.java | 8 +++++++- .../apache/lucene/util/TestLongBitSet.java | 20 +++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 
577c7f4e2b7..f6a37ecd747 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -63,6 +63,9 @@ Other * LUCENE-7948, LUCENE-7937: Upgrade randomizedtesting to 2.5.3 (minor fixes in test filtering for IDEs). (Mike Sokolov, Dawid Weiss) +* LUCENE-7933: LongBitSet now validates the numBits parameter (Won + Jonghoon, Mike McCandless) + ======================= Lucene 7.0.0 ======================= New Features diff --git a/lucene/core/src/java/org/apache/lucene/util/LongBitSet.java b/lucene/core/src/java/org/apache/lucene/util/LongBitSet.java index ad8fa641c69..89b47658a1b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/LongBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/LongBitSet.java @@ -55,9 +55,15 @@ public final class LongBitSet { return new LongBitSet(arr, (long)arr.length << 6); } } + + /** The maximum {@code numBits} supported. */ + public static final long MAX_NUM_BITS = 64 * (long) ArrayUtil.MAX_ARRAY_LENGTH; - /** returns the number of 64 bit words it would take to hold numBits */ + /** Returns the number of 64 bit words it would take to hold numBits */ public static int bits2words(long numBits) { + if (numBits < 0 || numBits > MAX_NUM_BITS) { + throw new IllegalArgumentException("numBits must be 0 .. " + MAX_NUM_BITS + "; got: " + numBits); + } return (int)((numBits - 1) >> 6) + 1; // I.e.: get the word-offset of the last bit and add one (make sure to use >> so 0 returns 0!) } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java index cf4d1a74fe2..f94c97eca28 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java @@ -218,7 +218,23 @@ public class TestLongBitSet extends LuceneTestCase { assertEquals(b1.hashCode(), b2.hashCode()); } } - } + } + + public void testTooLarge() { + Exception e = expectThrows(IllegalArgumentException.class, + () -> { + new LongBitSet(LongBitSet.MAX_NUM_BITS + 1); + }); + assertEquals("numBits must be 0 .. 137438952384; got: 137438952385", e.getMessage()); + } + + public void testNegativeNumBits() { + Exception e = expectThrows(IllegalArgumentException.class, + () -> { + new LongBitSet(-17); + }); + assertEquals("numBits must be 0 .. 137438952384; got: -17", e.getMessage()); + } public void testSmallBitSets() { // Make sure size 0-10 bit sets are OK: @@ -345,6 +361,6 @@ public class TestLongBitSet extends LuceneTestCase { assertEquals(1 << (32-6), LongBitSet.bits2words(1L << 32)); assertEquals((1 << (32-6)) + 1, LongBitSet.bits2words((1L << 32)) + 1); // ... 
- assertEquals(Integer.MAX_VALUE, LongBitSet.bits2words((1L << 37) - 64)); + assertEquals(2147483631, LongBitSet.bits2words(LongBitSet.MAX_NUM_BITS)); } } From d3013ab600636c4cd958af3a395ef84e6570af6a Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Fri, 1 Sep 2017 20:21:43 +0700 Subject: [PATCH 14/44] SOLR-11244: Query DSL for Solr --- solr/CHANGES.txt | 2 + .../solr/request/json/JsonQueryConverter.java | 110 +++++++++++ .../apache/solr/request/json/RequestUtil.java | 10 +- .../apache/solr/search/BoolQParserPlugin.java | 58 ++++++ .../org/apache/solr/search/QParserPlugin.java | 1 + .../apache/solr/search/TestSmileRequest.java | 2 +- .../solr/search/json/TestJsonRequest.java | 179 +++++++++++++++++- 7 files changed, 354 insertions(+), 8 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/request/json/JsonQueryConverter.java create mode 100644 solr/core/src/java/org/apache/solr/search/BoolQParserPlugin.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4884cf37d8a..493d52f2ac4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -81,6 +81,8 @@ New Features * SOLR-11215: Make a metric accessible through a single param. (ab) +* SOLR-11244: Query DSL for Solr (Cao Manh Dat) + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/request/json/JsonQueryConverter.java b/solr/core/src/java/org/apache/solr/request/json/JsonQueryConverter.java new file mode 100644 index 00000000000..e732470749b --- /dev/null +++ b/solr/core/src/java/org/apache/solr/request/json/JsonQueryConverter.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.request.json; + +import java.util.List; +import java.util.Map; + +import org.apache.solr.common.SolrException; + +/** + * Convert json query object to local params. 
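+ * For example (an illustrative sketch, assuming the parsed JSON preserves key order):
+ * a query such as {"lucene": {"df": "cat_s", "query": "A"}} is rewritten to roughly
+ * {!lucene df=$_tt0 v=$_tt1}, with _tt0=cat_s and _tt1=A added to additionalParams.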
+ * + * @lucene.internal + */ +class JsonQueryConverter { + private int numParams = 0; + + String toLocalParams(Object jsonQueryObject, Map additionalParams) { + if (jsonQueryObject instanceof String) return jsonQueryObject.toString(); + StringBuilder builder = new StringBuilder(); + buildLocalParams(builder, jsonQueryObject, true, additionalParams); + return builder.toString(); + } + + private String putParam(String val, Map additionalParams) { + String name = "_tt"+(numParams++); + additionalParams.put(name, new String[]{val}); + return name; + } + + private void buildLocalParams(StringBuilder builder, Object val, boolean isQParser, Map additionalParams) { + if (!isQParser && !(val instanceof Map)) { + // val is value of a query parser, and it is not a map + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Error when parsing json query, expect a json object here, but found : "+val); + } + if (val instanceof String) { + builder.append('$').append(putParam(val.toString(), additionalParams)); + return; + } + if (val instanceof Number) { + builder.append(val); + return; + } + if (!(val instanceof Map)) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Error when parsing json query, expect a json object here, but found : "+val); + } + + Map map = (Map) val; + if (isQParser) { + if (map.size() != 1) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Error when parsing json query, expect only one query parser here, but found : "+map.keySet()); + } + String qtype = map.keySet().iterator().next(); + Object subVal = map.get(qtype); + + // We don't want to introduce unnecessary variable at root level + boolean useSubBuilder = builder.length() > 0; + StringBuilder subBuilder = builder; + + if (useSubBuilder) subBuilder = new StringBuilder(); + + subBuilder = subBuilder.append("{!").append(qtype).append(' ');; + buildLocalParams(subBuilder, subVal, false, additionalParams); + subBuilder.append("}"); + + if (useSubBuilder) builder.append('$').append(putParam(subBuilder.toString(), additionalParams)); + } else { + for (Map.Entry entry : map.entrySet()) { + String key = entry.getKey(); + if (entry.getValue() instanceof List) { + if (key.equals("query")) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Error when parsing json query, value of query field should not be a list, found : " + entry.getValue()); + } + List l = (List) entry.getValue(); + for (Object subVal : l) { + builder.append(key).append("="); + buildLocalParams(builder, subVal, true, additionalParams); + builder.append(" "); + } + } else { + if (key.equals("query")) { + key = "v"; + } + builder.append(key).append("="); + buildLocalParams(builder, entry.getValue(), true, additionalParams); + builder.append(" "); + } + } + } + } +} diff --git a/solr/core/src/java/org/apache/solr/request/json/RequestUtil.java b/solr/core/src/java/org/apache/solr/request/json/RequestUtil.java index ac0dc1951ab..6e7e02a69ed 100644 --- a/solr/core/src/java/org/apache/solr/request/json/RequestUtil.java +++ b/solr/core/src/java/org/apache/solr/request/json/RequestUtil.java @@ -190,16 +190,20 @@ public class RequestUtil { } // implement compat for existing components... 
+ JsonQueryConverter jsonQueryConverter = new JsonQueryConverter(); if (json != null && !isShard) { for (Map.Entry entry : json.entrySet()) { String key = entry.getKey(); String out = null; + boolean isQuery = false; boolean arr = false; if ("query".equals(key)) { out = "q"; + isQuery = true; } else if ("filter".equals(key)) { out = "fq"; arr = true; + isQuery = true; } else if ("fields".equals(key)) { out = "fl"; arr = true; @@ -230,14 +234,14 @@ public class RequestUtil { if (lst != null) { for (int i = 0; i < jsonSize; i++) { Object v = lst.get(i); - newval[existingSize + i] = v.toString(); + newval[existingSize + i] = isQuery ? jsonQueryConverter.toLocalParams(v, newMap) : v.toString(); } } else { - newval[newval.length-1] = val.toString(); + newval[newval.length-1] = isQuery ? jsonQueryConverter.toLocalParams(val, newMap) : val.toString(); } newMap.put(out, newval); } else { - newMap.put(out, new String[]{val.toString()}); + newMap.put(out, new String[]{isQuery ? jsonQueryConverter.toLocalParams(val, newMap) : val.toString()}); } } diff --git a/solr/core/src/java/org/apache/solr/search/BoolQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/BoolQParserPlugin.java new file mode 100644 index 00000000000..c0bebe5329e --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/BoolQParserPlugin.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search; + +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; + +/** + * Create a boolean query from sub queries. + * Sub queries can be marked as must, must_not, filter or should + * + *

Example: {!bool should=title:lucene should=title:solr must_not=id:1} + */ +public class BoolQParserPlugin extends QParserPlugin { + public static final String NAME = "bool"; + + @Override + public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { + return new QParser(qstr, localParams, params, req) { + @Override + public Query parse() throws SyntaxError { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + SolrParams solrParams = SolrParams.wrapDefaults(localParams, params); + addQueries(builder, solrParams.getParams("must"), BooleanClause.Occur.MUST); + addQueries(builder, solrParams.getParams("must_not"), BooleanClause.Occur.MUST_NOT); + addQueries(builder, solrParams.getParams("filter"), BooleanClause.Occur.FILTER); + addQueries(builder, solrParams.getParams("should"), BooleanClause.Occur.SHOULD); + return builder.build(); + } + + private void addQueries(BooleanQuery.Builder builder, String[] subQueries, BooleanClause.Occur occur) throws SyntaxError { + if (subQueries != null) { + for (String subQuery : subQueries) { + builder.add(subQuery(subQuery, null).parse(), occur); + } + } + } + }; + } +} diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java index 2ee63cf65ea..893783d8e3a 100644 --- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java @@ -81,6 +81,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI map.put(SignificantTermsQParserPlugin.NAME, SignificantTermsQParserPlugin.class); map.put(PayloadScoreQParserPlugin.NAME, PayloadScoreQParserPlugin.class); map.put(PayloadCheckQParserPlugin.NAME, PayloadCheckQParserPlugin.class); + map.put(BoolQParserPlugin.NAME, BoolQParserPlugin.class); standardPlugins = Collections.unmodifiableMap(map); } diff --git a/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java b/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java index c6d72ee3497..2e157c7f657 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java +++ b/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java @@ -81,7 +81,7 @@ public class TestSmileRequest extends SolrTestCaseJ4 { } }; client.queryDefaults().set("shards", servers.getShards()); - TestJsonRequest.doJsonRequest(client); + TestJsonRequest.doJsonRequest(client, false); } diff --git a/solr/core/src/test/org/apache/solr/search/json/TestJsonRequest.java b/solr/core/src/test/org/apache/solr/search/json/TestJsonRequest.java index 9c151c1d133..4f47f8a2652 100644 --- a/solr/core/src/test/org/apache/solr/search/json/TestJsonRequest.java +++ b/solr/core/src/test/org/apache/solr/search/json/TestJsonRequest.java @@ -53,7 +53,7 @@ public class TestJsonRequest extends SolrTestCaseHS { @Test public void testLocalJsonRequest() throws Exception { - doJsonRequest(Client.localClient); + doJsonRequest(Client.localClient, false); } @Test @@ -62,11 +62,10 @@ public class TestJsonRequest extends SolrTestCaseHS { initServers(); Client client = servers.getClient( random().nextInt() ); client.queryDefaults().set( "shards", servers.getShards() ); - doJsonRequest(client); + doJsonRequest(client, true); } - - public static void doJsonRequest(Client client) throws Exception { + public static void doJsonRequest(Client client, boolean isDistrib) throws Exception { client.deleteByQuery("*:*", null); client.add(sdoc("id", "1", "cat_s", "A", "where_s", "NY"), 
null); client.add(sdoc("id", "2", "cat_s", "B", "where_s", "NJ"), null); @@ -217,6 +216,178 @@ public class TestJsonRequest extends SolrTestCaseHS { , "debug/json=={query:'cat_s:A', filter:'where_s:NY'}" ); + // test query dsl + client.testJQ( params("json", "{'query':'{!lucene}id:1'}") + , "response/numFound==1" + ); + + client.testJQ( params("json", "{" + + " 'query': {" + + " 'bool' : {" + + " 'should' : [" + + " {'lucene' : {'query' : 'id:1'}}," + + " 'id:2'" + + " ]" + + " }" + + " }" + + "}") + , "response/numFound==2" + ); + + client.testJQ( params("json", "{" + + " 'query': {" + + " 'bool' : {" + + " 'should' : [" + + " 'id:1'," + + " 'id:2'" + + " ]" + + " }" + + " }" + + "}") + , "response/numFound==2" + ); + + client.testJQ( params("json", "{ " + + " query : {" + + " boost : {" + + " query : {" + + " lucene : { " + + " df : cat_s, " + + " query : A " + + " }" + + " }, " + + " b : 1.5 " + + " } " + + " } " + + "}") + , "response/numFound==2" + ); + + client.testJQ( params("json","{ " + + " query : {" + + " bool : {" + + " must : {" + + " lucene : {" + + " q.op : AND," + + " df : cat_s," + + " query : A" + + " }" + + " }" + + " must_not : {lucene : {query:'id: 1'}}" + + " }" + + " }" + + "}") + , "response/numFound==1" + ); + + client.testJQ( params("json","{ " + + " query : {" + + " bool : {" + + " must : {" + + " lucene : {" + + " q.op : AND," + + " df : cat_s," + + " query : A" + + " }" + + " }" + + " must_not : [{lucene : {query:'id: 1'}}]" + + " }" + + " }" + + "}") + , "response/numFound==1" + ); + + client.testJQ( params("json","{ " + + " query : {" + + " bool : {" + + " must : '{!lucene q.op=AND df=cat_s}A'" + + " must_not : '{!lucene v=\\'id:1\\'}'" + + " }" + + " }" + + "}") + , "response/numFound==1" + ); + + + client.testJQ( params("json","{" + + " query : '*:*'," + + " filter : {" + + " collapse : {" + + " field : cat_s" + + " } " + + " } " + + "}") + , isDistrib ? "" : "response/numFound==2" + ); + + client.testJQ( params("json","{" + + " query : {" + + " edismax : {" + + " query : 'A'," + + " qf : 'cat_s'," + + " bq : {" + + " edismax : {" + + " query : 'NJ'" + + " qf : 'where_s'" + + " }" + + " }" + + " }" + + " }, " + + " fields : id" + + "}") + , "response/numFound==2", isDistrib? "" : "response/docs==[{id:'4'},{id:'1'}]" + ); + + client.testJQ( params("json","{" + + " query : {" + + " edismax : {" + + " query : 'A'," + + " qf : 'cat_s'," + + " bq : {" + + " edismax : {" + + " query : 'NY'" + + " qf : 'where_s'" + + " }" + + " }" + + " }" + + " }, " + + " fields : id" + + "}") + , "response/numFound==2", isDistrib? "" : "response/docs==[{id:'1'},{id:'4'}]" + ); + + client.testJQ( params("json","{" + + " query : {" + + " dismax : {" + + " query : 'A NJ'" + + " qf : 'cat_s^0.1 where_s^100'" + + " } " + + " }, " + + " filter : '-id:2'," + + " fields : id" + + "}") + , "response/numFound==3", isDistrib? "" : "response/docs==[{id:'4'},{id:'5'},{id:'1'}]" + ); + + client.testJQ( params("json","{" + + " query : {" + + " dismax : {" + + " query : 'A NJ'" + + " qf : ['cat_s^100', 'where_s^0.1']" + + " } " + + " }, " + + " filter : '-id:2'," + + " fields : id" + + "}") + , "response/numFound==3", isDistrib? 
"" : "response/docs==[{id:'4'},{id:'1'},{id:'5'}]" + ); + + try { + client.testJQ(params("json", "{query:{'lucene':'id:1'}}")); + fail(); + } catch (Exception e) { + assertTrue(e.getMessage().contains("id:1")); + } try { // test failure on unknown parameter From 1d7809bec1af29a1405a64d3f4f51d64756834ba Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 1 Sep 2017 09:24:14 -0400 Subject: [PATCH 15/44] remove test-specific code in NRT replicator --- .../org/apache/lucene/replicator/nrt/PrimaryNode.java | 10 ---------- .../org/apache/lucene/replicator/nrt/ReplicaNode.java | 9 --------- 2 files changed, 19 deletions(-) diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java index ee770cba51a..290195f3144 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java @@ -113,14 +113,6 @@ public abstract class PrimaryNode extends Node { setCurrentInfos(Collections.emptySet()); message("init: infos version=" + curInfos.getVersion()); - IndexSearcher s = mgr.acquire(); - try { - // TODO: this is test code specific!! - message("init: marker count: " + s.count(new TermQuery(new Term("marker", "marker")))); - } finally { - mgr.release(s); - } - } catch (Throwable t) { message("init: exception"); t.printStackTrace(printStream); @@ -231,8 +223,6 @@ public abstract class PrimaryNode extends Node { try { searcher = mgr.acquire(); infos = ((StandardDirectoryReader) searcher.getIndexReader()).getSegmentInfos(); - // TODO: this is test code specific!! - message("setCurrentInfos: marker count: " + searcher.count(new TermQuery(new Term("marker", "marker"))) + " version=" + infos.getVersion() + " searcher=" + searcher); } finally { if (searcher != null) { mgr.release(searcher); diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java index 2567fd9b9e9..d546c176a86 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java @@ -287,15 +287,6 @@ public abstract class ReplicaNode extends Node { // Finally, we are open for business, since our index now "agrees" with the primary: mgr = new SegmentInfosSearcherManager(dir, this, infos, searcherFactory); - IndexSearcher searcher = mgr.acquire(); - try { - // TODO: this is test specific: - int hitCount = searcher.count(new TermQuery(new Term("marker", "marker"))); - message("top: marker count=" + hitCount + " version=" + ((DirectoryReader) searcher.getIndexReader()).getVersion()); - } finally { - mgr.release(searcher); - } - // Must commit after init mgr: if (doCommit) { // Very important to commit what we just sync'd over, because we removed the pre-existing commit point above if we had to From 6f6cdf963ad289d7d392fa9b1896746e692088de Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Fri, 1 Sep 2017 20:46:42 +0700 Subject: [PATCH 16/44] SOLR-11244: Fix precommit --- solr/core/src/test/org/apache/solr/search/TestSmileRequest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java b/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java index 2e157c7f657..0bf46e756d3 100644 --- 
a/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java +++ b/solr/core/src/test/org/apache/solr/search/TestSmileRequest.java @@ -81,7 +81,7 @@ public class TestSmileRequest extends SolrTestCaseJ4 { } }; client.queryDefaults().set("shards", servers.getShards()); - TestJsonRequest.doJsonRequest(client, false); + TestJsonRequest.doJsonRequest(client, true); } From f0ed8a9168bae2394af02f1b312f750e5962e286 Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Fri, 1 Sep 2017 09:11:22 -0500 Subject: [PATCH 17/44] SOLR-11305: TrieField deprecation cleanup in several pages --- solr/solr-ref-guide/src/docvalues.adoc | 24 ++++++++++--------- .../src/field-types-included-with-solr.adoc | 10 ++++---- solr/solr-ref-guide/src/function-queries.adoc | 2 +- .../src/the-extended-dismax-query-parser.adoc | 21 ++++------------ .../src/the-standard-query-parser.adoc | 6 +++-- 5 files changed, 27 insertions(+), 36 deletions(-) diff --git a/solr/solr-ref-guide/src/docvalues.adoc b/solr/solr-ref-guide/src/docvalues.adoc index 4077d1a66b2..4d6ea83c8b9 100644 --- a/solr/solr-ref-guide/src/docvalues.adoc +++ b/solr/solr-ref-guide/src/docvalues.adoc @@ -44,16 +44,18 @@ If you have already indexed data into your Solr index, you will need to complete DocValues are only available for specific field types. The types chosen determine the underlying Lucene docValue type that will be used. The available Solr field types are: -* `StrField` and `UUIDField`. -** If the field is single-valued (i.e., multi-valued is false), Lucene will use the SORTED type. -** If the field is multi-valued, Lucene will use the SORTED_SET type. -* Any `Trie*` numeric fields, date fields and `EnumFieldType`. -** If the field is single-valued (i.e., multi-valued is false), Lucene will use the NUMERIC type. -** If the field is multi-valued, Lucene will use the SORTED_SET type. -* Boolean fields -* Int|Long|Float|Double|Date PointField -** If the field is single-valued (i.e., multi-valued is false), Lucene will use the NUMERIC type. -** If the field is multi-valued, Lucene will use the SORTED_NUMERIC type. +* `StrField` and `UUIDField`: +** If the field is single-valued (i.e., multi-valued is false), Lucene will use the `SORTED` type. +** If the field is multi-valued, Lucene will use the `SORTED_SET` type. +* `BoolField`: +** If the field is single-valued (i.e., multi-valued is false), Lucene will use the `SORTED` type. +** If the field is multi-valued, Lucene will use the `SORTED_BINARY` type. +* Any `*PointField` Numeric or Date fields, `EnumFieldType`, and `CurrencyFieldType`: +** If the field is single-valued (i.e., multi-valued is false), Lucene will use the `NUMERIC` type. +** If the field is multi-valued, Lucene will use the `SORTED_NUMERIC` type. +* Any of the deprecated `Trie*` Numeric or Date fields, `EnumField` and `CurrencyField`: +** If the field is single-valued (i.e., multi-valued is false), Lucene will use the `NUMERIC` type. +** If the field is multi-valued, Lucene will use the `SORTED_SET` type. These Lucene types are related to how the {lucene-javadocs}/core/org/apache/lucene/index/DocValuesType.html[values are sorted and stored]. @@ -86,4 +88,4 @@ In cases where the query is returning _only_ docValues fields performance may im When retrieving fields from their docValues form (using the <>, <> or if the field is requested in the `fl` parameter), two important differences between regular stored fields and docValues fields must be understood: 1. Order is _not_ preserved. 
For simply retrieving stored fields, the insertion order is the return order. For docValues, it is the _sorted_ order. -2. Multiple identical entries are collapsed into a single value. Thus if I insert values 4, 5, 2, 4, 1, my return will be 1, 2, 4, 5. +2. For field types using `SORTED_SET`, multiple identical entries are collapsed into a single value. Thus if I insert values 4, 5, 2, 4, 1, my return will be 1, 2, 4, 5. diff --git a/solr/solr-ref-guide/src/field-types-included-with-solr.adoc b/solr/solr-ref-guide/src/field-types-included-with-solr.adoc index 5463552d2b5..39ddb955fbd 100644 --- a/solr/solr-ref-guide/src/field-types-included-with-solr.adoc +++ b/solr/solr-ref-guide/src/field-types-included-with-solr.adoc @@ -37,9 +37,9 @@ The following table lists the field types that are available in Solr. The `org.a |DateRangeField |Supports indexing date ranges, to include point in time date instances as well (single-millisecond durations). See the section <> for more detail on using this field type. Consider using this field type even if it's just for date instances, particularly when the queries typically fall on UTC year/month/day/hour, etc., boundaries. -|DatePointField |Date field. Represents a point in time with millisecond precision. See the section <>. This class functions similarly to TrieDateField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. +|DatePointField |Date field. Represents a point in time with millisecond precision, encoded using a "Dimensional Points" based data structure that allows for very efficient searches for specific values, or ranges of values. See the section <> for more details on the supported syntax. For single valued fields, `docValues="true"` must be used to enable sorting. -|DoublePointField |Double field (64-bit IEEE floating point). This class functions similarly to TrieDoubleField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. +|DoublePointField |Double field (64-bit IEEE floating point). This class encodes double values using a "Dimensional Points" based data structure that allows for very efficient searches for specific values, or ranges of values. For single valued fields, `docValues="true"` must be used to enable sorting. |ExternalFileField |Pulls values from a file on disk. See the section <> for more information. @@ -47,17 +47,17 @@ The following table lists the field types that are available in Solr. The `org.a |EnumFieldType |Allows defining an enumerated set of values which may not be easily sorted by either alphabetic or numeric order (such as a list of severities, for example). This field type takes a configuration file, which lists the proper order of the field values. See the section <> for more information. -|FloatPointField |Floating point field (32-bit IEEE floating point). This class functions similarly to TrieFloatField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. +|FloatPointField |Floating point field (32-bit IEEE floating point). 
This class encodes float values using a "Dimensional Points" based data structure that allows for very efficient searches for specific values, or ranges of values. For single valued fields, `docValues="true"` must be used to enable sorting. |ICUCollationField |Supports Unicode collation for sorting and range queries. See the section <> for more information. -|IntPointField |Integer field (32-bit signed integer). This class functions similarly to TrieIntField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. +|IntPointField |Integer field (32-bit signed integer). This class encodes int values using a "Dimensional Points" based data structure that allows for very efficient searches for specific values, or ranges of values. For single valued fields, `docValues="true"` must be used to enable sorting. |LatLonPointSpatialField |A latitude/longitude coordinate pair; possibly multi-valued for multiple points. Usually it's specified as "lat,lon" order with a comma. See the section <> for more information. |LatLonType |*Deprecated*. Consider using the LatLonPointSpatialField instead. A single-valued latitude/longitude coordinate pair. Usually it's specified as "lat,lon" order with a comma. See the section <> for more information. -|LongPointField |Long field (64-bit signed integer). This class functions similarly to TrieLongField, but using a "Dimensional Points" based data structure instead of indexed terms, and doesn't require configuration of a precision step. For single valued fields, `docValues="true"` must be used to enable sorting. +|LongPointField |Long field (64-bit signed integer). This class encodes foo values using a "Dimensional Points" based data structure that allows for very efficient searches for specific values, or ranges of values. For single valued fields, `docValues="true"` must be used to enable sorting. |PointType |A single-valued n-dimensional point. It's both for sorting spatial data that is _not_ lat-lon, and for some more rare use-cases. (NOTE: this is _not_ related to the "Point" based numeric fields). See <> for more information. diff --git a/solr/solr-ref-guide/src/function-queries.adoc b/solr/solr-ref-guide/src/function-queries.adoc index 11dfb08f301..f1684c9e229 100644 --- a/solr/solr-ref-guide/src/function-queries.adoc +++ b/solr/solr-ref-guide/src/function-queries.adoc @@ -254,7 +254,7 @@ Use the `field(myfield,min)` <>. +Arguments may be the name of a `DatePointField`, `TrieDateField`, or date math based on a <>. * `ms()`: Equivalent to `ms(NOW)`, number of milliseconds since the epoch. * `ms(a):` Returns the number of milliseconds since the epoch that the argument represents. diff --git a/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc b/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc index 4b042bdd7c1..ea465e2cfdd 100644 --- a/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc +++ b/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc @@ -22,10 +22,10 @@ The Extended DisMax (eDisMax) query parser is an improved version of the <>. -* supports queries such as AND, OR, NOT, -, and +. -* optionally treats "and" and "or" as "AND" and "OR" in Lucene syntax mode. -* respects the 'magic field' names `\_val_` and `\_query_`. 
These are not a real fields in the Schema, but if used it helps do special things (like a function query in the case of `\_val_` or a nested query in the case of `\_query_`). If `\_val_` is used in a term or phrase query, the value is parsed as a function. +* supports the full Lucene query parser syntax with the same enhancements as <>. +** supports queries such as AND, OR, NOT, -, and +. +** optionally treats "and" and "or" as "AND" and "OR" in Lucene syntax mode. +** respects the 'magic field' names `\_val_` and `\_query_`. These are not a real fields in the Schema, but if used it helps do special things (like a function query in the case of `\_val_` or a nested query in the case of `\_query_`). If `\_val_` is used in a term or phrase query, the value is parsed as a function. * includes improved smart partial escaping in the case of syntax errors; fielded queries, +/-, and phrase queries are still supported in this mode. * improves proximity boosting by using word shingles; you do not need the query to match all words in the document before proximity boosting is applied. * includes advanced stopword handling: stopwords are not required in the mandatory part of the query but are still used in the proximity boosting part. If a query consists of all stopwords, such as "to be or not to be", then all words are required. @@ -218,16 +218,3 @@ _val_:"recip(rord(myfield),1,2,3)" _query_:"{!dismax qf=myfield}how now brown cow" ---- -Although not technically a syntax difference, note that if you use the Solr {solr-javadocs}/solr-core/org/apache/solr/schema/TrieDateField.html[`TrieDateField`] type, any queries on those fields (typically range queries) should use either the Complete ISO 8601 Date syntax that field supports, or the {solr-javadocs}/solr-core/org/apache/solr/util/DateMathParser.html[DateMath Syntax] to get relative dates. For example: - -[source,text] ----- -timestamp:[* TO NOW] -createdate:[1976-03-06T23:59:59.999Z TO *] -createdate:[1995-12-31T23:59:59.999Z TO 2007-03-06T00:00:00Z] -pubdate:[NOW-1YEAR/DAY TO NOW/DAY+1DAY] -createdate:[1976-03-06T23:59:59.999Z TO 1976-03-06T23:59:59.999Z+1YEAR] -createdate:[1976-03-06T23:59:59.999Z/YEAR TO 1976-03-06T23:59:59.999Z] ----- - -IMPORTANT: `TO` must be uppercase, or Solr will report a 'Range Group' error. diff --git a/solr/solr-ref-guide/src/the-standard-query-parser.adoc b/solr/solr-ref-guide/src/the-standard-query-parser.adoc index 7c49d623c9a..b2db25cc8f2 100644 --- a/solr/solr-ref-guide/src/the-standard-query-parser.adoc +++ b/solr/solr-ref-guide/src/the-standard-query-parser.adoc @@ -350,11 +350,13 @@ This can even be used to cache individual clauses of complex filter queries. In === Specifying Dates and Times -Queries against fields using the `TrieDateField` type (typically range queries) should use the <>: +Queries against date based fields must use the <>. 
Queries for exact date values will require quoting or escaping since `:` is the parser syntax used to denote a field query: -* `timestamp:[* TO NOW]` +* `createdate:1976-03-06T23\:59\:59.999Z` +* `createdate:"1976-03-06T23:59:59.999Z"` * `createdate:[1976-03-06T23:59:59.999Z TO *]` * `createdate:[1995-12-31T23:59:59.999Z TO 2007-03-06T00:00:00Z]` +* `timestamp:[* TO NOW]` * `pubdate:[NOW-1YEAR/DAY TO NOW/DAY+1DAY]` * `createdate:[1976-03-06T23:59:59.999Z TO 1976-03-06T23:59:59.999Z+1YEAR]` * `createdate:[1976-03-06T23:59:59.999Z/YEAR TO 1976-03-06T23:59:59.999Z]` From c141094f6bec699c78a7513d0ccef8b958c4606e Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Fri, 1 Sep 2017 21:37:15 +0700 Subject: [PATCH 18/44] SOLR-11244: Add test for bool qparser in QueryEqualityTest --- .../apache/solr/search/QueryEqualityTest.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java index 6e747001cd5..eab254fb3f4 100644 --- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java +++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java @@ -1190,6 +1190,27 @@ public class QueryEqualityTest extends SolrTestCaseJ4 { } } + public void testBoolQuery() throws Exception { + assertQueryEquals("bool", + "{!bool must='{!lucene}foo_s:a' must='{!lucene}foo_s:b'}", + "{!bool must='{!lucene}foo_s:b' must='{!lucene}foo_s:a'}"); + assertQueryEquals("bool", + "{!bool must_not='{!lucene}foo_s:a' should='{!lucene}foo_s:b' " + + "must='{!lucene}foo_s:c' filter='{!lucene}foo_s:d' filter='{!lucene}foo_s:e'}", + "{!bool must='{!lucene}foo_s:c' filter='{!lucene}foo_s:d' " + + "must_not='{!lucene}foo_s:a' should='{!lucene}foo_s:b' filter='{!lucene}foo_s:e'}"); + try { + assertQueryEquals + ("bool" + , "{!bool must='{!lucene}foo_s:a'}" + , "{!bool should='{!lucene}foo_s:a'}" + ); + fail("queries should not have been equal"); + } catch(AssertionFailedError e) { + assertTrue("queries were not equal, as expected", true); + } + } + // Override req to add df param public static SolrQueryRequest req(String... 
q) { return SolrTestCaseJ4.req(q, "df", "text"); From 5e3c64a1cde9f0aa5225b860749ea48648dfdd7c Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Fri, 1 Sep 2017 22:08:45 +0530 Subject: [PATCH 19/44] SOLR-11309: Split up the Read and Write Side Fault Tolerance ref guide page --- .../src/how-solrcloud-works.adoc | 3 +- ...loud-query-routing-and-read-tolerance.adoc | 85 +++++++++++++++++++ ...cloud-recoveries-and-write-tolerance.adoc} | 74 +--------------- .../src/solrcloud-resilience.adoc | 28 ++++++ solr/solr-ref-guide/src/solrcloud.adoc | 6 +- .../src/update-request-processors.adoc | 2 +- .../src/updatehandlers-in-solrconfig.adoc | 2 +- 7 files changed, 124 insertions(+), 76 deletions(-) create mode 100644 solr/solr-ref-guide/src/solrcloud-query-routing-and-read-tolerance.adoc rename solr/solr-ref-guide/src/{read-and-write-side-fault-tolerance.adoc => solrcloud-recoveries-and-write-tolerance.adoc} (55%) create mode 100644 solr/solr-ref-guide/src/solrcloud-resilience.adoc diff --git a/solr/solr-ref-guide/src/how-solrcloud-works.adoc b/solr/solr-ref-guide/src/how-solrcloud-works.adoc index 5e364ce6307..d18d1b96b60 100644 --- a/solr/solr-ref-guide/src/how-solrcloud-works.adoc +++ b/solr/solr-ref-guide/src/how-solrcloud-works.adoc @@ -1,7 +1,7 @@ = How SolrCloud Works :page-shortname: how-solrcloud-works :page-permalink: how-solrcloud-works.html -:page-children: shards-and-indexing-data-in-solrcloud, distributed-requests, read-and-write-side-fault-tolerance +:page-children: shards-and-indexing-data-in-solrcloud, distributed-requests // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -23,7 +23,6 @@ The following sections cover provide general information about how various SolrC * <> * <> -* <> If you are already familiar with SolrCloud concepts and basic functionality, you can skip to the section covering <>. diff --git a/solr/solr-ref-guide/src/solrcloud-query-routing-and-read-tolerance.adoc b/solr/solr-ref-guide/src/solrcloud-query-routing-and-read-tolerance.adoc new file mode 100644 index 00000000000..7cfc1cdecfb --- /dev/null +++ b/solr/solr-ref-guide/src/solrcloud-query-routing-and-read-tolerance.adoc @@ -0,0 +1,85 @@ += SolrCloud Query Routing And Read Tolerance +:page-shortname: solrcloud-query-routing-and-read-tolerance +:page-permalink: solrcloud-query-routing-and-read-tolerance.html +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +SolrCloud is highly available and fault tolerant in reads and writes. + + +== Read Side Fault Tolerance + +In a SolrCloud cluster each individual node load balances read requests across all the replicas in collection. 
You still need a load balancer on the 'outside' that talks to the cluster, or you need a smart client which understands how to read and interact with Solr's metadata in ZooKeeper and only requests the ZooKeeper ensemble's address to start discovering to which nodes it should send requests. (Solr provides a smart Java SolrJ client called {solr-javadocs}/solr-solrj/org/apache/solr/client/solrj/impl/CloudSolrClient.html[CloudSolrClient].) + +Even if some nodes in the cluster are offline or unreachable, a Solr node will be able to correctly respond to a search request as long as it can communicate with at least one replica of every shard, or one replica of every _relevant_ shard if the user limited the search via the `shards` or `\_route_` parameters. The more replicas there are of every shard, the more likely that the Solr cluster will be able to handle search results in the event of node failures. + +=== zkConnected + +A Solr node will return the results of a search request as long as it can communicate with at least one replica of every shard that it knows about, even if it can _not_ communicate with ZooKeeper at the time it receives the request. This is normally the preferred behavior from a fault tolerance standpoint, but may result in stale or incorrect results if there have been major changes to the collection structure that the node has not been informed of via ZooKeeper (i.e., shards may have been added or removed, or split into sub-shards) + +A `zkConnected` header is included in every search response indicating if the node that processed the request was connected with ZooKeeper at the time: + +.Solr Response with partialResults +[source,json] +---- +{ + "responseHeader": { + "status": 0, + "zkConnected": true, + "QTime": 20, + "params": { + "q": "*:*" + } + }, + "response": { + "numFound": 107, + "start": 0, + "docs": [ "..." ] + } +} +---- + +=== shards.tolerant + +In the event that one or more shards queried are completely unavailable, then Solr's default behavior is to fail the request. However, there are many use-cases where partial results are acceptable and so Solr provides a boolean `shards.tolerant` parameter (default `false`). + +If `shards.tolerant=true` then partial results may be returned. If the returned response does not contain results from all the appropriate shards then the response header contains a special flag called `partialResults`. + +The client can specify '<>' along with the `shards.tolerant` parameter to retrieve more fine-grained details. + +Example response with `partialResults` flag set to 'true': + +*Solr Response with partialResults* + +[source,json] +---- +{ + "responseHeader": { + "status": 0, + "zkConnected": true, + "partialResults": true, + "QTime": 20, + "params": { + "q": "*:*" + } + }, + "response": { + "numFound": 77, + "start": 0, + "docs": [ "..." 
] + } +} \ No newline at end of file diff --git a/solr/solr-ref-guide/src/read-and-write-side-fault-tolerance.adoc b/solr/solr-ref-guide/src/solrcloud-recoveries-and-write-tolerance.adoc similarity index 55% rename from solr/solr-ref-guide/src/read-and-write-side-fault-tolerance.adoc rename to solr/solr-ref-guide/src/solrcloud-recoveries-and-write-tolerance.adoc index 9f9f0412ec7..70c120a20ff 100644 --- a/solr/solr-ref-guide/src/read-and-write-side-fault-tolerance.adoc +++ b/solr/solr-ref-guide/src/solrcloud-recoveries-and-write-tolerance.adoc @@ -1,6 +1,6 @@ -= Read and Write Side Fault Tolerance -:page-shortname: read-and-write-side-fault-tolerance -:page-permalink: read-and-write-side-fault-tolerance.html += SolrCloud Recoveries and Write Tolerance +:page-shortname: solrcloud-recoveries-and-write-tolerance +:page-permalink: solrcloud-recoveries-and-write-tolerance.html // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -18,73 +18,7 @@ // specific language governing permissions and limitations // under the License. -SolrCloud supports elasticity, high availability, and fault tolerance in reads and writes. - -What this means, basically, is that when you have a large cluster, you can always make requests to the cluster: Reads will return results whenever possible, even if some nodes are down, and Writes will be acknowledged only if they are durable; i.e., you won't lose data. - -== Read Side Fault Tolerance - -In a SolrCloud cluster each individual node load balances read requests across all the replicas in collection. You still need a load balancer on the 'outside' that talks to the cluster, or you need a smart client which understands how to read and interact with Solr's metadata in ZooKeeper and only requests the ZooKeeper ensemble's address to start discovering to which nodes it should send requests. (Solr provides a smart Java SolrJ client called {solr-javadocs}/solr-solrj/org/apache/solr/client/solrj/impl/CloudSolrClient.html[CloudSolrClient].) - -Even if some nodes in the cluster are offline or unreachable, a Solr node will be able to correctly respond to a search request as long as it can communicate with at least one replica of every shard, or one replica of every _relevant_ shard if the user limited the search via the `shards` or `\_route_` parameters. The more replicas there are of every shard, the more likely that the Solr cluster will be able to handle search results in the event of node failures. - -=== zkConnected - -A Solr node will return the results of a search request as long as it can communicate with at least one replica of every shard that it knows about, even if it can _not_ communicate with ZooKeeper at the time it receives the request. This is normally the preferred behavior from a fault tolerance standpoint, but may result in stale or incorrect results if there have been major changes to the collection structure that the node has not been informed of via ZooKeeper (i.e., shards may have been added or removed, or split into sub-shards) - -A `zkConnected` header is included in every search response indicating if the node that processed the request was connected with ZooKeeper at the time: - -.Solr Response with partialResults -[source,json] ----- -{ - "responseHeader": { - "status": 0, - "zkConnected": true, - "QTime": 20, - "params": { - "q": "*:*" - } - }, - "response": { - "numFound": 107, - "start": 0, - "docs": [ "..." 
] - } -} ----- - -=== shards.tolerant - -In the event that one or more shards queried are completely unavailable, then Solr's default behavior is to fail the request. However, there are many use-cases where partial results are acceptable and so Solr provides a boolean `shards.tolerant` parameter (default `false`). - -If `shards.tolerant=true` then partial results may be returned. If the returned response does not contain results from all the appropriate shards then the response header contains a special flag called `partialResults`. - -The client can specify '<>' along with the `shards.tolerant` parameter to retrieve more fine-grained details. - -Example response with `partialResults` flag set to 'true': - -*Solr Response with partialResults* - -[source,json] ----- -{ - "responseHeader": { - "status": 0, - "zkConnected": true, - "partialResults": true, - "QTime": 20, - "params": { - "q": "*:*" - } - }, - "response": { - "numFound": 77, - "start": 0, - "docs": [ "..." ] - } -} ----- +SolrCloud is highly available and fault tolerant in reads and writes. == Write Side Fault Tolerance diff --git a/solr/solr-ref-guide/src/solrcloud-resilience.adoc b/solr/solr-ref-guide/src/solrcloud-resilience.adoc new file mode 100644 index 00000000000..ae341d45908 --- /dev/null +++ b/solr/solr-ref-guide/src/solrcloud-resilience.adoc @@ -0,0 +1,28 @@ += SolrCloud Resilience +:page-shortname: solrcloud-resilience +:page-permalink: solrcloud-resilience.html +:page-children: solrcloud-recoveries-and-write-tolerance, solrcloud-query-routing-and-read-tolerance +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +In this section, we'll cover how does Solr handle reads and writes when all the nodes in the cluster are not healthy + +The following sections cover these topics: + +* <> +* <> + diff --git a/solr/solr-ref-guide/src/solrcloud.adoc b/solr/solr-ref-guide/src/solrcloud.adoc index 10b647fb273..b5dc33b8fb8 100644 --- a/solr/solr-ref-guide/src/solrcloud.adoc +++ b/solr/solr-ref-guide/src/solrcloud.adoc @@ -1,7 +1,7 @@ = SolrCloud :page-shortname: solrcloud :page-permalink: solrcloud.html -:page-children: getting-started-with-solrcloud, how-solrcloud-works, solrcloud-configuration-and-parameters, rule-based-replica-placement, cross-data-center-replication-cdcr, solrcloud-autoscaling +:page-children: getting-started-with-solrcloud, how-solrcloud-works, solrcloud-resilience, solrcloud-configuration-and-parameters, rule-based-replica-placement, cross-data-center-replication-cdcr, solrcloud-autoscaling // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information @@ -33,7 +33,9 @@ In this section, we'll cover everything you need to know about using Solr in Sol * <> ** <> ** <> -** <> +* <> +** <> +** <> * <> ** <> ** <> diff --git a/solr/solr-ref-guide/src/update-request-processors.adoc b/solr/solr-ref-guide/src/update-request-processors.adoc index 981c2e1dedf..f40f909d36a 100644 --- a/solr/solr-ref-guide/src/update-request-processors.adoc +++ b/solr/solr-ref-guide/src/update-request-processors.adoc @@ -142,7 +142,7 @@ However executing a processor only on the forwarding nodes is a great way of dis .Custom update chain post-processors may never be invoked on a recovering replica [WARNING] ==== -While a replica is in <>, inbound update requests are buffered to the transaction log. After recovery has completed successfully, those buffered update requests are replayed. As of this writing, however, custom update chain post-processors are never invoked for buffered update requests. See https://issues.apache.org/jira/browse/SOLR-8030[SOLR-8030]. To work around this problem until SOLR-8030 has been fixed, *avoid specifying post-processors in custom update chains*. +While a replica is in <>, inbound update requests are buffered to the transaction log. After recovery has completed successfully, those buffered update requests are replayed. As of this writing, however, custom update chain post-processors are never invoked for buffered update requests. See https://issues.apache.org/jira/browse/SOLR-8030[SOLR-8030]. To work around this problem until SOLR-8030 has been fixed, *avoid specifying post-processors in custom update chains*. ==== === Atomic Update Processor Factory diff --git a/solr/solr-ref-guide/src/updatehandlers-in-solrconfig.adoc b/solr/solr-ref-guide/src/updatehandlers-in-solrconfig.adoc index 43314574bbb..1d0a5cf0b5f 100644 --- a/solr/solr-ref-guide/src/updatehandlers-in-solrconfig.adoc +++ b/solr/solr-ref-guide/src/updatehandlers-in-solrconfig.adoc @@ -121,7 +121,7 @@ Realtime Get currently relies on the update log feature, which is enabled by def ---- -Three additional expert-level configuration settings affect indexing performance and how far a replica can fall behind on updates before it must enter into full recovery - see the section on <> for more information: +Three additional expert-level configuration settings affect indexing performance and how far a replica can fall behind on updates before it must enter into full recovery - see the section on <> for more information: `numRecordsToKeep`:: The number of update records to keep per log. The default is `100`. 
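The pages reorganized above document the `shards.tolerant` parameter and the `zkConnected`/`partialResults` response flags. A minimal SolrJ sketch of how a client might exercise them follows; the URL, collection, and query are hypothetical, and the flag values depend on the state of the cluster at query time:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.util.NamedList;

public class TolerantQueryExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical base URL and collection, for illustration only.
    try (HttpSolrClient client =
             new HttpSolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
      SolrQuery query = new SolrQuery("*:*");
      query.set("shards.tolerant", true);   // accept partial results if a shard is unavailable

      QueryResponse rsp = client.query(query);
      NamedList<Object> header = rsp.getResponseHeader();

      // zkConnected reports whether the serving node could reach ZooKeeper;
      // partialResults only appears when some shards did not contribute results.
      System.out.println("zkConnected=" + header.get("zkConnected"));
      System.out.println("partialResults=" + header.get("partialResults"));
      System.out.println("numFound=" + rsp.getResults().getNumFound());
    }
  }
}

When every shard responds, the header omits `partialResults`, so a null value here simply means the response was complete.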
From 0807708f50daba37beeb8490bda1600da588f46d Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 2 Sep 2017 05:39:49 -0400 Subject: [PATCH 20/44] remove unused imports to fix precommit --- .../src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java | 2 -- .../src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java | 1 - 2 files changed, 3 deletions(-) diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java index 290195f3144..417a26b0dc4 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/PrimaryNode.java @@ -30,11 +30,9 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.StandardDirectoryReader; -import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.SearcherFactory; import org.apache.lucene.search.SearcherManager; -import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.RAMFile; import org.apache.lucene.store.RAMOutputStream; diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java index d546c176a86..0996f4f6e06 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java +++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java @@ -33,7 +33,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.SegmentInfos; From cd471cc98dcee4f587739b2288e4e120f8c54808 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 2 Sep 2017 05:41:06 -0400 Subject: [PATCH 21/44] LUCENE-7933: relax test to also pass on 32 bit JVMs --- .../test/org/apache/lucene/util/TestLongBitSet.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java index f94c97eca28..5a748551dc2 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestLongBitSet.java @@ -225,7 +225,7 @@ public class TestLongBitSet extends LuceneTestCase { () -> { new LongBitSet(LongBitSet.MAX_NUM_BITS + 1); }); - assertEquals("numBits must be 0 .. 137438952384; got: 137438952385", e.getMessage()); + assertTrue(e.getMessage().startsWith("numBits must be 0 .. ")); } public void testNegativeNumBits() { @@ -233,7 +233,7 @@ public class TestLongBitSet extends LuceneTestCase { () -> { new LongBitSet(-17); }); - assertEquals("numBits must be 0 .. 137438952384; got: -17", e.getMessage()); + assertTrue(e.getMessage().startsWith("numBits must be 0 .. ")); } public void testSmallBitSets() { @@ -360,7 +360,9 @@ public class TestLongBitSet extends LuceneTestCase { // ... assertEquals(1 << (32-6), LongBitSet.bits2words(1L << 32)); assertEquals((1 << (32-6)) + 1, LongBitSet.bits2words((1L << 32)) + 1); - // ... 
- assertEquals(2147483631, LongBitSet.bits2words(LongBitSet.MAX_NUM_BITS)); + + // ensure the claimed max num_bits doesn't throw exc; we can't enforce exact values here + // because the value variees with JVM: + assertTrue(LongBitSet.bits2words(LongBitSet.MAX_NUM_BITS) > 0); } } From c2c2e8a85e92024d627381858cd1dbcff4cbab72 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Sat, 2 Sep 2017 14:43:59 +0200 Subject: [PATCH 22/44] LUCENE-7950 - fixed potential NPE when no docs have the class field --- .../SimpleNaiveBayesDocumentClassifier.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java index 21ad7d134a4..6bc8573c094 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java @@ -113,24 +113,26 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi Map> fieldName2tokensArray = new LinkedHashMap<>(); Map fieldName2boost = new LinkedHashMap<>(); Terms classes = MultiFields.getTerms(indexReader, classFieldName); - TermsEnum classesEnum = classes.iterator(); - BytesRef c; + if (classes != null) { + TermsEnum classesEnum = classes.iterator(); + BytesRef c; - analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost); + analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost); - int docsWithClassSize = countDocsWithClass(); - while ((c = classesEnum.next()) != null) { - double classScore = 0; - Term term = new Term(this.classFieldName, c); - for (String fieldName : textFieldNames) { - List tokensArrays = fieldName2tokensArray.get(fieldName); - double fieldScore = 0; - for (String[] fieldTokensArray : tokensArrays) { - fieldScore += calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize) * fieldName2boost.get(fieldName); + int docsWithClassSize = countDocsWithClass(); + while ((c = classesEnum.next()) != null) { + double classScore = 0; + Term term = new Term(this.classFieldName, c); + for (String fieldName : textFieldNames) { + List tokensArrays = fieldName2tokensArray.get(fieldName); + double fieldScore = 0; + for (String[] fieldTokensArray : tokensArrays) { + fieldScore += calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize) * fieldName2boost.get(fieldName); + } + classScore += fieldScore; } - classScore += fieldScore; + assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore)); } - assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore)); } return normClassificationResults(assignedClasses); } From 40dddf9324e549e14132b9cd957ed07c7aca5c96 Mon Sep 17 00:00:00 2001 From: "Md. 
Abdulla-Al-Sun" Date: Sun, 3 Sep 2017 01:53:15 +0600 Subject: [PATCH 23/44] LUCENE-7940: added missing boundary case for ba phaala normalization --- .../java/org/apache/lucene/analysis/bn/BengaliNormalizer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java index 057fbb5ee85..1718cbce70f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java @@ -104,7 +104,7 @@ public class BengaliNormalizer { len = delete(s, i, len); len = delete(s, i-1, len); i -=2; - } else { + } else if(i - 2 >= 0){ s[i - 1] = s[i - 2]; len = delete(s, i, len); i --; From 3901a13de3aa2ce172fea244f84ab613b3314041 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Sun, 3 Sep 2017 16:59:31 -0400 Subject: [PATCH 24/44] SOLR-11242: QueryParser: re-use the LookaheadSuccess exception. --- solr/CHANGES.txt | 2 ++ solr/core/build.xml | 5 +++++ solr/core/src/java/org/apache/solr/parser/QueryParser.java | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 493d52f2ac4..4ccc609ce31 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -128,6 +128,8 @@ Optimizations * SOLR-11124: MoveReplicaCmd should skip deleting old replica in case of its node is not live (Cao Manh Dat) +* SOLR-11242: QueryParser: re-use the LookaheadSuccess exception. (Michael Braun via David Smiley) + Other Changes ---------------------- diff --git a/solr/core/build.xml b/solr/core/build.xml index 532a3c22d4a..af4c262dbd8 100644 --- a/solr/core/build.xml +++ b/solr/core/build.xml @@ -83,6 +83,11 @@ byline="true" match="public QueryParser\(QueryParserTokenManager " replace="protected QueryParser(QueryParserTokenManager "/> + + diff --git a/solr/core/src/java/org/apache/solr/parser/QueryParser.java b/solr/core/src/java/org/apache/solr/parser/QueryParser.java index 42e982fa26d..50e36dd916a 100644 --- a/solr/core/src/java/org/apache/solr/parser/QueryParser.java +++ b/solr/core/src/java/org/apache/solr/parser/QueryParser.java @@ -767,7 +767,7 @@ public class QueryParser extends SolrQueryParserBase implements QueryParserConst } static private final class LookaheadSuccess extends java.lang.Error { } - final private LookaheadSuccess jj_ls = new LookaheadSuccess(); + static final private LookaheadSuccess jj_ls = new LookaheadSuccess(); private boolean jj_scan_token(int kind) { if (jj_scanpos == jj_lastpos) { jj_la--; From e782082e711286a4c1a6ca101a9fa11bafab7b0d Mon Sep 17 00:00:00 2001 From: Shalin Shekhar Mangar Date: Mon, 4 Sep 2017 09:42:56 +0530 Subject: [PATCH 25/44] SOLR-11278: Disable frequently failing method with AwaitsFix --- solr/core/src/test/org/apache/solr/cloud/CdcrBootstrapTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/solr/core/src/test/org/apache/solr/cloud/CdcrBootstrapTest.java b/solr/core/src/test/org/apache/solr/cloud/CdcrBootstrapTest.java index 6959bd825d5..a5b37d8a5d0 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CdcrBootstrapTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CdcrBootstrapTest.java @@ -239,6 +239,7 @@ public class CdcrBootstrapTest extends SolrTestCaseJ4 { } } + @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11278") public void testBootstrapWithContinousIndexingOnSourceCluster() throws Exception { // start 
the target first so that we know its zkhost MiniSolrCloudCluster target = new MiniSolrCloudCluster(1, createTempDir("cdcr-target"), buildJettyConfig("/solr")); From 3423ae4b920ef51d685c9efc7f2cad390d987b87 Mon Sep 17 00:00:00 2001 From: Joel Bernstein Date: Mon, 4 Sep 2017 18:40:03 -0400 Subject: [PATCH 26/44] SOLR-11321: Add ebeAdd, ebeSubtract, ebeDivide, ebeMultiply, dotProduct and cosineSimilarity Stream Evaluators --- .../apache/solr/handler/StreamHandler.java | 85 +----------- .../io/eval/CosineSimilarityEvaluator.java | 67 ++++++++++ .../solrj/io/eval/DotProductEvaluator.java | 58 ++++++++ .../client/solrj/io/eval/EBEAddEvaluator.java | 63 +++++++++ .../solrj/io/eval/EBEDivideEvaluator.java | 63 +++++++++ .../solrj/io/eval/EBEMultiplyEvaluator.java | 63 +++++++++ .../solrj/io/eval/EBESubtractEvaluator.java | 63 +++++++++ .../solrj/io/stream/StreamExpressionTest.java | 124 ++++++++++++++++++ 8 files changed, 508 insertions(+), 78 deletions(-) create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/CosineSimilarityEvaluator.java create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DotProductEvaluator.java create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEAddEvaluator.java create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEDivideEvaluator.java create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEMultiplyEvaluator.java create mode 100644 solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBESubtractEvaluator.java diff --git a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java index 8b6a6c0225d..9613ec33a06 100644 --- a/solr/core/src/java/org/apache/solr/handler/StreamHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/StreamHandler.java @@ -33,84 +33,7 @@ import org.apache.solr.client.solrj.io.ModelCache; import org.apache.solr.client.solrj.io.SolrClientCache; import org.apache.solr.client.solrj.io.Tuple; import org.apache.solr.client.solrj.io.comp.StreamComparator; -import org.apache.solr.client.solrj.io.eval.AbsoluteValueEvaluator; -import org.apache.solr.client.solrj.io.eval.AddEvaluator; -import org.apache.solr.client.solrj.io.eval.AndEvaluator; -import org.apache.solr.client.solrj.io.eval.AnovaEvaluator; -import org.apache.solr.client.solrj.io.eval.AppendEvaluator; -import org.apache.solr.client.solrj.io.eval.ArcCosineEvaluator; -import org.apache.solr.client.solrj.io.eval.ArcSineEvaluator; -import org.apache.solr.client.solrj.io.eval.ArcTangentEvaluator; -import org.apache.solr.client.solrj.io.eval.ArrayEvaluator; -import org.apache.solr.client.solrj.io.eval.AscEvaluator; -import org.apache.solr.client.solrj.io.eval.CeilingEvaluator; -import org.apache.solr.client.solrj.io.eval.CoalesceEvaluator; -import org.apache.solr.client.solrj.io.eval.ColumnEvaluator; -import org.apache.solr.client.solrj.io.eval.ConversionEvaluator; -import org.apache.solr.client.solrj.io.eval.ConvolutionEvaluator; -import org.apache.solr.client.solrj.io.eval.CopyOfEvaluator; -import org.apache.solr.client.solrj.io.eval.CopyOfRangeEvaluator; -import org.apache.solr.client.solrj.io.eval.CorrelationEvaluator; -import org.apache.solr.client.solrj.io.eval.CosineEvaluator; -import org.apache.solr.client.solrj.io.eval.CovarianceEvaluator; -import org.apache.solr.client.solrj.io.eval.CubedRootEvaluator; -import org.apache.solr.client.solrj.io.eval.CumulativeProbabilityEvaluator; -import 
org.apache.solr.client.solrj.io.eval.DescribeEvaluator; -import org.apache.solr.client.solrj.io.eval.DivideEvaluator; -import org.apache.solr.client.solrj.io.eval.EmpiricalDistributionEvaluator; -import org.apache.solr.client.solrj.io.eval.EqualToEvaluator; -import org.apache.solr.client.solrj.io.eval.EuclideanDistanceEvaluator; -import org.apache.solr.client.solrj.io.eval.ExclusiveOrEvaluator; -import org.apache.solr.client.solrj.io.eval.FindDelayEvaluator; -import org.apache.solr.client.solrj.io.eval.FloorEvaluator; -import org.apache.solr.client.solrj.io.eval.GreaterThanEqualToEvaluator; -import org.apache.solr.client.solrj.io.eval.GreaterThanEvaluator; -import org.apache.solr.client.solrj.io.eval.HistogramEvaluator; -import org.apache.solr.client.solrj.io.eval.HyperbolicCosineEvaluator; -import org.apache.solr.client.solrj.io.eval.HyperbolicSineEvaluator; -import org.apache.solr.client.solrj.io.eval.HyperbolicTangentEvaluator; -import org.apache.solr.client.solrj.io.eval.IfThenElseEvaluator; -import org.apache.solr.client.solrj.io.eval.KolmogorovSmirnovEvaluator; -import org.apache.solr.client.solrj.io.eval.LengthEvaluator; -import org.apache.solr.client.solrj.io.eval.LessThanEqualToEvaluator; -import org.apache.solr.client.solrj.io.eval.LessThanEvaluator; -import org.apache.solr.client.solrj.io.eval.ModuloEvaluator; -import org.apache.solr.client.solrj.io.eval.MovingAverageEvaluator; -import org.apache.solr.client.solrj.io.eval.MultiplyEvaluator; -import org.apache.solr.client.solrj.io.eval.NaturalLogEvaluator; -import org.apache.solr.client.solrj.io.eval.NormalDistributionEvaluator; -import org.apache.solr.client.solrj.io.eval.NormalizeEvaluator; -import org.apache.solr.client.solrj.io.eval.NotEvaluator; -import org.apache.solr.client.solrj.io.eval.OrEvaluator; -import org.apache.solr.client.solrj.io.eval.PercentileEvaluator; -import org.apache.solr.client.solrj.io.eval.PowerEvaluator; -import org.apache.solr.client.solrj.io.eval.PredictEvaluator; -import org.apache.solr.client.solrj.io.eval.RankEvaluator; -import org.apache.solr.client.solrj.io.eval.RawValueEvaluator; -import org.apache.solr.client.solrj.io.eval.RegressionEvaluator; -import org.apache.solr.client.solrj.io.eval.ResidualsEvaluator; -import org.apache.solr.client.solrj.io.eval.ReverseEvaluator; -import org.apache.solr.client.solrj.io.eval.RoundEvaluator; -import org.apache.solr.client.solrj.io.eval.SampleEvaluator; -import org.apache.solr.client.solrj.io.eval.ScaleEvaluator; -import org.apache.solr.client.solrj.io.eval.SequenceEvaluator; -import org.apache.solr.client.solrj.io.eval.SineEvaluator; -import org.apache.solr.client.solrj.io.eval.SquareRootEvaluator; -import org.apache.solr.client.solrj.io.eval.SubtractEvaluator; -import org.apache.solr.client.solrj.io.eval.TangentEvaluator; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorDay; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorDayOfQuarter; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorDayOfYear; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorEpoch; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorHour; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorMinute; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorMonth; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorQuarter; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorSecond; -import org.apache.solr.client.solrj.io.eval.TemporalEvaluatorWeek; -import 
org.apache.solr.client.solrj.io.eval.TemporalEvaluatorYear; -import org.apache.solr.client.solrj.io.eval.UniformDistributionEvaluator; -import org.apache.solr.client.solrj.io.eval.UuidEvaluator; +import org.apache.solr.client.solrj.io.eval.*; import org.apache.solr.client.solrj.io.graph.GatherNodesStream; import org.apache.solr.client.solrj.io.graph.ShortestPathStream; import org.apache.solr.client.solrj.io.ops.ConcatOperation; @@ -352,6 +275,12 @@ public class StreamHandler extends RequestHandlerBase implements SolrCoreAware, .withFunctionName("ks", KolmogorovSmirnovEvaluator.class) .withFunctionName("asc", AscEvaluator.class) .withFunctionName("cumulativeProbability", CumulativeProbabilityEvaluator.class) + .withFunctionName("ebeAdd", EBEAddEvaluator.class) + .withFunctionName("ebeSubtract", EBESubtractEvaluator.class) + .withFunctionName("ebeMultiply", EBEMultiplyEvaluator.class) + .withFunctionName("ebeDivide", EBEDivideEvaluator.class) + .withFunctionName("dotProduct", DotProductEvaluator.class) + .withFunctionName("cosineSimilarity", CosineSimilarityEvaluator.class) // Boolean Stream Evaluators .withFunctionName("and", AndEvaluator.class) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/CosineSimilarityEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/CosineSimilarityEvaluator.java new file mode 100644 index 00000000000..ea88400b7a6 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/CosineSimilarityEvaluator.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.List; +import java.util.Locale; + +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class CosineSimilarityEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public CosineSimilarityEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + double[] d1 = ((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(); + double[] d2 = ((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(); + + return cosineSimilarity(d1, d2); + } + + private double cosineSimilarity(double[] vectorA, double[] vectorB) { + double dotProduct = 0.0; + double normA = 0.0; + double normB = 0.0; + for (int i = 0; i < vectorA.length; i++) { + dotProduct += vectorA[i] * vectorB[i]; + normA += Math.pow(vectorA[i], 2); + normB += Math.pow(vectorB[i], 2); + } + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); + } + +} diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DotProductEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DotProductEvaluator.java new file mode 100644 index 00000000000..3133bac7d1c --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DotProductEvaluator.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.math3.linear.RealVector; +import org.apache.commons.math3.linear.ArrayRealVector; + +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class DotProductEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public DotProductEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + RealVector v = new ArrayRealVector(((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray()); + RealVector v2 = new ArrayRealVector(((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray()); + + return v.dotProduct(v2); + + } +} diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEAddEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEAddEvaluator.java new file mode 100644 index 00000000000..c1eec9b71a3 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEAddEvaluator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.math3.util.MathArrays; +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class EBEAddEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public EBEAddEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + double[] result = MathArrays.ebeAdd( + ((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(), + ((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray() + ); + + List numbers = new ArrayList(); + for(double d : result) { + numbers.add(d); + } + + return numbers; + } +} diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEDivideEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEDivideEvaluator.java new file mode 100644 index 00000000000..c457f68795a --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEDivideEvaluator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.math3.util.MathArrays; +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class EBEDivideEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public EBEDivideEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + double[] result = MathArrays.ebeDivide( + ((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(), + ((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray() + ); + + List numbers = new ArrayList(); + for(double d : result) { + numbers.add(d); + } + + return numbers; + } +} diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEMultiplyEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEMultiplyEvaluator.java new file mode 100644 index 00000000000..b3617cdd37f --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBEMultiplyEvaluator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.math3.util.MathArrays; +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class EBEMultiplyEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public EBEMultiplyEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + double[] result = MathArrays.ebeMultiply( + ((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(), + ((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray() + ); + + List numbers = new ArrayList(); + for(double d : result) { + numbers.add(d); + } + + return numbers; + } +} diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBESubtractEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBESubtractEvaluator.java new file mode 100644 index 00000000000..2f2f0223bb3 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/EBESubtractEvaluator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.client.solrj.io.eval; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.math3.util.MathArrays; +import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; +import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; + +public class EBESubtractEvaluator extends RecursiveNumericEvaluator implements TwoValueWorker { + protected static final long serialVersionUID = 1L; + + public EBESubtractEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{ + super(expression, factory); + } + + @Override + public Object doWork(Object first, Object second) throws IOException{ + if(null == first){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the first value",toExpression(constructingFactory))); + } + if(null == second){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - null found for the second value",toExpression(constructingFactory))); + } + if(!(first instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the first value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + if(!(second instanceof List)){ + throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - found type %s for the second value, expecting a list of numbers",toExpression(constructingFactory), first.getClass().getSimpleName())); + } + + double[] result = MathArrays.ebeSubtract( + ((List) first).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray(), + ((List) second).stream().mapToDouble(value -> ((BigDecimal) value).doubleValue()).toArray() + ); + + List numbers = new ArrayList(); + for(double d : result) { + numbers.add(d); + } + + return numbers; + } +} diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java index 670b39d4fd0..f831daca740 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/StreamExpressionTest.java @@ -6064,6 +6064,130 @@ public class StreamExpressionTest extends SolrCloudTestCase { } } + @Test + public void testEBESubtract() throws Exception { + String cexpr = "ebeSubtract(array(2,4,6,8,10,12),array(1,2,3,4,5,6))"; + ModifiableSolrParams paramsLoc = new ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + List out = (List)tuples.get(0).get("return-value"); + assertTrue(out.size() == 6); + assertTrue(out.get(0).intValue() == 1); + assertTrue(out.get(1).intValue() == 2); + assertTrue(out.get(2).intValue() == 3); + assertTrue(out.get(3).intValue() == 4); + assertTrue(out.get(4).intValue() == 5); + assertTrue(out.get(5).intValue() == 6); + } + + + @Test + public void testEBEMultiply() throws Exception { + String cexpr = "ebeMultiply(array(2,4,6,8,10,12),array(1,2,3,4,5,6))"; + ModifiableSolrParams paramsLoc = new 
ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + List out = (List)tuples.get(0).get("return-value"); + assertTrue(out.size() == 6); + assertTrue(out.get(0).intValue() == 2); + assertTrue(out.get(1).intValue() == 8); + assertTrue(out.get(2).intValue() == 18); + assertTrue(out.get(3).intValue() == 32); + assertTrue(out.get(4).intValue() == 50); + assertTrue(out.get(5).intValue() == 72); + } + + + @Test + public void testEBEAdd() throws Exception { + String cexpr = "ebeAdd(array(2,4,6,8,10,12),array(1,2,3,4,5,6))"; + ModifiableSolrParams paramsLoc = new ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + List out = (List)tuples.get(0).get("return-value"); + assertTrue(out.size() == 6); + assertTrue(out.get(0).intValue() == 3); + assertTrue(out.get(1).intValue() == 6); + assertTrue(out.get(2).intValue() == 9); + assertTrue(out.get(3).intValue() == 12); + assertTrue(out.get(4).intValue() == 15); + assertTrue(out.get(5).intValue() == 18); + } + + + @Test + public void testEBEDivide() throws Exception { + String cexpr = "ebeDivide(array(2,4,6,8,10,12),array(1,2,3,4,5,6))"; + ModifiableSolrParams paramsLoc = new ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + List out = (List)tuples.get(0).get("return-value"); + assertTrue(out.size() == 6); + assertTrue(out.get(0).intValue() == 2); + assertTrue(out.get(1).intValue() == 2); + assertTrue(out.get(2).intValue() == 2); + assertTrue(out.get(3).intValue() == 2); + assertTrue(out.get(4).intValue() == 2); + assertTrue(out.get(5).intValue() == 2); + } + + @Test + public void testCosineSimilarity() throws Exception { + String cexpr = "cosineSimilarity(array(2,4,6,8),array(1,1,3,4))"; + ModifiableSolrParams paramsLoc = new ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + Number cs = (Number)tuples.get(0).get("return-value"); + assertTrue(cs.doubleValue() == 0.9838197164968291); + } + + + + @Test + public void testDotProduct() throws Exception { + String cexpr = "dotProduct(array(2,4,6,8,10,12),array(1,2,3,4,5,6))"; + ModifiableSolrParams paramsLoc = new ModifiableSolrParams(); + paramsLoc.set("expr", cexpr); + paramsLoc.set("qt", "/stream"); + 
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS; + TupleStream solrStream = new SolrStream(url, paramsLoc); + StreamContext context = new StreamContext(); + solrStream.setStreamContext(context); + List tuples = getTuples(solrStream); + assertTrue(tuples.size() == 1); + Number dotProduct = (Number)tuples.get(0).get("return-value"); + assertTrue(dotProduct.doubleValue()== 182); + } @Test From 33178fb5ccd43fa0c54d2df3760d3362c2700a28 Mon Sep 17 00:00:00 2001 From: yonik Date: Mon, 4 Sep 2017 18:58:31 -0400 Subject: [PATCH 27/44] SOLR-10613: add flag to FunctionQParser to use FieldNameValueSource place holder when encountering a field name --- .../apache/solr/search/FunctionQParser.java | 11 +++- .../apache/solr/search/ValueSourceParser.java | 4 +- .../apache/solr/search/facet/MinMaxAgg.java | 12 +++- .../search/function/FieldNameValueSource.java | 60 +++++++++++++++++++ 4 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/search/function/FieldNameValueSource.java diff --git a/solr/core/src/java/org/apache/solr/search/FunctionQParser.java b/solr/core/src/java/org/apache/solr/search/FunctionQParser.java index 7e6a706403d..d3a311d936a 100644 --- a/solr/core/src/java/org/apache/solr/search/FunctionQParser.java +++ b/solr/core/src/java/org/apache/solr/search/FunctionQParser.java @@ -25,6 +25,7 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.facet.AggValueSource; +import org.apache.solr.search.function.FieldNameValueSource; import java.util.ArrayList; import java.util.List; @@ -33,6 +34,7 @@ public class FunctionQParser extends QParser { public static final int FLAG_CONSUME_DELIMITER = 0x01; // consume delimiter after parsing arg public static final int FLAG_IS_AGG = 0x02; + public static final int FLAG_USE_FIELDNAME_SOURCE = 0x04; // When a field name is encountered, use the placeholder FieldNameValueSource instead of resolving to a real ValueSource public static final int FLAG_DEFAULT = FLAG_CONSUME_DELIMITER; /** @lucene.internal */ @@ -374,8 +376,13 @@ public class FunctionQParser extends QParser { } else if ("false".equals(id)) { valueSource = new BoolConstValueSource(false); } else { - SchemaField f = req.getSchema().getField(id); - valueSource = f.getType().getValueSource(f, this); + if ((flags & FLAG_USE_FIELDNAME_SOURCE) != 0) { + // Don't try to create a ValueSource for the field, just use a placeholder. 
+ valueSource = new FieldNameValueSource(id); + } else { + SchemaField f = req.getSchema().getField(id); + valueSource = f.getType().getValueSource(f, this); + } } } diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java index 7d6d162ce1a..51048d2aa6b 100644 --- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java +++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java @@ -1017,14 +1017,14 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin { addParser("agg_min", new ValueSourceParser() { @Override public ValueSource parse(FunctionQParser fp) throws SyntaxError { - return new MinMaxAgg("min", fp.parseValueSource()); + return new MinMaxAgg("min", fp.parseValueSource(FunctionQParser.FLAG_DEFAULT | FunctionQParser.FLAG_USE_FIELDNAME_SOURCE)); } }); addParser("agg_max", new ValueSourceParser() { @Override public ValueSource parse(FunctionQParser fp) throws SyntaxError { - return new MinMaxAgg("max", fp.parseValueSource()); + return new MinMaxAgg("max", fp.parseValueSource(FunctionQParser.FLAG_DEFAULT | FunctionQParser.FLAG_USE_FIELDNAME_SOURCE)); } }); diff --git a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java index a6d6b9744b5..9569599c458 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java @@ -28,6 +28,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.StrFieldSource; +import org.apache.solr.search.function.FieldNameValueSource; public class MinMaxAgg extends SimpleAggValueSource { final int minmax; // a multiplier to reverse the normal order of compare if this is max instead of min (i.e. max will be -1) @@ -41,9 +42,16 @@ public class MinMaxAgg extends SimpleAggValueSource { public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) throws IOException { ValueSource vs = getArg(); + SchemaField sf = null; + + if (vs instanceof FieldNameValueSource) { + String field = ((FieldNameValueSource)vs).getFieldName(); + sf = fcontext.qcontext.searcher().getSchema().getField(field); + + vs = sf.getType().getValueSource(sf, null); // temporary implementation to make existing code work + } + if (vs instanceof StrFieldSource) { - String field = ((StrFieldSource) vs).getField(); - SchemaField sf = fcontext.qcontext.searcher().getSchema().getField(field); if (sf.multiValued() || sf.getType().multiValuedFieldCache()) { if (sf.hasDocValues()) { // dv diff --git a/solr/core/src/java/org/apache/solr/search/function/FieldNameValueSource.java b/solr/core/src/java/org/apache/solr/search/function/FieldNameValueSource.java new file mode 100644 index 00000000000..c122dbb611e --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/function/FieldNameValueSource.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search.function; + + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; + +/** Placeholder value source. + * @lucene.internal */ +public class FieldNameValueSource extends ValueSource { + private String fieldName; + + public FieldNameValueSource(String fieldName) { + this.fieldName = fieldName; + } + + public String getFieldName() { + return fieldName; + } + + @Override + public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException { + throw new UnsupportedOperationException("FieldNameValueSource should not be directly used: " + this); + } + + @Override + public boolean equals(Object o) { + return o instanceof FieldNameValueSource && fieldName.equals(((FieldNameValueSource)o).getFieldName()); + } + + @Override + public int hashCode() { + return fieldName.hashCode(); + } + + @Override + public String description() { + return "FIELDNAME(" + fieldName + ")"; + } +} From c6a5f10fd364a0c63477348560e0c6eb1a742d3e Mon Sep 17 00:00:00 2001 From: yonik Date: Mon, 4 Sep 2017 19:19:40 -0400 Subject: [PATCH 28/44] SOLR-11322: omit val for min/max when no values in field for bucket --- solr/CHANGES.txt | 4 ++++ .../java/org/apache/solr/search/facet/MinMaxAgg.java | 10 ++++++++++ .../org/apache/solr/search/facet/TestJsonFacets.java | 4 ++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4ccc609ce31..56b55c95762 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -183,6 +183,10 @@ Other Changes * SOLR-11209: Upgrade HttpClient to 4.5.3. (Hrishikesh Gadre via Mark Miller) +* SOLR-11322: JSON Facet API: instead of returning NaN, min & max aggregations omit + the value for any bucket with no values in the numeric field. (yonik) + + ================== 7.0.0 ================== Versions of Major Components diff --git a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java index 9569599c458..0f4bea6f271 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java @@ -137,6 +137,16 @@ public class MinMaxAgg extends SimpleAggValueSource { result[slotNum] = val; } } + + @Override + public Object getValue(int slot) { + double val = result[slot]; + if (Double.isNaN(val)) { + return null; + } else { + return val; + } + } } diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index 3ee069f6a75..5ecd3a1a85c 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -1019,8 +1019,8 @@ public class TestJsonFacets extends SolrTestCaseHS { ",sum1:0.0," + " sumsq1:0.0," + " avg1:0.0," + // TODO: undesirable. omit? - " min1:'NaN'," + // TODO: undesirable. omit? 
- " max1:'NaN'," + + // " min1:'NaN'," + + // " max1:'NaN'," + " numwhere:0," + " unique_num_i:0," + " unique_num_d:0," + From 5436395325e7748f35f771a3110c5671a9b64d14 Mon Sep 17 00:00:00 2001 From: Erick Date: Mon, 4 Sep 2017 19:05:24 -0700 Subject: [PATCH 29/44] commit 19ec48d8a22461fb5723d9e3b81e87a59f3337a3 Author: Erick Date: Mon Sep 4 19:00:12 2017 -0700 SOLR-10101: TestLazyCores hangs --- solr/CHANGES.txt | 2 ++ solr/core/src/test/org/apache/solr/core/TestLazyCores.java | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 56b55c95762..d1c3b3fe0cb 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -120,6 +120,8 @@ Bug Fixes * SOLR-11293: Potential data loss in TLOG replicas after replication failures (noble) +* SOLR-10101: TestLazyCores hangs (Erick Erickson) + Optimizations ---------------------- diff --git a/solr/core/src/test/org/apache/solr/core/TestLazyCores.java b/solr/core/src/test/org/apache/solr/core/TestLazyCores.java index 4c50480af0d..6a5697a93f4 100644 --- a/solr/core/src/test/org/apache/solr/core/TestLazyCores.java +++ b/solr/core/src/test/org/apache/solr/core/TestLazyCores.java @@ -781,7 +781,6 @@ public class TestLazyCores extends SolrTestCaseJ4 { } } - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-10101") // Insure that when a core is aged out of the transient cache, any uncommitted docs are preserved. // Note, this needs FS-based indexes to persist! // Cores 2, 3, 6, 7, 8, 9 are transient @@ -814,7 +813,8 @@ public class TestLazyCores extends SolrTestCaseJ4 { openCores.clear(); // We still should have 6, 7, 8, 9 loaded, their reference counts have NOT dropped to zero - checkInCores(cc, "collection6", "collection7", "collection8", "collection9"); + checkInCores(cc, "collection1", "collection5", + "collection6", "collection7", "collection8", "collection9"); for (String coreName : coreList) { // The point of this test is to insure that when cores are aged out and re-opened From f4b13e86ff537a636b1018f2a89de446698886ab Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Tue, 5 Sep 2017 10:00:03 -0400 Subject: [PATCH 30/44] SOLR-11315: Finish the job of removing trie fields from the 'files' example schema --- solr/CHANGES.txt | 2 +- solr/example/files/conf/managed-schema | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d1c3b3fe0cb..d0a58243690 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -796,7 +796,7 @@ Other Changes * SOLR-10494: Make default response format JSON (wt=json), and also indent text responses formats (indent=on) by default (Trey Grainger & Cassandra Targett via hossman) -* SOLR-10760: Remove trie field types and fields from example schemas. (Steve Rowe) +* SOLR-10760,SOLR-11315: Remove trie field types and fields from example schemas. 
(Steve Rowe) * SOLR-11056: Add random range query test that compares results across Trie*, *Point and DocValue-only fields (Tomás Fernández Löbbe) diff --git a/solr/example/files/conf/managed-schema b/solr/example/files/conf/managed-schema index 97742740076..d9f45385d55 100644 --- a/solr/example/files/conf/managed-schema +++ b/solr/example/files/conf/managed-schema @@ -442,7 +442,7 @@ - + @@ -482,7 +482,7 @@ - + @@ -493,10 +493,6 @@ - - - - From b4a1a1a87b5489839d6ce0e14fb188b5fcb566f9 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Tue, 5 Sep 2017 10:13:58 -0400 Subject: [PATCH 31/44] LUCENE-7891: use a non-buggy LRU cache in Lucene's taxonomy facets, by default --- lucene/CHANGES.txt | 3 ++ .../writercache/LruTaxonomyWriterCache.java | 20 +++++--- .../TestLruTaxonomyWriterCache.java | 50 +++++++++++++++++++ 3 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f6a37ecd747..a819916f275 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -52,6 +52,9 @@ Bug Fixes not recommended, lucene-analyzers-icu contains binary data structures specific to ICU/Unicode versions it is built against. (Chris Koenig, Robert Muir) +* LUCENE-7891: Lucene's taxonomy facets now uses a non-buggy LRU cache + by default. (Jan-Willem van den Broek via Mike McCandless) + Build * SOLR-11181: Switch order of maven artifact publishing procedure: deploy first diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java index 828e2b6df3a..6dc8cd23724 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java @@ -32,8 +32,12 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache { * function, LRU_STRING should be used. */ public enum LRUType { - /** Use the label's hash as the key; this can lead to - * silent conflicts! */ + /** Use only the label's 64 bit longHashCode as the hash key. Do not + * check equals, unlike most hash maps. + * Note that while these hashes are very likely to be unique, the chance + * of a collision is still greater than zero. If such an unlikely event + * occurs, your document will get an incorrect facet. + */ LRU_HASHED, /** Use the label as the hash key; this is always @@ -43,15 +47,15 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache { private NameIntCacheLRU cache; - /** Creates this with {@link LRUType#LRU_HASHED} method. */ + /** Creates this with {@link LRUType#LRU_STRING} method. */ public LruTaxonomyWriterCache(int cacheSize) { // TODO (Facet): choose between NameHashIntCacheLRU and NameIntCacheLRU. // For guaranteed correctness - not relying on no-collisions in the hash // function, NameIntCacheLRU should be used: // On the other hand, NameHashIntCacheLRU takes less RAM but if there - // are collisions (which we never found) two different paths would be - // mapped to the same ordinal... - this(cacheSize, LRUType.LRU_HASHED); + // are collisions two different paths would be mapped to the same + // ordinal... + this(cacheSize, LRUType.LRU_STRING); } /** Creates this with the specified method. 
*/ @@ -60,8 +64,8 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache { // For guaranteed correctness - not relying on no-collisions in the hash // function, NameIntCacheLRU should be used: // On the other hand, NameHashIntCacheLRU takes less RAM but if there - // are collisions (which we never found) two different paths would be - // mapped to the same ordinal... + // are collisions two different paths would be mapped to the same + // ordinal... if (lruType == LRUType.LRU_HASHED) { this.cache = new NameHashIntCacheLRU(cacheSize); } else { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java new file mode 100644 index 00000000000..972b2965525 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.facet.taxonomy.writercache; + +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.junit.Test; + +public class TestLruTaxonomyWriterCache extends FacetTestCase { + + @Test + public void testDefaultLRUTypeIsCollisionSafe() { + // These labels are clearly different, but have identical longHashCodes. + // Note that these labels are clearly contrived. We did encounter + // collisions in actual production data, but we aren't allowed to publish + // those. + final FacetLabel a = new FacetLabel("\0", "\u0003\uFFE2"); + final FacetLabel b = new FacetLabel("\1", "\0"); + // If this fails, then the longHashCode implementation has changed. This + // cannot prevent collisions. (All hashes must allow for collisions.) It + // will however stop the rest of this test from making sense. To fix, find + // new colliding labels, or make a subclass of FacetLabel that produces + // collisions. + assertEquals(a.longHashCode(), b.longHashCode()); + // Make a cache with capacity > 2 so both our labels will fit. Don't + // specify an LRUType, since we want to check if the default is + // collision-safe. 
+ final LruTaxonomyWriterCache cache = new LruTaxonomyWriterCache(10); + cache.put(a, 0); + cache.put(b, 1); + assertEquals(cache.get(a), 0); + assertEquals(cache.get(b), 1); + } + +} From 810ae50e7e9d4ab6afe60eca5daf9c6d74afa287 Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Tue, 5 Sep 2017 10:22:20 -0400 Subject: [PATCH 32/44] SOLR-11313: Finish the job of removing trie fields from the DIH example schemas --- solr/CHANGES.txt | 2 +- solr/example/example-DIH/solr/db/conf/managed-schema | 2 +- solr/example/example-DIH/solr/mail/conf/managed-schema | 2 +- solr/example/example-DIH/solr/solr/conf/managed-schema | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d0a58243690..69c414d3937 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -796,7 +796,7 @@ Other Changes * SOLR-10494: Make default response format JSON (wt=json), and also indent text responses formats (indent=on) by default (Trey Grainger & Cassandra Targett via hossman) -* SOLR-10760,SOLR-11315: Remove trie field types and fields from example schemas. (Steve Rowe) +* SOLR-10760,SOLR-11315,SOLR-11313: Remove trie field types and fields from example schemas. (Steve Rowe) * SOLR-11056: Add random range query test that compares results across Trie*, *Point and DocValue-only fields (Tomás Fernández Löbbe) diff --git a/solr/example/example-DIH/solr/db/conf/managed-schema b/solr/example/example-DIH/solr/db/conf/managed-schema index df01bc02d3d..88ed6d01b9b 100644 --- a/solr/example/example-DIH/solr/db/conf/managed-schema +++ b/solr/example/example-DIH/solr/db/conf/managed-schema @@ -218,7 +218,7 @@ - + diff --git a/solr/example/example-DIH/solr/mail/conf/managed-schema b/solr/example/example-DIH/solr/mail/conf/managed-schema index 06c8e7d0262..6167dafcdfa 100644 --- a/solr/example/example-DIH/solr/mail/conf/managed-schema +++ b/solr/example/example-DIH/solr/mail/conf/managed-schema @@ -154,7 +154,7 @@ - + diff --git a/solr/example/example-DIH/solr/solr/conf/managed-schema b/solr/example/example-DIH/solr/solr/conf/managed-schema index d6de5f5daaf..eed5a83c5eb 100644 --- a/solr/example/example-DIH/solr/solr/conf/managed-schema +++ b/solr/example/example-DIH/solr/solr/conf/managed-schema @@ -218,7 +218,7 @@ - + From 030c3c8f72be848d670b0ab8a0f7846ea9dbb086 Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Fri, 1 Sep 2017 12:22:49 -0500 Subject: [PATCH 33/44] Ref Guide: update ref guide readme for better explanation of build dependencies and targets --- solr/solr-ref-guide/README.adoc | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/solr/solr-ref-guide/README.adoc b/solr/solr-ref-guide/README.adoc index a66d79defeb..b1613965d19 100644 --- a/solr/solr-ref-guide/README.adoc +++ b/solr/solr-ref-guide/README.adoc @@ -18,26 +18,40 @@ This is the source for the Solr Reference Guide. -Raw content is stored in Asciidoc (`.adoc`) formated files in the `src/` directory. +Raw content is stored in Asciidoc (`.adoc`) formatted files in the `src/` directory. 
+== Prerequisites for Building These files are processed with AsciiDoctor in 2 different ways: -* Via 'Jekyll' to build an HTML browsable version of the Ref Guide -** Prerequisites: `Ruby` and the following gems must be installed: -*** `jekyll` -*** `jekyll-asciidoc` -*** `pygments.rb` -* Via `asciidoctor-ant` to build the officially released PDF version of the Ref Guide -** Prerequisites: None (except for those required to use the Lucene/Solr build: Java, Ant) +* Via Jekyll to build an HTML browsable version of the Ref Guide. +** Prerequisites: `Ruby` (v2.1 or higher) and the following gems must be installed: +*** `jekyll`: v3.5, not v4.x. Use `gem install --force --version 3.5.0 jekyll` to force install of Jekyll 3.5.0. +*** `jekyll-asciidoc`: v2.1 or higher. Use `gem install jekyll-asciidoc` to install. +*** `pygments.rb`: v1.1.2 or higher. Use `gem install pygments.rb` to install. +* Via `asciidoctor-ant` to build the officially released PDF version of the Ref Guide. +** Prerequisites: None beyond those required to use the main Lucene/Solr build: Java, and Ant. +== Building the Guide For details on building the ref guide, see `ant -p`. +There are currently four available targets: + +* `ant default`: builds both the PDF and HTML versions of the Solr Ref Guide. +* `ant build-site`: builds only the HTML version. +* `ant build-pdf`: builds only the PDF version. +* `ant clean`: removes the `../build/solr-ref-guide` directory. + +The output of all builds will be located in `../build/solr-ref-guide`. + +== Key Directories Key directories to be aware of: -* `src` - where all human edited `*.adoc` files realted to the Guide live, as well as various configuration, theme, and template files. +* `src` - where all human edited `*.adoc` files related to the Guide live, as well as various configuration, theme, and template files. * `tools` - custom Java code for parsing metadata in our `src/*.adoc` files to produce some `_data/` files for site & pdf navigation purposes. * `../build/solr-ref-guide/content` - a copy of the `src` dir generated by ant where: ** `*.template` files are processed to replace ant properties with their runtime values ** some `../build/solr-ref-guide/content/_data` files are generated by our java tools based header attributes from each of the `*.adoc` files * `../build/solr-ref-guide/html-site` - HTML generated version of the ref guide * `../build/solr-ref-guide/apache-solr-ref-guide-X.Y.pdf` - PDF generated version of the ref guide + +See the additional documentation in `src/metadocs` for more information about how to edit files, build for releases, or modifying any Jekyll or PDF templates. From aff647ecfaf5af3bbeb2363b82821c53c5df7f3d Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Tue, 5 Sep 2017 09:35:45 -0500 Subject: [PATCH 34/44] SOLR-9526: Update Ref Guide for schemaless changes --- solr/solr-ref-guide/src/schemaless-mode.adoc | 62 ++++++++++++++------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/solr/solr-ref-guide/src/schemaless-mode.adoc b/solr/solr-ref-guide/src/schemaless-mode.adoc index 825c294ac6b..f351bd63191 100644 --- a/solr/solr-ref-guide/src/schemaless-mode.adoc +++ b/solr/solr-ref-guide/src/schemaless-mode.adoc @@ -22,7 +22,7 @@ Schemaless Mode is a set of Solr features that, when used together, allow users These Solr features, all controlled via `solrconfig.xml`, are: -. 
Managed schema: Schema modifications are made at runtime through Solr APIs, which requires the use of `schemaFactory` that supports these changes - see <> for more details. +. Managed schema: Schema modifications are made at runtime through Solr APIs, which requires the use of a `schemaFactory` that supports these changes. See the section <> for more details. . Field value class guessing: Previously unseen fields are run through a cascading set of value-based parsers, which guess the Java class of field values - parsers for Boolean, Integer, Long, Float, Double, and Date are currently available. . Automatic schema field addition, based on field value class(es): Previously unseen fields are added to the schema, based on field value Java classes, which are mapped to schema field types - see <>. @@ -35,7 +35,7 @@ The three features of schemaless mode are pre-configured in the `_default` <> to confirm this: `curl \http://localhost:8983/solr/gettingstarted/schema/fields` will output: @@ -84,19 +84,23 @@ You can configure the `ManagedIndexSchemaFactory` (and control the resource file ---- -=== Define an UpdateRequestProcessorChain +=== Enable Field Class Guessing -The UpdateRequestProcessorChain allows Solr to guess field types, and you can define the default field type classes to use. To start, you should define it as follows (see the javadoc links below for update processor factory documentation): +In Solr, an <> defines a chain of plugins that are applied to documents before or while they are indexed. + +The field guessing aspect of Solr's schemaless mode uses a specially-defined UpdateRequestProcessorChain that allows Solr to guess field types. You can also define the default field type classes to use. + +To start, you should define it as follows (see the javadoc links below for update processor factory documentation): [source,xml] ---- - + [^\w-\.] _ - + @@ -120,11 +124,11 @@ The UpdateRequestProcessorChain allows Solr to guess field types, and you can de yyyy-MM-dd - + - java.lang.String + java.lang.String text_general - + *_str 256 @@ -140,7 +144,7 @@ The UpdateRequestProcessorChain allows Solr to guess field types, and you can de pdates - java.lang.Long + java.lang.Long java.lang.Integer plongs @@ -152,14 +156,26 @@ The UpdateRequestProcessorChain allows Solr to guess field types, and you can de + processor="uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date,add-schema-fields"> ---- -Javadocs for update processor factories mentioned above: +There are many things defined in this chain. Let's step through a few of them. + +<1> First, we're using the FieldNameMutatingUpdateProcessorFactory to lower-case all field names. Note that this and every following `` element include a `name`. These names will be used in the final chain definition at the end of this example. +<2> Next we add several update request processors to parse different field types. Note the ParseDateFieldUpdateProcessorFactory includes a long list of possible date formations that would be parsed into valid Solr dates. If you have a custom date, you could add it to this list (see the link to the Javadocs below to get information on how). +<3> Once the fields have been parsed, we define the field types that will be assigned to those fields. You can modify any of these that you would like to change. +<4> In this definition, if the parsing step decides the incoming data in a field is a string, we will put this into a field in Solr with the field type `text_general`. 
This field type by default allows Solr to query on this field. +<5> After we've added the `text_general` field, we have also defined a copy field rule that will copy all data from the new `text_general` field to a field with the same name suffixed with `_str`. This is done by Solr's dynamic fields feature. By defining the target of the copy field rule as a dynamic field in this way, you can control the field type used in your schema. The default selection allows Solr to facet, highlight, and sort on these fields. +<6> This is another example of a mapping rule. In this case we define that when either of the `Long` or `Integer` field parsers identify a field, they should both map their fields to the `plongs` field type. +<7> Finally, we add a chain definition that calls the list of plugins. These plugins are each called by the names we gave to them when we defined them. We can also add other processors to the chain, as shown here. Note we have also given the entire chain a `name` ("add-unknown-fields-to-the-schema"). We'll use this name in the next section to specify that our update request handler should use this chain definition. + +CAUTION: This chain definition will make a number of copy field rules for string fields to be created from corresponding text fields. If your data causes you to end up with a lot of copy field rules, indexing may be slowed down noticeably, and your index size will be larger. To control for these issues, it's recommended that you review the copy field rules that are created, and remove any which you do not need for faceting, sorting, highlighting, etc. + +If you're interested in more information about the classes used in this chain, here are links to the Javadocs for update processor factories mentioned above: * {solr-javadocs}/solr-core/org/apache/solr/update/processor/UUIDUpdateProcessorFactory.html[UUIDUpdateProcessorFactory] * {solr-javadocs}/solr-core/org/apache/solr/update/processor/RemoveBlankFieldUpdateProcessorFactory.html[RemoveBlankFieldUpdateProcessorFactory] @@ -170,9 +186,13 @@ Javadocs for update processor factories mentioned above: * {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory] * {solr-javadocs}/solr-core/org/apache/solr/update/processor/AddSchemaFieldsUpdateProcessorFactory.html[AddSchemaFieldsUpdateProcessorFactory] -=== Make the UpdateRequestProcessorChain the Default for the UpdateRequestHandler +=== Set the Default UpdateRequestProcessorChain -Once the UpdateRequestProcessorChain has been defined, you must instruct your UpdateRequestHandlers to use it when working with index updates (i.e., adding, removing, replacing documents). There are two ways to do this. The update chain shown above has a `default=true` attribute which will use it for any update handler. An alternative, more explicit way is to use <> to set the defaults on all `/update` request handlers: +Once the UpdateRequestProcessorChain has been defined, you must instruct your UpdateRequestHandlers to use it when working with index updates (i.e., adding, removing, replacing documents). + +There are two ways to do this. The update chain shown above has a `default=true` attribute which will use it for any update handler. 
+ +An alternative, more explicit way is to use <> to set the defaults on all `/update` request handlers: [source,xml] ---- @@ -183,14 +203,18 @@ Once the UpdateRequestProcessorChain has been defined, you must instruct your Up ---- -[IMPORTANT] -==== -After each of these changes have been made, Solr should be restarted (or, you can reload the cores to load the new `solrconfig.xml` definitions). -==== +IMPORTANT: After all of these changes have been made, Solr should be restarted or the cores reloaded. + +=== Disabling Automatic Field Guessing + +Automatic field creation can be disabled with the `update.autoCreateFields` property. To do this, you can use the <> with a command such as: + +[source,bash] +curl http://host:8983/solr/mycollection/config -d '{"set-user-property": {"update.autoCreateFields":"false"}}' == Examples of Indexed Documents -Once the schemaless mode has been enabled (whether you configured it manually or are using `_default`), documents that include fields that are not defined in your schema will be indexed, using the guessed field types which are automatically added to the schema. +Once the schemaless mode has been enabled (whether you configured it manually or are using the `_default` configset), documents that include fields that are not defined in your schema will be indexed, using the guessed field types which are automatically added to the schema. For example, adding a CSV document will cause unknown fields to be added, with fieldTypes based on values: From 547228df170912739a28974a47e60a1ceb7a241b Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Tue, 5 Sep 2017 11:14:53 -0400 Subject: [PATCH 35/44] SOLR-11324: Clean up mention of trie fields in documentation and source comments --- solr/CHANGES.txt | 2 ++ .../java/org/apache/solr/schema/DateRangeField.java | 4 ++-- .../processor/FieldLengthUpdateProcessorFactory.java | 6 +++--- .../processor/FieldMutatingUpdateProcessorFactory.java | 4 ++-- .../ParseDateFieldUpdateProcessorFactory.java | 4 ++-- .../ParseDoubleFieldUpdateProcessorFactory.java | 4 ++-- .../ParseFloatFieldUpdateProcessorFactory.java | 4 ++-- .../processor/ParseIntFieldUpdateProcessorFactory.java | 4 ++-- .../ParseLongFieldUpdateProcessorFactory.java | 4 ++-- .../test/org/apache/solr/BasicFunctionalityTest.java | 2 +- .../org/apache/solr/TestDistributedMissingSort.java | 8 ++++---- .../test/org/apache/solr/search/CursorMarkTest.java | 10 +++++----- .../org/apache/solr/search/TestSolrQueryParser.java | 2 +- solr/example/films/README.txt | 6 +++--- solr/solr-ref-guide/src/schema-api.adoc | 4 ++-- solr/solr-ref-guide/src/working-with-dates.adoc | 4 ++-- 16 files changed, 37 insertions(+), 35 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 69c414d3937..4118e75dab8 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -889,6 +889,8 @@ Other Changes * SOLR-11261, SOLR-10966: Upgrade to Hadoop 2.7.4 to fix incompatibility with Java 9. (Uwe Schindler) + +* SOLR-11324: Clean up mention of trie fields in documentation and source comments. 
(Steve Rowe) ================== 6.6.1 ================== diff --git a/solr/core/src/java/org/apache/solr/schema/DateRangeField.java b/solr/core/src/java/org/apache/solr/schema/DateRangeField.java index 8dde953f8aa..b7c3329302c 100644 --- a/solr/core/src/java/org/apache/solr/schema/DateRangeField.java +++ b/solr/core/src/java/org/apache/solr/schema/DateRangeField.java @@ -39,7 +39,7 @@ import org.apache.solr.util.DateMathParser; import org.locationtech.spatial4j.shape.Shape; /** - * A field for indexed dates and date ranges. It's mostly compatible with TrieDateField. It has the potential to allow + * A field for indexed dates and date ranges. It's mostly compatible with DatePointField. It has the potential to allow * efficient faceting, similar to facet.enum. * * @see NumberRangePrefixTreeStrategy @@ -75,7 +75,7 @@ public class DateRangeField extends AbstractSpatialPrefixTreeFieldTypeFor example, with the configuration listed below any documents * containing String values (such as "abcdef" or * "xyz") in a field declared in the schema using - * TrieIntField or TrieLongField + * IntPointField or LongPointField * would have those Strings replaced with the length of those fields as an * Integer * (ie: 6 and 3 respectively) @@ -43,8 +43,8 @@ import static org.apache.solr.update.processor.FieldValueMutatingUpdateProcessor *

  * <processor class="solr.FieldLengthUpdateProcessorFactory">
  *   <arr name="typeClass">
- *     <str>solr.TrieIntField</str>
- *     <str>solr.TrieLongField</str>
+ *     <str>solr.IntPointField</str>
+ *     <str>solr.LongPointField</str>
  *   </arr>
  * </processor>
*/ diff --git a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java index c9034f86da2..a8cb87dfd42 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java @@ -79,7 +79,7 @@ import static org.apache.solr.update.processor.FieldMutatingUpdateProcessor.SELE * In the ExampleFieldMutatingUpdateProcessorFactory configured below, * fields will be mutated if the name starts with "foo" or "bar"; * unless the field name contains the substring "SKIP" or - * the fieldType is (or subclasses) TrieDateField. Meaning a field named + * the fieldType is (or subclasses) DatePointField. Meaning a field named * "foo_SKIP" is guaranteed not to be selected, but a field named "bar_smith" * that uses StrField will be selected. *

@@ -92,7 +92,7 @@ import static org.apache.solr.update.processor.FieldMutatingUpdateProcessor.SELE * <str name="fieldRegex">.*SKIP.*</str> * </lst> * <lst name="exclude"> - * <str name="typeClass">solr.TrieDateField</str> + * <str name="typeClass">solr.DatePointField</str> * </lst> * </processor> * diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java index 9d0311c4a6b..5958f3a3b70 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java @@ -47,8 +47,8 @@ import org.slf4j.LoggerFactory; *

*

* The default selection behavior is to mutate both those fields that don't match - * a schema field, as well as those fields that match a schema field with a field - * type that uses class solr.TrieDateField. + * a schema field, as well as those fields that match a schema field with a date + * field type. *

*

* If all values are parseable as dates (or are already Date), then the field will diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseDoubleFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseDoubleFieldUpdateProcessorFactory.java index c2d2e8ec476..93badadf8f2 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseDoubleFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseDoubleFieldUpdateProcessorFactory.java @@ -38,8 +38,8 @@ import java.util.Locale; *

*

* The default selection behavior is to mutate both those fields that don't match - * a schema field, as well as those fields that match a schema field with a field - * type that uses class solr.TrieDoubleField. + * a schema field, as well as those fields that match a schema field with a double + * field type. *

*

* If all values are parseable as double (or are already Double), then the field diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseFloatFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseFloatFieldUpdateProcessorFactory.java index 778e7775cd6..311b4aec8b7 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseFloatFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseFloatFieldUpdateProcessorFactory.java @@ -38,8 +38,8 @@ import java.util.Locale; *

*

* The default selection behavior is to mutate both those fields that don't match - * a schema field, as well as those fields that match a schema field with a field - * type that uses class solr.TrieFloatField. + * a schema field, as well as those fields that match a schema field with a float + * field type. *

*

* If all values are parseable as float (or are already Float), then the field diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseIntFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseIntFieldUpdateProcessorFactory.java index eebc7ff54d7..fa7e1caf59d 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseIntFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseIntFieldUpdateProcessorFactory.java @@ -35,8 +35,8 @@ import java.util.Locale; *

*

* The default selection behavior is to mutate both those fields that don't match - * a schema field, as well as those fields that match a schema field with a field - * type that uses class solr.TrieIntField. + * a schema field, as well as those fields that match a schema field with an int + * field type. *

*

* If all values are parseable as int (or are already Integer), then the field diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseLongFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseLongFieldUpdateProcessorFactory.java index bc7d1da142b..78863c1b100 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ParseLongFieldUpdateProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ParseLongFieldUpdateProcessorFactory.java @@ -35,8 +35,8 @@ import java.util.Locale; *

*

* The default selection behavior is to mutate both those fields that don't match - * a schema field, as well as those fields that match a schema field with a field - * type that uses class solr.TrieLongField. + * a schema field, as well as those fields that match a schema field with a long + * field type. *

*

* If all values are parseable as long (or are already Long), then the field diff --git a/solr/core/src/test/org/apache/solr/BasicFunctionalityTest.java b/solr/core/src/test/org/apache/solr/BasicFunctionalityTest.java index 52e2e781819..84280ee3a82 100644 --- a/solr/core/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/solr/core/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -869,7 +869,7 @@ public class BasicFunctionalityTest extends SolrTestCaseJ4 { // testing everything from query level is hard because // time marches on ... and there is no easy way to reach into the - // bowels of TrieDateField and muck with the definition of "now" + // bowels of DatePointField and muck with the definition of "now" // ... // BUT: we can test that crazy combinations of "NOW" all work correctly, // assuming the test doesn't take too long to run... diff --git a/solr/core/src/test/org/apache/solr/TestDistributedMissingSort.java b/solr/core/src/test/org/apache/solr/TestDistributedMissingSort.java index 378ad0dcfc5..416556a1f95 100644 --- a/solr/core/src/test/org/apache/solr/TestDistributedMissingSort.java +++ b/solr/core/src/test/org/apache/solr/TestDistributedMissingSort.java @@ -30,10 +30,10 @@ public class TestDistributedMissingSort extends BaseDistributedSearchTestCase { schemaString = "schema-distributed-missing-sort.xml"; } - String sint1_ml = "one_i1_ml"; // TrieIntField, sortMissingLast=true, multiValued=false - String sint1_mf = "two_i1_mf"; // TrieIntField, sortMissingFirst=true, multiValued=false - String long1_ml = "three_l1_ml"; // TrieLongField, sortMissingLast=true, multiValued=false - String long1_mf = "four_l1_mf"; // TrieLongField, sortMissingFirst=true, multiValued=false + String sint1_ml = "one_i1_ml"; // int field, sortMissingLast=true, multiValued=false + String sint1_mf = "two_i1_mf"; // int field, sortMissingFirst=true, multiValued=false + String long1_ml = "three_l1_ml"; // long field, sortMissingLast=true, multiValued=false + String long1_mf = "four_l1_mf"; // long field, sortMissingFirst=true, multiValued=false String string1_ml = "five_s1_ml"; // StringField, sortMissingLast=true, multiValued=false String string1_mf = "six_s1_mf"; // StringField, sortMissingFirst=true, multiValued=false diff --git a/solr/core/src/test/org/apache/solr/search/CursorMarkTest.java b/solr/core/src/test/org/apache/solr/search/CursorMarkTest.java index dab47ca4b0e..19722be77ea 100644 --- a/solr/core/src/test/org/apache/solr/search/CursorMarkTest.java +++ b/solr/core/src/test/org/apache/solr/search/CursorMarkTest.java @@ -230,15 +230,15 @@ public class CursorMarkTest extends SolrTestCaseJ4 { random().nextBytes(randBytes); val = new BytesRef(randBytes); } else if (fieldName.contains("int")) { - val = random().nextInt(); // TrieIntField + val = random().nextInt(); } else if (fieldName.contains("long")) { - val = random().nextLong(); // TrieLongField + val = random().nextLong(); } else if (fieldName.contains("float")) { - val = random().nextFloat() * random().nextInt(); // TrieFloatField + val = random().nextFloat() * random().nextInt(); } else if (fieldName.contains("double")) { - val = random().nextDouble() * random().nextInt(); // TrieDoubleField + val = random().nextDouble() * random().nextInt(); } else if (fieldName.contains("date")) { - val = random().nextLong(); // TrieDateField + val = random().nextLong(); } else if (fieldName.startsWith("currency")) { val = random().nextDouble(); } else if (fieldName.startsWith("uuid")) { diff --git 
a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 65c4d8f28aa..1db6b5a7c22 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -278,7 +278,7 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 { q = qParser.getQuery(); assertEquals(26, ((TermInSetQuery)q).getTermData().size()); - // large numeric filter query should use TermsQuery (for trie fields) + // large numeric filter query should use TermsQuery qParser = QParser.getParser("foo_ti:(1 2 3 4 5 6 7 8 9 10 20 19 18 17 16 15 14 13 12 11)", req); qParser.setIsFilter(true); // this may change in the future qParser.setParams(params); diff --git a/solr/example/films/README.txt b/solr/example/films/README.txt index f1fabe05a56..d1679d22223 100644 --- a/solr/example/films/README.txt +++ b/solr/example/films/README.txt @@ -27,7 +27,7 @@ curl http://localhost:8983/solr/films/schema -X POST -H 'Content-type:applicatio }, "add-field" : { "name":"initial_release_date", - "type":"tdate", + "type":"pdate", "stored":true } }' @@ -83,7 +83,7 @@ FAQ: Why override the schema of the _name_ and _initial_release_date_ fields? Without overriding those field types, the _name_ field would have been guessed as a multi-valued string field type - and _initial_release_date_ would have been guessed as a multi-valued tdate type. It makes more sense with this + and _initial_release_date_ would have been guessed as a multi-valued pdate type. It makes more sense with this particular data set domain to have the movie name be a single valued general full-text searchable field, and for the release date also to be single valued. @@ -109,7 +109,7 @@ curl http://localhost:8983/solr/films/schema -X POST -H 'Content-type:applicatio }, "add-field" : { "name":"initial_release_date", - "type":"tdate", + "type":"pdate", "stored":true } }' diff --git a/solr/solr-ref-guide/src/schema-api.adoc b/solr/solr-ref-guide/src/schema-api.adoc index 5c5a8e20559..9865df0aa20 100644 --- a/solr/solr-ref-guide/src/schema-api.adoc +++ b/solr/solr-ref-guide/src/schema-api.adoc @@ -94,14 +94,14 @@ The `add-field` command adds a new field definition to your schema. If a field w All of the properties available when defining a field with manual `schema.xml` edits can be passed via the API. These request attributes are described in detail in the section <>. -For example, to define a new stored field named "sell-by", of type "tdate", you would POST the following request: +For example, to define a new stored field named "sell-by", of type "pdate", you would POST the following request: [source,bash] ---- curl -X POST -H 'Content-type:application/json' --data-binary '{ "add-field":{ "name":"sell-by", - "type":"tdate", + "type":"pdate", "stored":true } }' http://localhost:8983/solr/gettingstarted/schema ---- diff --git a/solr/solr-ref-guide/src/working-with-dates.adoc b/solr/solr-ref-guide/src/working-with-dates.adoc index 4fb12d7b149..c6321f655a4 100644 --- a/solr/solr-ref-guide/src/working-with-dates.adoc +++ b/solr/solr-ref-guide/src/working-with-dates.adoc @@ -68,7 +68,7 @@ Solr's `DateRangeField` supports the same point in time date syntax described ab * `[2014 TO 2014-12-01]` – From the start of 2014 till the end of the first day of December. * `[* TO 2014-12-01]` – From the earliest representable time thru till the end of the day on 2014-12-01. 
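For instance, as a minimal sketch (an editorial aside, not part of the patch), assume a schema that already contains a `DateRangeField`-typed field named `event_dr`; the field and collection names here are illustrative assumptions only. A range value is supplied at index time as a plain string and matched with the same syntax at query time:

[source,bash]
----
# Index a document whose event_dr value spans May 1, 2014 through June 3, 2014
# (hypothetical field and collection names)
curl -X POST -H 'Content-type:application/json' \
  'http://localhost:8983/solr/my_collection/update?commit=true' \
  --data-binary '[{"id":"1", "event_dr":"[2014-05 TO 2014-06-03]"}]'

# Any query range that overlaps the indexed range matches (Intersects is the default predicate)
curl 'http://localhost:8983/solr/my_collection/query' --data-urlencode 'q=event_dr:[2014-06 TO 2014-07]'
----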
-Limitations: The range syntax doesn't support embedded date math. If you specify a date instance supported by TrieDateField with date math truncating it, like `NOW/DAY`, you still get the first millisecond of that day, not the entire day's range. Exclusive ranges (using `{` & `}`) work in _queries_ but not for _indexing_ ranges. +Limitations: The range syntax doesn't support embedded date math. If you specify a date instance supported by DatePointField with date math truncating it, like `NOW/DAY`, you still get the first millisecond of that day, not the entire day's range. Exclusive ranges (using `{` & `}`) work in _queries_ but not for _indexing_ ranges. == Date Math @@ -154,7 +154,7 @@ http://localhost:8983/solr/my_collection/select?q=*:*&facet.range=my_date_field& == More DateRangeField Details -`DateRangeField` is almost a drop-in replacement for places where `TrieDateField` is used. The only difference is that Solr's XML or SolrJ response formats will expose the stored data as a String instead of a Date. The underlying index data for this field will be a bit larger. Queries that align to units of time a second on up should be faster than TrieDateField, especially if it's in UTC. But the main point of DateRangeField as its name suggests is to allow indexing date ranges. To do that, simply supply strings in the format shown above. It also supports specifying 3 different relational predicates between the indexed data, and the query range: `Intersects` (default), `Contains`, `Within`. You can specify the predicate by querying using the `op` local-params parameter like so: +`DateRangeField` is almost a drop-in replacement for places where `DatePointField` is used. The only difference is that Solr's XML or SolrJ response formats will expose the stored data as a String instead of a Date. The underlying index data for this field will be a bit larger. Queries that align to units of time a second on up should be faster than TrieDateField, especially if it's in UTC. But the main point of DateRangeField as its name suggests is to allow indexing date ranges. To do that, simply supply strings in the format shown above. It also supports specifying 3 different relational predicates between the indexed data, and the query range: `Intersects` (default), `Contains`, `Within`. You can specify the predicate by querying using the `op` local-params parameter like so: [source,text] ---- From 5a8eb5388d7f05c5b1c4eb121288ec2a7ed67eb7 Mon Sep 17 00:00:00 2001 From: Cassandra Targett Date: Tue, 5 Sep 2017 10:37:35 -0500 Subject: [PATCH 36/44] SOLR-11305: finish clean up of Trie* fields in docs --- solr/solr-ref-guide/src/spatial-search.adoc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/solr/solr-ref-guide/src/spatial-search.adoc b/solr/solr-ref-guide/src/spatial-search.adoc index 64d813fd107..4cc05d66657 100644 --- a/solr/solr-ref-guide/src/spatial-search.adoc +++ b/solr/solr-ref-guide/src/spatial-search.adoc @@ -354,9 +354,10 @@ The `BBoxField` field type indexes a single rectangle (bounding box) per documen [source,xml] ---- + - + geo="true" distanceUnits="kilometers" numberType="pdouble" /> + ---- BBoxField is actually based off of 4 instances of another field type referred to by numberType. It also uses a boolean to flag a dateline cross. Assuming you want to use the relevancy feature, docValues is required. Some of the attributes are in common with the RPT field like geo, units, worldBounds, and spatialContextFactory because they share some of the same spatial infrastructure. 
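To make the BBoxField configuration above more concrete, here is a minimal sketch (an editorial aside, not part of the patch); the field name `bbox` and the collection name are illustrative assumptions. The box is indexed in WKT `ENVELOPE` syntax (minX, maxX, maxY, minY) and, since docValues are enabled, results can be scored by how much they overlap the query box:

[source,bash]
----
# Index a rectangle; ENVELOPE order is minX, maxX, maxY, minY
# (hypothetical field and collection names)
curl -X POST -H 'Content-type:application/json' \
  'http://localhost:8983/solr/my_collection/update?commit=true' \
  --data-binary '[{"id":"1", "bbox":"ENVELOPE(-10, 20, 15, 10)"}]'

# Find intersecting boxes and rank them by their overlap ratio with the query box
curl 'http://localhost:8983/solr/my_collection/query' \
  --data-urlencode 'q={!field f=bbox score=overlapRatio}Intersects(ENVELOPE(-10, 20, 15, 10))'
----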
From 96150badce8234cac00a23c2d5da55545e0be958 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 5 Sep 2017 18:34:05 +0200 Subject: [PATCH 37/44] LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter. --- lucene/CHANGES.txt | 3 + .../icu/ICUNormalizer2CharFilter.java | 35 +++-- .../icu/TestICUNormalizer2CharFilter.java | 21 +++ .../org/apache/lucene/search/DisiWrapper.java | 12 ++ .../apache/lucene/search/TermInSetQuery.java | 143 ++++-------------- 5 files changed, 89 insertions(+), 125 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a819916f275..02b22315940 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -195,6 +195,9 @@ Bug Fixes * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even if possible). (Dawid Weiss) +* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter. + (Adrien Grand) + Improvements * LUCENE-7489: Better storage of sparse doc-values fields with the default diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java index 706550a0f71..c529f74ed08 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.Reader; import java.util.Objects; +import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.charfilter.BaseCharFilter; import com.ibm.icu.text.Normalizer2; @@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter { ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) { super(in); this.normalizer = Objects.requireNonNull(normalizer); - this.tmpBuffer = new char[bufferSize]; + this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize); } @Override @@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter { return -1; } - private final char[] tmpBuffer; + private final CharacterUtils.CharacterBuffer tmpBuffer; - private int readInputToBuffer() throws IOException { - final int len = input.read(tmpBuffer); - if (len == -1) { - inputFinished = true; - return 0; + private void readInputToBuffer() throws IOException { + while (true) { + // CharacterUtils.fill is supplementary char aware + final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input); + + assert tmpBuffer.getOffset() == 0; + inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength()); + + if (hasRemainingChars == false) { + inputFinished = true; + break; + } + + final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength()); + if (normalizer.isInert(lastCodePoint)) { + // we require an inert char so that we can normalize content before and + // after this character independently + break; + } } - inputBuffer.append(tmpBuffer, 0, len); // if checkedInputBoundary was at the end of a buffer, we need to check that char again checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0); - // this loop depends on 'isInert' (changes under normalization) but looks only at characters. 
- // so we treat all surrogates as non-inert for simplicity - if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) { - return len; - } else return len + readInputToBuffer(); } private int readAndNormalizeFromInput() { diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java index 438a93179c2..822466f0192 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java @@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu; import java.io.IOException; import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.util.TestUtil; @@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase { } a.close(); } + + // https://issues.apache.org/jira/browse/LUCENE-7956 + public void testVeryLargeInputOfNonInertChars() throws Exception { + char[] text = new char[1000000]; + Arrays.fill(text, 'a'); + try (Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + return new TokenStreamComponents(new KeywordTokenizer()); + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); + } + }) { + checkAnalysisConsistency(random(), a, false, new String(text)); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java index f2543409f1d..28ba989be62 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java @@ -60,6 +60,18 @@ public class DisiWrapper { } } + // For TermInSetQuery + public DisiWrapper(DocIdSetIterator iterator) { + this.scorer = null; + this.spans = null; + this.iterator = iterator; + this.cost = iterator.cost(); + this.doc = -1; + this.twoPhaseView = null; + this.approximation = iterator; + this.matchCost = 0f; + } + public DisiWrapper(Spans spans) { this.scorer = null; this.spans = spans; diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 9b64d379174..5a6676fca90 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -17,12 +17,9 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.List; -import java.util.Objects; import java.util.Set; import java.util.SortedSet; @@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PrefixCodedTerms; import org.apache.lucene.index.PrefixCodedTerms.TermIterator; import 
org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; -import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BooleanClause.Occur; @@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; /** @@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable { return Collections.emptyList(); } - private static class TermAndState { - final String field; - final TermsEnum termsEnum; - final BytesRef term; - final TermState state; - final int docFreq; - final long totalTermFreq; - - TermAndState(String field, TermsEnum termsEnum) throws IOException { - this.field = field; - this.termsEnum = termsEnum; - this.term = BytesRef.deepCopyOf(termsEnum.term()); - this.state = termsEnum.termState(); - this.docFreq = termsEnum.docFreq(); - this.totalTermFreq = termsEnum.totalTermFreq(); - } - } - - private static class WeightOrDocIdSet { - final Weight weight; - final DocIdSet set; - - WeightOrDocIdSet(Weight weight) { - this.weight = Objects.requireNonNull(weight); - this.set = null; - } - - WeightOrDocIdSet(DocIdSet bitset) { - this.set = bitset; - this.weight = null; - } - } - @Override public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { return new ConstantScoreWeight(this, boost) { @@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable { // order to protect highlighters } - /** - * On the given leaf context, try to either rewrite to a disjunction if - * there are few matching terms, or build a bitset containing matching docs. - */ - private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException { + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { final LeafReader reader = context.reader(); Terms terms = reader.terms(field); @@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable { PostingsEnum docs = null; TermIterator iterator = termData.iterator(); - // We will first try to collect up to 'threshold' terms into 'matchingTerms' - // if there are two many terms, we will fall back to building the 'builder' - final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount()); - assert termData.size() > threshold : "Query should have been rewritten"; - List matchingTerms = new ArrayList<>(threshold); - DocIdSetBuilder builder = null; + // Here we partition postings based on cost: longer ones will be consumed + // lazily while shorter ones are consumed eagerly into a bitset. Compared to + // putting everything into a bitset, this should help skip over unnecessary doc + // ids in the longer postings lists. This should be especially useful if + // document frequencies have a zipfian distribution. 
+ final PriorityQueue longestPostingsLists = new PriorityQueue(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) { + @Override + protected boolean lessThan(PostingsEnum a, PostingsEnum b) { + return a.cost() < b.cost(); + } + }; + DocIdSetBuilder shortestPostingsLists = null; for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { assert field.equals(iterator.field()); if (termsEnum.seekExact(term)) { - if (matchingTerms == null) { - docs = termsEnum.postings(docs, PostingsEnum.NONE); - builder.add(docs); - } else if (matchingTerms.size() < threshold) { - matchingTerms.add(new TermAndState(field, termsEnum)); - } else { - assert matchingTerms.size() == threshold; - builder = new DocIdSetBuilder(reader.maxDoc(), terms); - docs = termsEnum.postings(docs, PostingsEnum.NONE); - builder.add(docs); - for (TermAndState t : matchingTerms) { - t.termsEnum.seekExact(t.term, t.state); - docs = t.termsEnum.postings(docs, PostingsEnum.NONE); - builder.add(docs); + docs = termsEnum.postings(docs, PostingsEnum.NONE); + docs = longestPostingsLists.insertWithOverflow(docs); + if (docs != null) { // the pq is full + if (shortestPostingsLists == null) { + shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc()); } - matchingTerms = null; + shortestPostingsLists.add(docs); } } } - if (matchingTerms != null) { - assert builder == null; - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - for (TermAndState t : matchingTerms) { - final TermContext termContext = new TermContext(searcher.getTopReaderContext()); - termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq); - bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD); - } - Query q = new ConstantScoreQuery(bq.build()); - final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score()); - return new WeightOrDocIdSet(weight); - } else { - assert builder != null; - return new WeightOrDocIdSet(builder.build()); - } - } - private Scorer scorer(DocIdSet set) throws IOException { - if (set == null) { + final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 
0 : 1); + if (numClauses == 0) { return null; } - final DocIdSetIterator disi = set.iterator(); - if (disi == null) { - return null; - } - return new ConstantScoreScorer(this, score(), disi); - } - @Override - public BulkScorer bulkScorer(LeafReaderContext context) throws IOException { - final WeightOrDocIdSet weightOrBitSet = rewrite(context); - if (weightOrBitSet == null) { - return null; - } else if (weightOrBitSet.weight != null) { - return weightOrBitSet.weight.bulkScorer(context); - } else { - final Scorer scorer = scorer(weightOrBitSet.set); - if (scorer == null) { - return null; - } - return new DefaultBulkScorer(scorer); + DisiPriorityQueue queue = new DisiPriorityQueue(numClauses); + for (PostingsEnum postings : longestPostingsLists) { + queue.add(new DisiWrapper(postings)); } - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - final WeightOrDocIdSet weightOrBitSet = rewrite(context); - if (weightOrBitSet == null) { - return null; - } else if (weightOrBitSet.weight != null) { - return weightOrBitSet.weight.scorer(context); - } else { - return scorer(weightOrBitSet.set); + if (shortestPostingsLists != null) { + queue.add(new DisiWrapper(shortestPostingsLists.build().iterator())); } + final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue); + return new ConstantScoreScorer(this, boost, disi); } }; } + } From 723ca96bc0ee41bbf8f0f3fe9374bcd1c34f9d1b Mon Sep 17 00:00:00 2001 From: yonik Date: Tue, 5 Sep 2017 13:33:08 -0400 Subject: [PATCH 38/44] SOLR-11317: min/max aggs use integral values for integral fields --- solr/CHANGES.txt | 4 + .../apache/solr/search/facet/MinMaxAgg.java | 100 +++++++++++++++--- .../org/apache/solr/search/facet/SlotAcc.java | 84 +++++++++++++-- .../solr/search/facet/TestJsonFacets.java | 66 ++++++++---- 4 files changed, 213 insertions(+), 41 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4118e75dab8..ad0a522c642 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -83,6 +83,10 @@ New Features * SOLR-11244: Query DSL for Solr (Cao Manh Dat) +* SOLR-11317: JSON Facet API: min/max aggregations on numeric fields are now typed better so int/long + fields return an appropriate integral type rather than a double. 
(yonik) + + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java index 0f4bea6f271..5a48ab2221c 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java @@ -25,7 +25,9 @@ import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LongValues; +import org.apache.solr.common.SolrException; import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.StrFieldSource; import org.apache.solr.search.function.FieldNameValueSource; @@ -48,29 +50,39 @@ public class MinMaxAgg extends SimpleAggValueSource { String field = ((FieldNameValueSource)vs).getFieldName(); sf = fcontext.qcontext.searcher().getSchema().getField(field); - vs = sf.getType().getValueSource(sf, null); // temporary implementation to make existing code work + if (sf.multiValued() || sf.getType().multiValuedFieldCache()) { + vs = null; + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "min/max aggregations can't be used on multi-valued field " + field); + } else { + vs = sf.getType().getValueSource(sf, null); + } } if (vs instanceof StrFieldSource) { - if (sf.multiValued() || sf.getType().multiValuedFieldCache()) { - if (sf.hasDocValues()) { - // dv - } else { - // uif - } - } else { - return new SingleValuedOrdAcc(fcontext, sf, numSlots); + return new SingleValuedOrdAcc(fcontext, sf, numSlots); + } + + // Since functions don't currently have types, we rely on the type of the field + if (sf != null && sf.getType().getNumberType() != null) { + switch (sf.getType().getNumberType()) { + case FLOAT: + case DOUBLE: + return new DFuncAcc(vs, fcontext, numSlots); + case INTEGER: + case LONG: + case DATE: + return new LFuncAcc(vs, fcontext, numSlots); } } // numeric functions - return new ValSlotAcc(vs, fcontext, numSlots); + return new DFuncAcc(vs, fcontext, numSlots); } @Override public FacetMerger createFacetMerger(Object prototype) { - if (prototype instanceof Number) - return new NumericMerger(); + if (prototype instanceof Double) + return new NumericMerger(); // still use NumericMerger to handle NaN? 
else if (prototype instanceof Comparable) { return new ComparableMerger(); } else { @@ -122,8 +134,8 @@ public class MinMaxAgg extends SimpleAggValueSource { } } - class ValSlotAcc extends DoubleFuncSlotAcc { - public ValSlotAcc(ValueSource values, FacetContext fcontext, int numSlots) { + class DFuncAcc extends DoubleFuncSlotAcc { + public DFuncAcc(ValueSource values, FacetContext fcontext, int numSlots) { super(values, fcontext, numSlots, Double.NaN); } @@ -149,6 +161,66 @@ public class MinMaxAgg extends SimpleAggValueSource { } } + class LFuncAcc extends LongFuncSlotAcc { + FixedBitSet exists; + public LFuncAcc(ValueSource values, FacetContext fcontext, int numSlots) { + super(values, fcontext, numSlots, 0); + exists = new FixedBitSet(numSlots); + } + + @Override + public void collect(int doc, int slotNum) throws IOException { + long val = values.longVal(doc); + if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query + + long currVal = result[slotNum]; + if (currVal == 0 && !exists.get(slotNum)) { + exists.set(slotNum); + result[slotNum] = val; + } else if (Long.compare(val, currVal) * minmax < 0) { + result[slotNum] = val; + } + } + + @Override + public Object getValue(int slot) { + long val = result[slot]; + if (val == 0 && exists.get(slot)) { + return null; + } else { + return val; + } + } + + @Override + public void resize(Resizer resizer) { + super.resize(resizer); + exists = resizer.resize(exists); + } + + @Override + public int compare(int slotA, int slotB) { + long a = result[slotA]; + long b = result[slotB]; + boolean ea = a != 0 || exists.get(slotA); + boolean eb = b != 0 || exists.get(slotB); + + if (ea != eb) { + if (ea) return 1; // a exists and b doesn't TODO: we need context to be able to sort missing last! SOLR-10618 + if (eb) return -1; // b exists and a is missing + } + + return Long.compare(a, b); + } + + @Override + public void reset() { + super.reset(); + exists.clear(0, exists.length()); + } + + } + abstract class OrdAcc extends SlotAcc { final static int MISSING = -1; diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java index 1240051be6e..578ef1796a9 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java @@ -16,14 +16,6 @@ */ package org.apache.solr.search.facet; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.queries.function.FunctionValues; -import org.apache.lucene.queries.function.ValueSource; -import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.search.DocIterator; -import org.apache.solr.search.DocSet; -import org.apache.solr.search.SolrIndexSearcher; - import java.io.Closeable; import java.io.IOException; import java.lang.reflect.Array; @@ -32,6 +24,16 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.FixedBitSet; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.SolrIndexSearcher; + /** * Accumulates statistics separated by a slot number. * There is a separate statistic per slot. 
The slot is usually an ordinal into a set of values, e.g. tracking a count @@ -140,6 +142,38 @@ public abstract class SlotAcc implements Closeable { return values; } + public long[] resize(long[] old, long defaultValue) { + long[] values = new long[getNewSize()]; + if (defaultValue != 0) { + Arrays.fill(values, 0, values.length, defaultValue); + } + for (int i = 0; i < old.length; i++) { + long val = old[i]; + if (val != defaultValue) { + int newSlot = getNewSlot(i); + if (newSlot >= 0) { + values[newSlot] = val; + } + } + } + return values; + } + + public FixedBitSet resize(FixedBitSet old) { + FixedBitSet values = new FixedBitSet(getNewSize()); + int oldSize = old.length(); + + for(int oldSlot = 0;;) { + oldSlot = values.nextSetBit(oldSlot); + if (oldSlot == DocIdSetIterator.NO_MORE_DOCS) break; + int newSlot = getNewSlot(oldSlot); + values.set(newSlot); + if (++oldSlot >= oldSize) break; + } + + return values; + } + public T[] resize(T[] old, T defaultValue) { T[] values = (T[]) Array.newInstance(old.getClass().getComponentType(), getNewSize()); if (defaultValue != null) { @@ -222,6 +256,40 @@ abstract class DoubleFuncSlotAcc extends FuncSlotAcc { } } +abstract class LongFuncSlotAcc extends FuncSlotAcc { + long[] result; + long initialValue; + + public LongFuncSlotAcc(ValueSource values, FacetContext fcontext, int numSlots, long initialValue) { + super(values, fcontext, numSlots); + this.initialValue = initialValue; + result = new long[numSlots]; + if (initialValue != 0) { + reset(); + } + } + + @Override + public int compare(int slotA, int slotB) { + return Long.compare(result[slotA], result[slotB]); + } + + @Override + public Object getValue(int slot) { + return result[slot]; + } + + @Override + public void reset() { + Arrays.fill(result, initialValue); + } + + @Override + public void resize(Resizer resizer) { + result = resizer.resize(result, initialValue); + } +} + abstract class IntSlotAcc extends SlotAcc { int[] result; // use LongArray32 int initialValue; diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index 5ecd3a1a85c..559240bc048 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -467,28 +467,29 @@ public class TestJsonFacets extends SolrTestCaseHS { // single valued strings - doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_s", "cat_s","cat_s", "where_s","where_s", "num_d","num_d", "num_i","num_i", "super_s","super_s", "val_b","val_b", "date","date_dt", "sparse_s","sparse_s" ,"multi_ss","multi_ss") ); + doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_s", "cat_s","cat_s", "where_s","where_s", "num_d","num_d", "num_i","num_i", "num_l","long_l", "super_s","super_s", "val_b","val_b", "date","date_dt", "sparse_s","sparse_s" ,"multi_ss","multi_ss") ); // multi-valued strings, long/float substitute for int/double - doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") ); + doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_l","long_l", "num_is","num_ls", "num_fs", "num_ds", 
"super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") ); // multi-valued strings, method=dv for terms facets - doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss")); + doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "num_l","long_l","super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss")); // single valued docvalues for strings, and single valued numeric doc values for numeric fields - doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") ); + doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_l","long_ld", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") ); // multi-valued docvalues FacetFieldProcessorByArrayDV.unwrap_singleValued_multiDv = false; // better multi-valued coverage - doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") ); + doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_l","long_ld", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") ); // multi-valued docvalues FacetFieldProcessorByArrayDV.unwrap_singleValued_multiDv = true; - doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") ); + doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_l","long_ld", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") ); } public static void doStatsTemplated(Client client, ModifiableSolrParams p) throws Exception { p.set("Z_num_i", "Z_" + p.get("num_i") ); + p.set("Z_num_l", "Z_" + p.get("num_l") ); p.set("sparse_num_d", "sparse_" + p.get("num_d") ); if (p.get("num_is") == null) p.add("num_is","num_is"); if (p.get("num_fs") == null) p.add("num_fs","num_fs"); @@ -528,6 +529,7 @@ public class TestJsonFacets extends SolrTestCaseHS { String num_is = m.expand("${num_is}"); String num_fs = m.expand("${num_fs}"); String Z_num_i = m.expand("${Z_num_i}"); + String Z_num_l = m.expand("${Z_num_l}"); String val_b = m.expand("${val_b}"); 
     String date = m.expand("${date}");
     String super_s = m.expand("${super_s}");
@@ -553,13 +555,13 @@ public class TestJsonFacets extends SolrTestCaseHS {
     iclient.add(doc, null);
     iclient.add(doc, null);
     iclient.add(doc, null);  // a couple of deleted docs
-    iclient.add(sdoc("id", "2", cat_s, "B", where_s, "NJ", num_d, "-9", num_i, "-5", num_is,"3",num_is,"-1", num_fs,"3",num_fs,"-1.5", super_s,"superman", date,"2002-02-02T02:02:02Z", val_b, "false" , multi_ss,"a", multi_ss,"b" , Z_num_i, "0"), null);
+    iclient.add(sdoc("id", "2", cat_s, "B", where_s, "NJ", num_d, "-9", num_i, "-5", num_is,"3",num_is,"-1", num_fs,"3",num_fs,"-1.5", super_s,"superman", date,"2002-02-02T02:02:02Z", val_b, "false" , multi_ss,"a", multi_ss,"b" , Z_num_i, "0", Z_num_l,"0"), null);
     iclient.add(sdoc("id", "3"), null);
     iclient.commit();
-    iclient.add(sdoc("id", "4", cat_s, "A", where_s, "NJ", num_d, "2", sparse_num_d,"-4",num_i, "3", num_is,"0",num_is,"3", num_fs,"0", num_fs,"3", super_s,"spiderman", date,"2003-03-03T03:03:03Z" , multi_ss, "b", Z_num_i, ""+Integer.MIN_VALUE), null);
+    iclient.add(sdoc("id", "4", cat_s, "A", where_s, "NJ", num_d, "2", sparse_num_d,"-4",num_i, "3", num_is,"0",num_is,"3", num_fs,"0", num_fs,"3", super_s,"spiderman", date,"2003-03-03T03:03:03Z" , multi_ss, "b", Z_num_i, ""+Integer.MIN_VALUE, Z_num_l,Long.MIN_VALUE), null);
     iclient.add(sdoc("id", "5", cat_s, "B", where_s, "NJ", num_d, "11", num_i, "7", num_is,"0", num_fs,"0", super_s,"batman" , date,"2001-02-03T01:02:03Z" ,sparse_s,"two", multi_ss, "a"), null);
     iclient.commit();
-    iclient.add(sdoc("id", "6", cat_s, "B", where_s, "NY", num_d, "-5", num_i, "-5", num_is,"-1", num_fs,"-1.5", super_s,"hulk" , date,"2002-03-01T03:02:01Z" , multi_ss, "b", multi_ss, "a", Z_num_i, ""+Integer.MAX_VALUE), null);
+    iclient.add(sdoc("id", "6", cat_s, "B", where_s, "NY", num_d, "-5", num_i, "-5", num_is,"-1", num_fs,"-1.5", super_s,"hulk" , date,"2002-03-01T03:02:01Z" , multi_ss, "b", multi_ss, "a", Z_num_i, ""+Integer.MAX_VALUE, Z_num_l,Long.MAX_VALUE), null);
     iclient.commit();
     client.commit();
@@ -685,7 +687,18 @@ public class TestJsonFacets extends SolrTestCaseHS {
             ", f2:{ 'buckets':[{ val:'B', count:3, n1:-2.0}, { val:'A', count:2, n1:6.0 }]} }"
     );
-
+    // facet on numbers to test resize from hashing (may need to be sorting by the metric to test that)
+    client.testJQ(params(p, "q", "*:*"
+        , "json.facet", "{" +
+            " f1:{${terms} type:field, field:${num_is}, facet:{a:'min(${num_i})'}, sort:'a asc' }" +
+            ",f2:{${terms} type:field, field:${num_is}, facet:{a:'max(${num_i})'}, sort:'a desc' }" +
+            "}"
+        )
+        , "facets=={count:6 " +
+            ",f1:{ buckets:[{val:-1,count:2,a:-5},{val:3,count:2,a:-5},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:0,count:2,a:3}, ] } " +
+            ",f2:{ buckets:[{val:0,count:2,a:7},{val:3,count:2,a:3},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:-1,count:2,a:-5}, ] } " +
+            "}"
+    );

     // percentiles 0,10,50,90,100
     // catA: 2.0 2.2 3.0 3.8 4.0
@@ -983,16 +996,20 @@ public class TestJsonFacets extends SolrTestCaseHS {

     // stats at top level
     client.testJQ(params(p, "q", "*:*"
-        , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', avg2:'avg(def(${num_d},0))', min1:'min(${num_d})', max1:'max(${num_d})'" +
+        , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', avg2:'avg(def(${num_d},0))', mind:'min(${num_d})', maxd:'max(${num_d})'" +
            ", numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})'" +
            ", where_hll:'hll(${where_s})', hll_num_i:'hll(${num_i})', hll_num_d:'hll(${num_d})', hll_date:'hll(${date})'" +
-           ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})' }"
+           ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})'" +
+           ", mini:'min(${num_i})', maxi:'max(${num_i})'" +
+           " }"
        )
        , "facets=={ 'count':6, " +
-           "sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, min1:-9.0, max1:11.0" +
+           "sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, mind:-9.0, maxd:11.0" +
           ", numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5" +
           ", where_hll:2, hll_num_i:4, hll_num_d:5, hll_date:5" +
-          ", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639}"
+          ", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639" +
+          ", mini:-5, maxi:7" +
+          "}"
+
     );

     // stats at top level, no matches
@@ -1307,16 +1324,26 @@ public class TestJsonFacets extends SolrTestCaseHS {
         "}"
     );

-    // test 0, min/max int
+    // test 0, min/max int/long
     client.testJQ(params(p, "q", "*:*"
        , "json.facet", "{" +
-           " u : 'unique(${Z_num_i})'" +
+           " u : 'unique(${Z_num_i})'" +
+           ", u2 : 'unique(${Z_num_l})'" +
+           ", min1 : 'min(${Z_num_i})', max1 : 'max(${Z_num_i})'" +
+           ", min2 : 'min(${Z_num_l})', max2 : 'max(${Z_num_l})'" +
           ", f1:{${terms} type:field, field:${Z_num_i} }" +
+           ", f2:{${terms} type:field, field:${Z_num_l} }" +
           "}"
       )
       , "facets=={count:6 " +
          ",u:3" +
+         ",u2:3" +
+         ",min1:" + Integer.MIN_VALUE +
+         ",max1:" + Integer.MAX_VALUE +
+         ",min2:" + Long.MIN_VALUE +
+         ",max2:" + Long.MAX_VALUE +
          ",f1:{ buckets:[{val:" + Integer.MIN_VALUE + ",count:1},{val:0,count:1},{val:" + Integer.MAX_VALUE+",count:1}]} " +
+         ",f2:{ buckets:[{val:" + Long.MIN_VALUE + ",count:1},{val:0,count:1},{val:" + Long.MAX_VALUE+",count:1}]} " +
          "}"
     );

@@ -1394,11 +1421,12 @@ public class TestJsonFacets extends SolrTestCaseHS {
     // test acc reuse (i.e. reset() method).  This is normally used for stats that are not calculated in the first phase,
     // currently non-sorting stats.
     client.testJQ(params(p, "q", "*:*"
-        , "json.facet", "{f1:{type:terms, field:'${cat_s}', facet:{h:'hll(${where_s})' , u:'unique(${where_s})', mind:'min(${num_d})', maxd:'max(${num_d})', sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})' } }}"
+        , "json.facet", "{f1:{type:terms, field:'${cat_s}', facet:{h:'hll(${where_s})' , u:'unique(${where_s})', mind:'min(${num_d})', maxd:'max(${num_d})', mini:'min(${num_i})', maxi:'max(${num_i})'" +
+            ", sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})' } }}"
        )
        , "facets=={ 'count':6, " +
-           "'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148}," +
-               " {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0}] } } "
+           "'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148}," +
+               " {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0}] } } "
     );

From ffb7e4f2a53a1adb7cd9cb7d8055027458a5f4d9 Mon Sep 17 00:00:00 2001
From: yonik
Date: Tue, 5 Sep 2017 14:00:59 -0400
Subject: [PATCH 39/44] SOLR-11317: tests - disable failing TestSQLHandler

---
 solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
index 1999965f339..32060029bd6 100644
--- a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
+++ b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
@@ -23,6 +23,7 @@ import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -43,6 +44,7 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
+@LuceneTestCase.AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11317")
 public class TestSQLHandler extends AbstractFullDistribZkTestBase {
 
   static {

From 2ed1573adc9a7e340d1ab6ccaabf05d1bf024be3 Mon Sep 17 00:00:00 2001
From: yonik
Date: Tue, 5 Sep 2017 14:14:19 -0400
Subject: [PATCH 40/44] SOLR-11317: change FacetStream to handle int/long as
 well as double for metrics

---
 .../test/org/apache/solr/handler/TestSQLHandler.java |  3 ---
 .../solr/client/solrj/io/stream/FacetStream.java     | 10 +++++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
index 32060029bd6..70ebd0aa788 100644
--- a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
+++ b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
@@ -23,7 +23,6 @@ import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -38,13 +37,11 @@ import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.SolrParams;
-
 import org.apache.solr.common.util.NamedList;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-@LuceneTestCase.AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11317")
 public class TestSQLHandler extends AbstractFullDistribZkTestBase {
 
   static {

diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FacetStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FacetStream.java
index e11bf023276..639782e1707 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FacetStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/FacetStream.java
@@ -495,11 +495,15 @@ public class FacetStream extends TupleStream implements Expressible {
     for(Metric metric : _metrics) {
       String identifier = metric.getIdentifier();
       if(!identifier.startsWith("count(")) {
-        double d = (double)bucket.get("facet_"+m);
+        Number d = ((Number)bucket.get("facet_"+m));
        if(metric.outputLong) {
-          t.put(identifier, Math.round(d));
+          if (d instanceof Long || d instanceof Integer) {
+            t.put(identifier, d.longValue());
+          } else {
+            t.put(identifier, Math.round(d.doubleValue()));
+          }
        } else {
-          t.put(identifier, d);
+          t.put(identifier, d.doubleValue());
        }
        ++m;
       } else {

From 967fe8a8b9d4e9416cfcbdc58a36dfda1eb0d854 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 5 Sep 2017 21:17:03 +0200
Subject: [PATCH 41/44] LUCENE-7956: Remove unrelated changes.

---
 .../org/apache/lucene/search/DisiWrapper.java |  12 --
 .../apache/lucene/search/TermInSetQuery.java  | 143 ++++++++++++++----
 2 files changed, 112 insertions(+), 43 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
index 28ba989be62..f2543409f1d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
@@ -60,18 +60,6 @@ public class DisiWrapper {
     }
   }
 
-  // For TermInSetQuery
-  public DisiWrapper(DocIdSetIterator iterator) {
-    this.scorer = null;
-    this.spans = null;
-    this.iterator = iterator;
-    this.cost = iterator.cost();
-    this.doc = -1;
-    this.twoPhaseView = null;
-    this.approximation = iterator;
-    this.matchCost = 0f;
-  }
-
   public DisiWrapper(Spans spans) {
     this.scorer = null;
     this.spans = spans;

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 5a6676fca90..9b64d379174 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,9 +17,12 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 
@@ -30,6 +33,8 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
 import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause.Occur;
@@ -38,7 +43,6 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.DocIdSetBuilder;
-import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
@@ -167,6 +171,39 @@ public class TermInSetQuery extends Query implements Accountable {
     return Collections.emptyList();
   }
 
+  private static class TermAndState {
+    final String field;
+    final TermsEnum termsEnum;
+    final BytesRef term;
+    final TermState state;
+    final int docFreq;
+    final long totalTermFreq;
+
+    TermAndState(String field, TermsEnum termsEnum) throws IOException {
+      this.field = field;
+      this.termsEnum = termsEnum;
+      this.term = BytesRef.deepCopyOf(termsEnum.term());
+      this.state = termsEnum.termState();
+      this.docFreq = termsEnum.docFreq();
+      this.totalTermFreq = termsEnum.totalTermFreq();
+    }
+  }
+
+  private static class WeightOrDocIdSet {
+    final Weight weight;
+    final DocIdSet set;
+
+    WeightOrDocIdSet(Weight weight) {
+      this.weight = Objects.requireNonNull(weight);
+      this.set = null;
+    }
+
+    WeightOrDocIdSet(DocIdSet bitset) {
+      this.set = bitset;
+      this.weight = null;
+    }
+  }
+
   @Override
   public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
     return new ConstantScoreWeight(this, boost) {
@@ -179,8 +216,11 @@ public class TermInSetQuery extends Query implements Accountable {
         // order to protect highlighters
       }
 
-      @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
+      /**
+       * On the given leaf context, try to either rewrite to a disjunction if
+       * there are few matching terms, or build a bitset containing matching docs.
+       */
+      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
         final LeafReader reader = context.reader();
 
         Terms terms = reader.terms(field);
@@ -191,49 +231,90 @@ public class TermInSetQuery extends Query implements Accountable {
         PostingsEnum docs = null;
         TermIterator iterator = termData.iterator();
 
-        // Here we partition postings based on cost: longer ones will be consumed
-        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
-        // putting everything into a bitset, this should help skip over unnecessary doc
-        // ids in the longer postings lists. This should be especially useful if
-        // document frequencies have a zipfian distribution.
-        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
-          @Override
-          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
-            return a.cost() < b.cost();
-          }
-        };
-        DocIdSetBuilder shortestPostingsLists = null;
+        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
+        // if there are too many terms, we will fall back to building the 'builder'
+        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
+        assert termData.size() > threshold : "Query should have been rewritten";
+        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
+        DocIdSetBuilder builder = null;
         for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
           assert field.equals(iterator.field());
           if (termsEnum.seekExact(term)) {
-            docs = termsEnum.postings(docs, PostingsEnum.NONE);
-            docs = longestPostingsLists.insertWithOverflow(docs);
-            if (docs != null) { // the pq is full
-              if (shortestPostingsLists == null) {
-                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
+            if (matchingTerms == null) {
+              docs = termsEnum.postings(docs, PostingsEnum.NONE);
+              builder.add(docs);
+            } else if (matchingTerms.size() < threshold) {
+              matchingTerms.add(new TermAndState(field, termsEnum));
+            } else {
+              assert matchingTerms.size() == threshold;
+              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
+              docs = termsEnum.postings(docs, PostingsEnum.NONE);
+              builder.add(docs);
+              for (TermAndState t : matchingTerms) {
+                t.termsEnum.seekExact(t.term, t.state);
+                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
+                builder.add(docs);
               }
-              shortestPostingsLists.add(docs);
+              matchingTerms = null;
             }
           }
         }
+        if (matchingTerms != null) {
+          assert builder == null;
+          BooleanQuery.Builder bq = new BooleanQuery.Builder();
+          for (TermAndState t : matchingTerms) {
+            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
+            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
+            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
+          }
+          Query q = new ConstantScoreQuery(bq.build());
+          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
+          return new WeightOrDocIdSet(weight);
+        } else {
+          assert builder != null;
+          return new WeightOrDocIdSet(builder.build());
+        }
+      }
-        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
-        if (numClauses == 0) {
+      private Scorer scorer(DocIdSet set) throws IOException {
+        if (set == null) {
           return null;
         }
+        final DocIdSetIterator disi = set.iterator();
+        if (disi == null) {
+          return null;
+        }
+        return new ConstantScoreScorer(this, score(), disi);
+      }
-        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
-        for (PostingsEnum postings : longestPostingsLists) {
-          queue.add(new DisiWrapper(postings));
+      @Override
+      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+        if (weightOrBitSet == null) {
+          return null;
+        } else if (weightOrBitSet.weight != null) {
+          return weightOrBitSet.weight.bulkScorer(context);
+        } else {
+          final Scorer scorer = scorer(weightOrBitSet.set);
+          if (scorer == null) {
+            return null;
+          }
+          return new DefaultBulkScorer(scorer);
        }
-        if (shortestPostingsLists != null) {
-          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
+      }
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+        if (weightOrBitSet == null) {
+          return null;
+        } else if (weightOrBitSet.weight != null) {
+          return weightOrBitSet.weight.scorer(context);
+        } else {
+          return scorer(weightOrBitSet.set);
        }
-        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
-        return new ConstantScoreScorer(this, boost, disi);
      }
    };
  }
-  }

From cb0ff1a799f61a1b532855a20b6859f34f59d9af Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 5 Sep 2017 21:21:51 +0200
Subject: [PATCH 42/44] LUCENE-7956: Make the start offset of codePointBefore explicit.

---
 .../apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
index c529f74ed08..07b1c88c04b 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@@ -110,7 +110,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
         break;
       }
 
-      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
+      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
       if (normalizer.isInert(lastCodePoint)) {
         // we require an inert char so that we can normalize content before and
         // after this character independently

From cc344dc6bd9e71ed7848618630b51f4633e1dd50 Mon Sep 17 00:00:00 2001
From: yonik
Date: Tue, 5 Sep 2017 16:06:41 -0400
Subject: [PATCH 43/44] SOLR-11316: date support for min/max, fix missing bug
 for int/long fields

---
 solr/CHANGES.txt                              |  3 ++
 .../apache/solr/search/facet/MinMaxAgg.java   | 31 +++++++++++++++++--
 .../solr/search/facet/TestJsonFacets.java     | 22 +++++++++++--
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ad0a522c642..be89fb1880f 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -86,6 +86,9 @@ New Features
 * SOLR-11317: JSON Facet API: min/max aggregations on numeric fields are now typed better so int/long fields
   return an appropriate integral type rather than a double. (yonik)
 
+* SOLR-11316: JSON Facet API: min/max aggregations are now supported on single-valued date fields.
+  (yonik)
+
 Bug Fixes
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java
index 5a48ab2221c..008d0fd4445 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java
@@ -18,6 +18,7 @@ package org.apache.solr.search.facet;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Date;
 
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiDocValues;
@@ -70,8 +71,9 @@ public class MinMaxAgg extends SimpleAggValueSource {
         return new DFuncAcc(vs, fcontext, numSlots);
       case INTEGER:
       case LONG:
-      case DATE:
         return new LFuncAcc(vs, fcontext, numSlots);
+      case DATE:
+        return new DateFuncAcc(vs, fcontext, numSlots);
     }
   }
@@ -185,7 +187,7 @@ public class MinMaxAgg extends SimpleAggValueSource {
     @Override
     public Object getValue(int slot) {
       long val = result[slot];
-      if (val == 0 && exists.get(slot)) {
+      if (val == 0 && !exists.get(slot)) {
         return null;
       } else {
         return val;
@@ -221,6 +223,31 @@ public class MinMaxAgg extends SimpleAggValueSource {
   }
 
+  class DateFuncAcc extends LongFuncSlotAcc {
+    private static final long MISSING = Long.MIN_VALUE;
+    public DateFuncAcc(ValueSource values, FacetContext fcontext, int numSlots) {
+      super(values, fcontext, numSlots, MISSING);
+    }
+
+    @Override
+    public void collect(int doc, int slotNum) throws IOException {
+      long val = values.longVal(doc);
+      if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query
+
+      long currVal = result[slotNum];
+      if (Long.compare(val, currVal) * minmax < 0 || currVal == MISSING) {
+        result[slotNum] = val;
+      }
+    }
+
+    // let compare be the default for now (since we can't yet correctly handle sortMissingLast
+
+    @Override
+    public Object getValue(int slot) {
+      return result[slot] == MISSING ? null : new Date(result[slot]);
+    }
+  }
+
   abstract class OrdAcc extends SlotAcc {
     final static int MISSING = -1;
 
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index 559240bc048..33d7fa89c78 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -695,11 +695,29 @@ public class TestJsonFacets extends SolrTestCaseHS {
            "}"
        )
        , "facets=={count:6 " +
-           ",f1:{ buckets:[{val:-1,count:2,a:-5},{val:3,count:2,a:-5},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:0,count:2,a:3}, ] } " +
-           ",f2:{ buckets:[{val:0,count:2,a:7},{val:3,count:2,a:3},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:-1,count:2,a:-5}, ] } " +
+           ",f1:{ buckets:[{val:-1,count:2,a:-5},{val:3,count:2,a:-5},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:0,count:2,a:3} ] } " +
+           ",f2:{ buckets:[{val:0,count:2,a:7},{val:3,count:2,a:3},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:-1,count:2,a:-5} ] } " +
            "}"
     );
+
+    // Same thing for dates
+    // test min/max of string field
+    if (date.equals("date_dt") || date.equals("date_dtd")) {  // supports only single valued currently...
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{" +
+              " f3:{${terms} type:field, field:${num_is}, facet:{a:'min(${date})'}, sort:'a desc' }" +
+              ",f4:{${terms} type:field, field:${num_is}, facet:{a:'max(${date})'}, sort:'a asc' }" +
+              "}"
+          )
+          , "facets=={count:6 " +
+              ",f3:{ buckets:[{val:-1,count:2,a:'2002-02-02T02:02:02Z'},{val:3,count:2,a:'2002-02-02T02:02:02Z'},{val:0,count:2,a:'2001-02-03T01:02:03Z'},{val:-5,count:1,a:'2001-01-01T01:01:01Z'},{val:2,count:1,a:'2001-01-01T01:01:01Z'} ] } " +
+              ",f4:{ buckets:[{val:-5,count:1,a:'2001-01-01T01:01:01Z'},{val:2,count:1,a:'2001-01-01T01:01:01Z'},{val:-1,count:2,a:'2002-03-01T03:02:01Z'},{val:0,count:2,a:'2003-03-03T03:03:03Z'},{val:3,count:2,a:'2003-03-03T03:03:03Z'} ] } " +
+              "}"
+      );
+    }
+
+
     // percentiles 0,10,50,90,100
     // catA: 2.0 2.2 3.0 3.8 4.0
     // catB: -9.0 -8.2 -5.0 7.800000000000001 11.0

From f49de60b54f5941a909145e77af29743c3479c30 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Tue, 5 Sep 2017 18:48:12 -0400
Subject: [PATCH 44/44] LUCENE-7940: add CHANGES entry

---
 lucene/CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 02b22315940..0dced289cf3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,8 @@ New Features
 * LUCENE-7927: Add LongValueFacetCounts, to compute facet counts for individual
   numeric values (Mike McCandless)
 
+* LUCENE-7940: Add BengaliAnalyzer. (Md. Abdulla-Al-Sun via Robert Muir)
+
 Optimizations
 
 * LUCENE-7905: Optimize how OrdinalMap (used by