SOLR-14782: Document how to unescape for the QueryElevationComponent.

2020-09-02 11:45:36 +02:00 · 2020-09-02 11:45:36 +02:00 · 56dbb66503
parent 20af6dbd3d
commit 56dbb66503
2 changed files with 37 additions and 1 deletions
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java
@ -77,4 +77,27 @@ public class TestPatternReplaceCharFilterFactory extends BaseTokenStreamFactoryT
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }
+
+  /** Test with backslash unescape: the char filter strips escaping backslashes before tokenization, and the expected offsets index into the escaped input. */
+  public void testUnescape() throws Exception {
+    Reader reader = new StringReader("aaa\\ bbb\\-ccc"); // raw text: aaa\ bbb\-ccc (13 chars)
+    reader = charFilterFactory("PatternReplace",
+            "pattern", "\\\\(.)", // regex: one literal backslash followed by any single captured character
+            "replacement", "$1").create(reader); // keep only the captured character, dropping the backslash
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts,
+            new String[] { "aaa", "bbb-ccc" },
+            new int[] { 0, 5 }, // presumably start offsets; 5 is where "bbb" begins in the escaped input
+            new int[] { 3, 13 }); // presumably end offsets; 13 is the length of the escaped input
+    // Second case: unescape only when the escaped character is neither alphabetic nor a digit.
+    reader = new StringReader("a\\b\\0\\-c\\é\\ d"); // raw text: a\b\0\-c\é\ d (13 chars)
+    reader = charFilterFactory("PatternReplace",
+            "pattern", "\\\\([^\\p{IsAlphabetic}\\p{Digit}])",
+            "replacement", "$1").create(reader);
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts,
+            new String[] { "a\\b\\0-c\\é", "d" }, // backslash kept before b, 0 and é; removed before '-' and ' '
+            new int[] { 0, 12 },
+            new int[] { 10, 13 });
+  }
 }
--- a/solr/solr-ref-guide/src/the-query-elevation-component.adoc
+++ b/solr/solr-ref-guide/src/the-query-elevation-component.adoc
@ -61,7 +61,20 @@ Optionally, in the Query Elevation Component configuration you can also specify
 The Query Elevation Search Component takes the following parameters:

 `queryFieldType`::
-Specifies which fieldType should be used to analyze the incoming text. For example, it may be appropriate to use a fieldType with a LowerCaseFilter.
+Specifies which fieldType should be used to analyze the incoming text. For example, it may be appropriate to use a fieldType with a LowerCaseFilter. As another example, if you need to unescape backslash-escaped queries, you can define a fieldType that preprocesses the text with a PatternReplaceCharFilter. Here is an example of such a fieldType (traditionally defined in `schema.xml`):
+
+[source,xml]
+----
+<fieldType name="unescapelowercase" class="solr.TextField">
+  <analyzer>
+    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\\(.)" replacement="$1"/>
+    <tokenizer class="solr.StandardTokenizerFactory"/>
+    <filter class="solr.LowerCaseFilterFactory"/>
+  </analyzer>
+</fieldType>
+----
+
+To unescape only non-alphanumeric characters, the pattern could instead be `\\([^\p{IsAlphabetic}\p{Digit}])`.

 `config-file`::
 Path to the file that defines query elevation. This file must exist in `<instanceDir>/conf/<config-file>` or `<dataDir>/<config-file>`. If the file exists in the `conf/` directory it will be loaded once at startup. If it exists in the `data/` directory, it will be reloaded for each IndexReader.