mirror of
https://github.com/apache/lucene.git
synced 2025-02-15 14:35:50 +00:00
SOLR-13622: Rename FilesStream -> CatStream
Also fixes an OS-dependent bug in the cat() tests in StreamExpressionTest.
This commit is contained in:
parent
de522052c8
commit
299d92da5c
@ -52,7 +52,7 @@ New Features
|
|||||||
|
|
||||||
* SOLR-13553: Node level custom RequestHandlers (noble)
|
* SOLR-13553: Node level custom RequestHandlers (noble)
|
||||||
|
|
||||||
* SOLR-13622: Add files() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
|
* SOLR-13622: Add cat() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
|
||||||
|
|
||||||
* SOLR-11866: QueryElevationComponent can have query rules configured with match="subset" wherein the words need only
|
* SOLR-11866: QueryElevationComponent can have query rules configured with match="subset" wherein the words need only
|
||||||
match a subset of the query's words and in any order. (Bruno Roustant via David Smiley)
|
match a subset of the query's words and in any order. (Bruno Roustant via David Smiley)
|
||||||
|
@ -46,7 +46,7 @@ import org.apache.solr.core.SolrResourceLoader;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
public class FilesStream extends TupleStream implements Expressible {
|
public class CatStream extends TupleStream implements Expressible {
|
||||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||||
|
|
||||||
private final String commaDelimitedFilepaths;
|
private final String commaDelimitedFilepaths;
|
||||||
@ -60,11 +60,11 @@ public class FilesStream extends TupleStream implements Expressible {
|
|||||||
private CrawlFile currentFilePath;
|
private CrawlFile currentFilePath;
|
||||||
private LineIterator currentFileLines;
|
private LineIterator currentFileLines;
|
||||||
|
|
||||||
public FilesStream(StreamExpression expression, StreamFactory factory) throws IOException {
|
public CatStream(StreamExpression expression, StreamFactory factory) throws IOException {
|
||||||
this(factory.getValueOperand(expression, 0), factory.getIntOperand(expression, "maxLines", -1));
|
this(factory.getValueOperand(expression, 0), factory.getIntOperand(expression, "maxLines", -1));
|
||||||
}
|
}
|
||||||
|
|
||||||
public FilesStream(String commaDelimitedFilepaths, int maxLines) {
|
public CatStream(String commaDelimitedFilepaths, int maxLines) {
|
||||||
if (commaDelimitedFilepaths == null) {
|
if (commaDelimitedFilepaths == null) {
|
||||||
throw new IllegalArgumentException("No filepaths provided to stream");
|
throw new IllegalArgumentException("No filepaths provided to stream");
|
||||||
}
|
}
|
@ -34,7 +34,7 @@ public class SolrDefaultStreamFactory extends DefaultStreamFactory {
|
|||||||
public SolrDefaultStreamFactory() {
|
public SolrDefaultStreamFactory() {
|
||||||
super();
|
super();
|
||||||
this.withFunctionName("analyze", AnalyzeEvaluator.class);
|
this.withFunctionName("analyze", AnalyzeEvaluator.class);
|
||||||
this.withFunctionName("files", FilesStream.class);
|
this.withFunctionName("cat", CatStream.class);
|
||||||
this.withFunctionName("classify", ClassifyStream.class);
|
this.withFunctionName("classify", ClassifyStream.class);
|
||||||
this.withFunctionName("haversineMeters", HaversineMetersEvaluator.class);
|
this.withFunctionName("haversineMeters", HaversineMetersEvaluator.class);
|
||||||
}
|
}
|
||||||
|
@ -216,31 +216,31 @@ features(collection1,
|
|||||||
numTerms=250)
|
numTerms=250)
|
||||||
----
|
----
|
||||||
|
|
||||||
== files
|
== cat
|
||||||
|
|
||||||
The `files` function reads the specified files or directories and emits each line in the file(s) as a tuple.
|
The `cat` function reads the specified files or directories and emits each line in the file(s) as a tuple.
|
||||||
|
|
||||||
Each emitted tuple contains two fields: `file` and `line`. `file` contains the path to the file being read from relative to the `userfiles` chroot (directly under `$SOLR_HOME`), and `line` contains a line in that file.
|
Each emitted tuple contains two fields: `file` and `line`. `file` contains the path to the file being read from relative to the `userfiles` chroot (directly under `$SOLR_HOME`), and `line` contains a line in that file.
|
||||||
|
|
||||||
`files` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
|
`cat` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
|
||||||
|
|
||||||
=== files Parameters
|
=== cat Parameters
|
||||||
|
|
||||||
* `filePaths`: (Mandatory) a comma separated list of filepaths to read lines from. If the specified path is a directory, it will be crawled recursively and all contained files will be read. To prevent malicious users from reading arbitrary files from Solr nodes, `filePaths` must be a relative path measured from a chroot of `$SOLR_HOME/userfiles` on the node running the streaming expression.
|
* `filePaths`: (Mandatory) a comma separated list of filepaths to read lines from. If the specified path is a directory, it will be crawled recursively and all contained files will be read. To prevent malicious users from reading arbitrary files from Solr nodes, `filePaths` must be a relative path measured from a chroot of `$SOLR_HOME/userfiles` on the node running the streaming expression.
|
||||||
* `maxLines`: (defaults to -1) The maximum number of lines to read (and tuples to emit). If a negative value is specified, all lines in the specified files will be emitted as tuples. Files are read in the order that they appear in the comma-separated `filePaths` argument. If the line-limit is hit, it will be these later files that are partially emitted or not read at all.
|
* `maxLines`: (defaults to -1) The maximum number of lines to read (and tuples to emit). If a negative value is specified, all lines in the specified files will be emitted as tuples. Files are read in the order that they appear in the comma-separated `filePaths` argument. If the line-limit is hit, it will be these later files that are partially emitted or not read at all.
|
||||||
|
|
||||||
=== files Examples
|
=== cat Examples
|
||||||
|
|
||||||
The following example emits all lines from a single text file located at `$SOLR_HOME/userfiles/authors.txt`:
|
The following example emits all lines from a single text file located at `$SOLR_HOME/userfiles/authors.txt`:
|
||||||
[source,text]
|
[source,text]
|
||||||
----
|
----
|
||||||
files("authors.txt")
|
cat("authors.txt")
|
||||||
----
|
----
|
||||||
|
|
||||||
This example will read lines from `$SOLR_HOME/userfiles/authors.txt`, as well as all files (recursively) found under `$SOLR_HOME/userfiles/fiction/scifi`. Only 500 lines will be emitted, meaning that some files may be partially emitted or not read at all:
|
This example will read lines from `$SOLR_HOME/userfiles/authors.txt`, as well as all files (recursively) found under `$SOLR_HOME/userfiles/fiction/scifi`. Only 500 lines will be emitted, meaning that some files may be partially emitted or not read at all:
|
||||||
[source,text]
|
[source,text]
|
||||||
----
|
----
|
||||||
files("authors.txt,fiction/scifi/", maxLines=500)
|
cat("authors.txt,fiction/scifi/", maxLines=500)
|
||||||
----
|
----
|
||||||
|
|
||||||
== nodes
|
== nodes
|
||||||
|
@ -3064,10 +3064,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFileStreamSingleFile() throws Exception {
|
public void testCatStreamSingleFile() throws Exception {
|
||||||
final String fileStream = "files(\"topLevel1.txt\")";
|
final String catStream = "cat(\"topLevel1.txt\")";
|
||||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||||
paramsLoc.set("expr", fileStream);
|
paramsLoc.set("expr", catStream);
|
||||||
paramsLoc.set("qt", "/stream");
|
paramsLoc.set("qt", "/stream");
|
||||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||||
|
|
||||||
@ -3086,10 +3086,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFileStreamMaxLines() throws Exception {
|
public void testCatStreamMaxLines() throws Exception {
|
||||||
final String fileStream = "files(\"topLevel1.txt\", maxLines=2)";
|
final String catStream = "cat(\"topLevel1.txt\", maxLines=2)";
|
||||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||||
paramsLoc.set("expr", fileStream);
|
paramsLoc.set("expr", catStream);
|
||||||
paramsLoc.set("qt", "/stream");
|
paramsLoc.set("qt", "/stream");
|
||||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||||
|
|
||||||
@ -3108,10 +3108,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFileStreamDirectoryCrawl() throws Exception {
|
public void testCatStreamDirectoryCrawl() throws Exception {
|
||||||
final String fileStream = "files(\"directory1\")";
|
final String catStream = "cat(\"directory1\")";
|
||||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||||
paramsLoc.set("expr", fileStream);
|
paramsLoc.set("expr", catStream);
|
||||||
paramsLoc.set("qt", "/stream");
|
paramsLoc.set("qt", "/stream");
|
||||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||||
|
|
||||||
@ -3122,24 +3122,26 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||||||
List<Tuple> tuples = getTuples(solrStream);
|
List<Tuple> tuples = getTuples(solrStream);
|
||||||
assertEquals(8, tuples.size());
|
assertEquals(8, tuples.size());
|
||||||
|
|
||||||
|
final String expectedSecondLevel1Path = "directory1" + File.separator + "secondLevel1.txt";
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
Tuple t = tuples.get(i);
|
Tuple t = tuples.get(i);
|
||||||
assertEquals("secondLevel1.txt line " + String.valueOf(i+1), t.get("line"));
|
assertEquals("secondLevel1.txt line " + String.valueOf(i+1), t.get("line"));
|
||||||
assertEquals("directory1/secondLevel1.txt", t.get("file"));
|
assertEquals(expectedSecondLevel1Path, t.get("file"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
|
||||||
for (int i = 4; i < 8; i++) {
|
for (int i = 4; i < 8; i++) {
|
||||||
Tuple t = tuples.get(i);
|
Tuple t = tuples.get(i);
|
||||||
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
||||||
assertEquals("directory1/secondLevel2.txt", t.get("file"));
|
assertEquals(expectedSecondLevel2Path, t.get("file"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFileStreamMultipleExplicitFiles() throws Exception {
|
public void testCatStreamMultipleExplicitFiles() throws Exception {
|
||||||
final String fileStream = "files(\"topLevel1.txt,directory1/secondLevel2.txt\")";
|
final String catStream = "cat(\"topLevel1.txt,directory1" + File.separator + "secondLevel2.txt\")";
|
||||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||||
paramsLoc.set("expr", fileStream);
|
paramsLoc.set("expr", catStream);
|
||||||
paramsLoc.set("qt", "/stream");
|
paramsLoc.set("qt", "/stream");
|
||||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||||
|
|
||||||
@ -3156,10 +3158,11 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||||||
assertEquals("topLevel1.txt", t.get("file"));
|
assertEquals("topLevel1.txt", t.get("file"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
|
||||||
for (int i = 4; i < 8; i++) {
|
for (int i = 4; i < 8; i++) {
|
||||||
Tuple t = tuples.get(i);
|
Tuple t = tuples.get(i);
|
||||||
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
||||||
assertEquals("directory1/secondLevel2.txt", t.get("file"));
|
assertEquals(expectedSecondLevel2Path, t.get("file"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user