SOLR-13622: Rename FilesStream -> CatStream

Also fixes an OS-dependent 'cat' bug in StreamExpressionTest.
This commit is contained in:
Jason Gerlowski 2019-08-07 21:17:48 -04:00
parent ed137dbe28
commit 2eb493d170
5 changed files with 30 additions and 27 deletions

View File

@ -102,7 +102,7 @@ New Features
* SOLR-13553: Node level custom RequestHandlers (noble)
* SOLR-13622: Add files() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
* SOLR-13622: Add cat() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
* SOLR-11866: QueryElevationComponent can have query rules configured with match="subset" wherein the words need only
match a subset of the query's words and in any order. (Bruno Roustant via David Smiley)

View File

@ -46,7 +46,7 @@ import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FilesStream extends TupleStream implements Expressible {
public class CatStream extends TupleStream implements Expressible {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final String commaDelimitedFilepaths;
@ -60,11 +60,11 @@ public class FilesStream extends TupleStream implements Expressible {
private CrawlFile currentFilePath;
private LineIterator currentFileLines;
public FilesStream(StreamExpression expression, StreamFactory factory) throws IOException {
public CatStream(StreamExpression expression, StreamFactory factory) throws IOException {
this(factory.getValueOperand(expression, 0), factory.getIntOperand(expression, "maxLines", -1));
}
public FilesStream(String commaDelimitedFilepaths, int maxLines) {
public CatStream(String commaDelimitedFilepaths, int maxLines) {
if (commaDelimitedFilepaths == null) {
throw new IllegalArgumentException("No filepaths provided to stream");
}

View File

@ -34,7 +34,7 @@ public class SolrDefaultStreamFactory extends DefaultStreamFactory {
public SolrDefaultStreamFactory() {
super();
this.withFunctionName("analyze", AnalyzeEvaluator.class);
this.withFunctionName("files", FilesStream.class);
this.withFunctionName("cat", CatStream.class);
this.withFunctionName("classify", ClassifyStream.class);
this.withFunctionName("haversineMeters", HaversineMetersEvaluator.class);
}

View File

@ -216,31 +216,31 @@ features(collection1,
numTerms=250)
----
== files
== cat
The `files` function reads the specified files or directories and emits each line in the file(s) as a tuple.
The `cat` function reads the specified files or directories and emits each line in the file(s) as a tuple.
Each emitted tuple contains two fields: `file` and `line`. `file` contains the path of the file being read, relative to the `userfiles` chroot (directly under `$SOLR_HOME`), and `line` contains a single line from that file.
`files` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
`cat` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
=== files Parameters
=== cat Parameters
* `filePaths`: (Mandatory) a comma-separated list of filepaths to read lines from. If the specified path is a directory, it will be crawled recursively and all contained files will be read. To prevent malicious users from reading arbitrary files from Solr nodes, `filePaths` must be a relative path measured from a chroot of `$SOLR_HOME/userfiles` on the node running the streaming expression.
* `maxLines`: (defaults to -1) The maximum number of lines to read (and tuples to emit). If a negative value is specified, all lines in the specified files will be emitted as tuples. Files are read in the order that they appear in the comma-separated `filePaths` argument. If the line limit is hit, it is the files later in that list that will be partially emitted or not read at all.
=== files Examples
=== cat Examples
The following example emits all lines from a single text file located at `$SOLR_HOME/userfiles/authors.txt`:
[source,text]
----
files("authors.txt")
cat("authors.txt")
----
This example will read lines from `$SOLR_HOME/userfiles/authors.txt`, as well as all files (recursively) found under `$SOLR_HOME/userfiles/fiction/scifi`. Only 500 lines will be emitted, meaning that some files may be partially emitted or not read at all:
[source,text]
----
files("authors.txt,fiction/scifi/", maxLines=500)
cat("authors.txt,fiction/scifi/", maxLines=500)
----
== nodes

View File

@ -3064,10 +3064,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
@Test
public void testFileStreamSingleFile() throws Exception {
final String fileStream = "files(\"topLevel1.txt\")";
public void testCatStreamSingleFile() throws Exception {
final String catStream = "cat(\"topLevel1.txt\")";
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
paramsLoc.set("expr", fileStream);
paramsLoc.set("expr", catStream);
paramsLoc.set("qt", "/stream");
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
@ -3086,10 +3086,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
@Test
public void testFileStreamMaxLines() throws Exception {
final String fileStream = "files(\"topLevel1.txt\", maxLines=2)";
public void testCatStreamMaxLines() throws Exception {
final String catStream = "cat(\"topLevel1.txt\", maxLines=2)";
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
paramsLoc.set("expr", fileStream);
paramsLoc.set("expr", catStream);
paramsLoc.set("qt", "/stream");
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
@ -3108,10 +3108,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
@Test
public void testFileStreamDirectoryCrawl() throws Exception {
final String fileStream = "files(\"directory1\")";
public void testCatStreamDirectoryCrawl() throws Exception {
final String catStream = "cat(\"directory1\")";
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
paramsLoc.set("expr", fileStream);
paramsLoc.set("expr", catStream);
paramsLoc.set("qt", "/stream");
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
@ -3122,24 +3122,26 @@ public class StreamExpressionTest extends SolrCloudTestCase {
List<Tuple> tuples = getTuples(solrStream);
assertEquals(8, tuples.size());
final String expectedSecondLevel1Path = "directory1" + File.separator + "secondLevel1.txt";
for (int i = 0; i < 4; i++) {
Tuple t = tuples.get(i);
assertEquals("secondLevel1.txt line " + String.valueOf(i+1), t.get("line"));
assertEquals("directory1/secondLevel1.txt", t.get("file"));
assertEquals(expectedSecondLevel1Path, t.get("file"));
}
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
for (int i = 4; i < 8; i++) {
Tuple t = tuples.get(i);
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
assertEquals("directory1/secondLevel2.txt", t.get("file"));
assertEquals(expectedSecondLevel2Path, t.get("file"));
}
}
@Test
public void testFileStreamMultipleExplicitFiles() throws Exception {
final String fileStream = "files(\"topLevel1.txt,directory1/secondLevel2.txt\")";
public void testCatStreamMultipleExplicitFiles() throws Exception {
final String catStream = "cat(\"topLevel1.txt,directory1" + File.separator + "secondLevel2.txt\")";
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
paramsLoc.set("expr", fileStream);
paramsLoc.set("expr", catStream);
paramsLoc.set("qt", "/stream");
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
@ -3156,10 +3158,11 @@ public class StreamExpressionTest extends SolrCloudTestCase {
assertEquals("topLevel1.txt", t.get("file"));
}
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
for (int i = 4; i < 8; i++) {
Tuple t = tuples.get(i);
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
assertEquals("directory1/secondLevel2.txt", t.get("file"));
assertEquals(expectedSecondLevel2Path, t.get("file"));
}
}