mirror of https://github.com/apache/lucene.git
SOLR-13622: Rename FilesStream -> CatStream
Also fixes a 'cat' OS-dependent bug in StreamExpressionTest.
This commit is contained in:
parent
ed137dbe28
commit
2eb493d170
|
@ -102,7 +102,7 @@ New Features
|
|||
|
||||
* SOLR-13553: Node level custom RequestHandlers (noble)
|
||||
|
||||
* SOLR-13622: Add files() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
|
||||
* SOLR-13622: Add cat() stream source to create tuples from lines in local files (Jason Gerlowski and Joel Bernstein)
|
||||
|
||||
* SOLR-11866: QueryElevationComponent can have query rules configured with match="subset" wherein the words need only
|
||||
match a subset of the query's words and in any order. (Bruno Roustant via David Smiley)
|
||||
|
|
|
@ -46,7 +46,7 @@ import org.apache.solr.core.SolrResourceLoader;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class FilesStream extends TupleStream implements Expressible {
|
||||
public class CatStream extends TupleStream implements Expressible {
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
private final String commaDelimitedFilepaths;
|
||||
|
@ -60,11 +60,11 @@ public class FilesStream extends TupleStream implements Expressible {
|
|||
private CrawlFile currentFilePath;
|
||||
private LineIterator currentFileLines;
|
||||
|
||||
public FilesStream(StreamExpression expression, StreamFactory factory) throws IOException {
|
||||
public CatStream(StreamExpression expression, StreamFactory factory) throws IOException {
|
||||
this(factory.getValueOperand(expression, 0), factory.getIntOperand(expression, "maxLines", -1));
|
||||
}
|
||||
|
||||
public FilesStream(String commaDelimitedFilepaths, int maxLines) {
|
||||
public CatStream(String commaDelimitedFilepaths, int maxLines) {
|
||||
if (commaDelimitedFilepaths == null) {
|
||||
throw new IllegalArgumentException("No filepaths provided to stream");
|
||||
}
|
|
@ -34,7 +34,7 @@ public class SolrDefaultStreamFactory extends DefaultStreamFactory {
|
|||
public SolrDefaultStreamFactory() {
|
||||
super();
|
||||
this.withFunctionName("analyze", AnalyzeEvaluator.class);
|
||||
this.withFunctionName("files", FilesStream.class);
|
||||
this.withFunctionName("cat", CatStream.class);
|
||||
this.withFunctionName("classify", ClassifyStream.class);
|
||||
this.withFunctionName("haversineMeters", HaversineMetersEvaluator.class);
|
||||
}
|
||||
|
|
|
@ -216,31 +216,31 @@ features(collection1,
|
|||
numTerms=250)
|
||||
----
|
||||
|
||||
== files
|
||||
== cat
|
||||
|
||||
The `files` function reads the specified files or directories and emits each line in the file(s) as a tuple.
|
||||
The `cat` function reads the specified files or directories and emits each line in the file(s) as a tuple.
|
||||
|
||||
Each emitted tuple contains two fields: `file` and `line`. `file` contains the path of the file being read, relative to the `userfiles` chroot (directly under `$SOLR_HOME`), and `line` contains a line in that file.
|
||||
|
||||
`files` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
|
||||
`cat` is ideally used with the `update` stream to index data from the specified documents, or with the `analyze` stream to further split the lines into individual tokens for statistical processing or visualization.
|
||||
|
||||
=== files Parameters
|
||||
=== cat Parameters
|
||||
|
||||
* `filePaths`: (Mandatory) a comma-separated list of filepaths to read lines from. If the specified path is a directory, it will be crawled recursively and all contained files will be read. To prevent malicious users from reading arbitrary files from Solr nodes, `filePaths` must be a relative path measured from a chroot of `$SOLR_HOME/userfiles` on the node running the streaming expression.
|
||||
* `maxLines`: (defaults to -1) The maximum number of lines to read (and tuples to emit). If a negative value is specified, all lines in the specified files will be emitted as tuples. Files are read in the order that they appear in the comma-separated `filePaths` argument. If the line limit is hit, the files that appear later in `filePaths` are the ones that will be partially emitted or not read at all.
|
||||
|
||||
=== files Examples
|
||||
=== cat Examples
|
||||
|
||||
The following example emits all lines from a single text file located at `$SOLR_HOME/userfiles/authors.txt`:
|
||||
[source,text]
|
||||
----
|
||||
files("authors.txt")
|
||||
cat("authors.txt")
|
||||
----
|
||||
|
||||
This example will read lines from `$SOLR_HOME/userfiles/authors.txt`, as well as all files (recursively) found under `$SOLR_HOME/userfiles/fiction/scifi`. Only 500 lines will be emitted, meaning that some files may be partially emitted or not read at all:
|
||||
[source,text]
|
||||
----
|
||||
files("authors.txt,fiction/scifi/", maxLines=500)
|
||||
cat("authors.txt,fiction/scifi/", maxLines=500)
|
||||
----
|
||||
|
||||
== nodes
|
||||
|
|
|
@ -3064,10 +3064,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testFileStreamSingleFile() throws Exception {
|
||||
final String fileStream = "files(\"topLevel1.txt\")";
|
||||
public void testCatStreamSingleFile() throws Exception {
|
||||
final String catStream = "cat(\"topLevel1.txt\")";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", fileStream);
|
||||
paramsLoc.set("expr", catStream);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||
|
||||
|
@ -3086,10 +3086,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testFileStreamMaxLines() throws Exception {
|
||||
final String fileStream = "files(\"topLevel1.txt\", maxLines=2)";
|
||||
public void testCatStreamMaxLines() throws Exception {
|
||||
final String catStream = "cat(\"topLevel1.txt\", maxLines=2)";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", fileStream);
|
||||
paramsLoc.set("expr", catStream);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||
|
||||
|
@ -3108,10 +3108,10 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testFileStreamDirectoryCrawl() throws Exception {
|
||||
final String fileStream = "files(\"directory1\")";
|
||||
public void testCatStreamDirectoryCrawl() throws Exception {
|
||||
final String catStream = "cat(\"directory1\")";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", fileStream);
|
||||
paramsLoc.set("expr", catStream);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||
|
||||
|
@ -3122,24 +3122,26 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertEquals(8, tuples.size());
|
||||
|
||||
final String expectedSecondLevel1Path = "directory1" + File.separator + "secondLevel1.txt";
|
||||
for (int i = 0; i < 4; i++) {
|
||||
Tuple t = tuples.get(i);
|
||||
assertEquals("secondLevel1.txt line " + String.valueOf(i+1), t.get("line"));
|
||||
assertEquals("directory1/secondLevel1.txt", t.get("file"));
|
||||
assertEquals(expectedSecondLevel1Path, t.get("file"));
|
||||
}
|
||||
|
||||
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
|
||||
for (int i = 4; i < 8; i++) {
|
||||
Tuple t = tuples.get(i);
|
||||
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
||||
assertEquals("directory1/secondLevel2.txt", t.get("file"));
|
||||
assertEquals(expectedSecondLevel2Path, t.get("file"));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFileStreamMultipleExplicitFiles() throws Exception {
|
||||
final String fileStream = "files(\"topLevel1.txt,directory1/secondLevel2.txt\")";
|
||||
public void testCatStreamMultipleExplicitFiles() throws Exception {
|
||||
final String catStream = "cat(\"topLevel1.txt,directory1" + File.separator + "secondLevel2.txt\")";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", fileStream);
|
||||
paramsLoc.set("expr", catStream);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+FILESTREAM_COLLECTION;
|
||||
|
||||
|
@ -3156,10 +3158,11 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
assertEquals("topLevel1.txt", t.get("file"));
|
||||
}
|
||||
|
||||
final String expectedSecondLevel2Path = "directory1" + File.separator + "secondLevel2.txt";
|
||||
for (int i = 4; i < 8; i++) {
|
||||
Tuple t = tuples.get(i);
|
||||
assertEquals("secondLevel2.txt line " + String.valueOf(i - 3), t.get("line"));
|
||||
assertEquals("directory1/secondLevel2.txt", t.get("file"));
|
||||
assertEquals(expectedSecondLevel2Path, t.get("file"));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue