mirror of https://github.com/apache/lucene.git
Get rid of inefficient Stream.count() (#12975)
commit 57b104e806
parent 9c9949b2bc
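This commit removes uses of Stream.count() that only feed a comparison against zero. count() is a terminal operation that in general has to consume the whole stream (and, with Files.list, read every directory entry), whereas short-circuiting operations such as findAny(), anyMatch() and noneMatch() stop as soon as the answer is known. A minimal standalone sketch of the pattern; the class and variable names below are illustrative and do not appear in the commit:

    import java.util.List;

    public class CountVsShortCircuit {
      public static void main(String[] args) {
        List<Integer> numbers = List.of(1, 2, 3, 4, 5);

        // Traverses every remaining element just to compare the total against zero.
        boolean anySlow = numbers.stream().filter(n -> n > 2).count() > 0;

        // Short-circuits: stops at the first element that matches.
        boolean anyFast = numbers.stream().anyMatch(n -> n > 2);

        // Same idea for asserting that nothing matches.
        boolean noneNegative = numbers.stream().noneMatch(n -> n < 0);

        System.out.println(anySlow + " " + anyFast + " " + noneNegative); // true true true
      }
    }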
@@ -34,8 +34,8 @@ import java.util.stream.Stream;
  * Title, Date, Dateline, Body
  */
 public class ExtractReuters {
-  private Path reutersDir;
-  private Path outputDir;
+  private final Path reutersDir;
+  private final Path outputDir;
 
   public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
     this.reutersDir = reutersDir;
@@ -46,7 +46,7 @@ public class ExtractReuters {
     long count = 0;
     Files.createDirectories(outputDir);
     try (Stream<Path> files = Files.list(outputDir)) {
-      if (files.count() > 0) {
+      if (files.findAny().isPresent()) {
         throw new IOException("The output directory must be empty: " + outputDir);
       }
     }
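The emptiness check in extract() previously counted every entry returned by Files.list just to compare the total with zero; findAny().isPresent() returns after the first entry. A self-contained sketch of the same check, assuming a hypothetical requireEmpty helper (the surrounding directory handling is illustrative, not taken from the commit):

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.stream.Stream;

    public class EmptyDirCheck {
      // Throws if the given directory contains at least one entry.
      static void requireEmpty(Path dir) throws IOException {
        try (Stream<Path> files = Files.list(dir)) {
          if (files.findAny().isPresent()) {
            throw new IOException("The output directory must be empty: " + dir);
          }
        }
      }

      public static void main(String[] args) throws IOException {
        Path dir = Files.createTempDirectory("empty-dir-check");
        requireEmpty(dir); // passes: a fresh temp directory has no entries
      }
    }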
@@ -65,9 +65,9 @@ public class ExtractReuters {
   Pattern EXTRACTION_PATTERN =
       Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
 
-  private static String[] META_CHARS = {"&", "<", ">", "\"", "'"};
+  private static final String[] META_CHARS = {"&", "<", ">", "\"", "'"};
 
-  private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
+  private static final String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
 
   /** Override if you wish to change what is extracted */
   protected void extractFile(Path sgmFile) throws IOException {
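META_CHARS and META_CHARS_SERIALIZATIONS are parallel arrays mapping entity serializations (assumed here to be the standard forms &amp;, &lt;, &gt;, &quot;, &apos;) back to the literal characters they encode; marking them final documents that the mapping never changes after class initialization. A rough sketch of how such parallel arrays are typically applied, as an illustration rather than the exact Lucene implementation:

    public class UnescapeSketch {
      private static final String[] META_CHARS = {"&", "<", ">", "\"", "'"};
      private static final String[] META_CHARS_SERIALIZATIONS = {
        "&amp;", "&lt;", "&gt;", "&quot;", "&apos;"
      };

      // Replaces each serialized entity with its literal character.
      static String unescape(String text) {
        for (int i = 0; i < META_CHARS.length; i++) {
          text = text.replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
        }
        return text;
      }

      public static void main(String[] args) {
        System.out.println(unescape("Procter &amp; Gamble said &quot;yes&quot;"));
        // prints: Procter & Gamble said "yes"
      }
    }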
@@ -80,7 +80,7 @@ public class ExtractReuters {
       while ((line = reader.readLine()) != null) {
         // when we see a closing reuters tag, flush the file
 
-        if (line.indexOf("</REUTERS") == -1) {
+        if (line.contains("</REUTERS") == false) {
           // Replace the SGM escape sequences
 
           buffer.append(line).append(' '); // accumulate the strings for now,
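line.indexOf("</REUTERS") == -1 and line.contains("</REUTERS") == false are equivalent; contains simply states the intent directly, and the explicit == false comparison follows the Lucene convention of spelling out negation. For example:

    public class ContainsVsIndexOf {
      public static void main(String[] args) {
        String line = "<REUTERS TOPICS=\"YES\">";
        // Old form: substring search expressed through an index sentinel.
        boolean notClosingOld = line.indexOf("</REUTERS") == -1;
        // New form: the same check, stated as a containment test.
        boolean notClosingNew = line.contains("</REUTERS") == false;
        System.out.println(notClosingOld == notClosingNew); // prints: true
      }
    }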
@@ -86,8 +86,8 @@ public final class FlattenGraphFilter extends TokenFilter {
   }
 
   /**
-   * Gathers up merged input positions into a single output position, only for the current
-   * "frontier" of nodes we've seen but can't yet output because they are not frozen.
+   * Gathers merged input positions into a single output position, only for the current "frontier"
+   * of nodes we've seen but can't yet output because they are not frozen.
    */
   private static final class OutputNode implements RollingBuffer.Resettable {
     private final List<Integer> inputNodes = new ArrayList<>();
@@ -115,7 +115,7 @@ public final class FlattenGraphFilter extends TokenFilter {
   }
 
   private final RollingBuffer<InputNode> inputNodes =
-      new RollingBuffer<InputNode>() {
+      new RollingBuffer<>() {
         @Override
         protected InputNode newInstance() {
           return new InputNode();
@@ -123,7 +123,7 @@ public final class FlattenGraphFilter extends TokenFilter {
       };
 
   private final RollingBuffer<OutputNode> outputNodes =
-      new RollingBuffer<OutputNode>() {
+      new RollingBuffer<>() {
         @Override
         protected OutputNode newInstance() {
           return new OutputNode();
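Both RollingBuffer instantiations drop the explicit type argument: since Java 9 the diamond operator may be used with anonymous classes, so the compiler infers InputNode and OutputNode from the declared field types. A generic illustration using a hypothetical Buffer class as a stand-in for Lucene's RollingBuffer:

    import java.util.ArrayList;
    import java.util.List;

    public class DiamondAnonymous {
      // Minimal stand-in for a buffer that knows how to create its own elements.
      abstract static class Buffer<T> {
        final List<T> items = new ArrayList<>();

        abstract T newInstance();

        T add() {
          T item = newInstance();
          items.add(item);
          return item;
        }
      }

      // Java 9+: the diamond infers Buffer<StringBuilder> for the anonymous subclass.
      private static final Buffer<StringBuilder> buffer =
          new Buffer<>() {
            @Override
            StringBuilder newInstance() {
              return new StringBuilder();
            }
          };

      public static void main(String[] args) {
        buffer.add().append("hello");
        System.out.println(buffer.items.get(0)); // prints: hello
      }
    }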
@@ -193,10 +193,10 @@ public final class FlattenGraphFilter extends TokenFilter {
               + " vs output.inputNodes.size()="
               + output.inputNodes.size();
       InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
-      if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
+      if (done && inputNode.tokens.isEmpty() && outputFrom >= outputNodes.getMaxPos()) {
         return false;
       }
-      if (inputNode.tokens.size() == 0) {
+      if (inputNode.tokens.isEmpty()) {
         assert inputNode.nextOut == 0;
         // Hole dest nodes should never be merged since 1) we always
         // assign them to a new output position, and 2) since they never
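tokens.isEmpty() replaces tokens.size() == 0. The result is identical for a List, but isEmpty() reads as a direct statement of intent, and for some collection types (for example ConcurrentLinkedQueue, whose size() traverses the whole queue) it is also cheaper. A small illustration:

    import java.util.concurrent.ConcurrentLinkedQueue;

    public class IsEmptyVsSize {
      public static void main(String[] args) {
        ConcurrentLinkedQueue<String> tokens = new ConcurrentLinkedQueue<>();

        // Equivalent results, but size() walks the whole queue for this class,
        // while isEmpty() only inspects the head.
        System.out.println(tokens.size() == 0); // true
        System.out.println(tokens.isEmpty()); // true

        tokens.add("token");
        System.out.println(tokens.isEmpty()); // false
      }
    }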
@@ -271,7 +271,7 @@ public final class FlattenGraphFilter extends TokenFilter {
    * @param output target output node
    */
   private void freeBefore(OutputNode output) {
-    /* We've released all of the tokens that end at the current output, so free all output nodes before this.
+    /* We've released all the tokens that end at the current output, so free all output nodes before this.
     Input nodes are more complex. The second shingled tokens with alternate paths can appear later in the output graph
     than some of their alternate path tokens. Because of this case we can only free from the minimum because
     the minimum node will have come from before the second shingled token.
@@ -283,7 +283,7 @@ public final class FlattenGraphFilter extends TokenFilter {
     int freeBefore = Collections.min(output.inputNodes);
     // This will catch a node being freed early if it is input to the next output.
     // Could a freed early node be input to a later output?
-    assert outputNodes.get(outputFrom).inputNodes.stream().filter(n -> freeBefore > n).count() == 0
+    assert outputNodes.get(outputFrom).inputNodes.stream().noneMatch(n -> freeBefore > n)
         : "FreeBefore " + freeBefore + " will free in use nodes";
     inputNodes.freeBefore(freeBefore);
     outputNodes.freeBefore(outputFrom);
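The assertion previously materialized a full count of offending input nodes only to compare it with zero; noneMatch expresses the invariant directly and stops at the first counter-example. A sketch of the difference, with an illustrative list and threshold in place of the filter's real state:

    import java.util.List;

    public class NoneMatchSketch {
      public static void main(String[] args) {
        List<Integer> inputNodes = List.of(5, 7, 9);
        int freeBefore = 4;

        // Old shape: filter everything, count the matches, compare with zero.
        boolean okCounted = inputNodes.stream().filter(n -> freeBefore > n).count() == 0;

        // New shape: short-circuits as soon as one node would be freed too early.
        boolean okNoneMatch = inputNodes.stream().noneMatch(n -> freeBefore > n);

        System.out.println(okCounted + " " + okNoneMatch); // prints: true true
      }
    }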