mirror of https://github.com/apache/lucene.git
Checking in simple performance test
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@605957 13f79535-47bb-0310-9956-ffa450edef68
parent 52d307607c
commit 7a3a61e45d
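This commit adds a testPerformance method to TeeSinkTokenTest: it compares analyzing a field a second time against teeing tokens into a sink during a single analysis pass. For orientation, the following is a minimal standalone sketch of the Tee/Sink pattern the test exercises, written against the pre-2.9 Token API visible in the diff; the class name TeeSinkSketch and the sample text are illustrative only, not part of the commit.

    import java.io.StringReader;

    import org.apache.lucene.analysis.SinkTokenizer;
    import org.apache.lucene.analysis.TeeTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    // Illustrative sketch, not part of this commit: a TeeTokenFilter passes
    // every token through unchanged while also offering it to its sink, so a
    // single analysis pass can feed two consumers.
    public class TeeSinkSketch {
      public static void main(String[] args) throws Exception {
        // Sink that keeps only "the" tokens, mirroring sink1 in the test below.
        SinkTokenizer sink = new SinkTokenizer(null) {
          public void add(Token t) {
            if (t != null && t.termText().equalsIgnoreCase("the")) {
              super.add(t);
            }
          }
        };
        TokenStream source = new TeeTokenFilter(
            new WhitespaceTokenizer(new StringReader("The quick fox and the hound")), sink);
        Token token;
        // Drain the main stream; the tee diverts matching tokens as a side effect.
        while ((token = source.next()) != null) {
          System.out.println("source: " + token.termText());
        }
        // The sink now replays only the captured "The"/"the" tokens.
        while ((token = sink.next()) != null) {
          System.out.println("sink: " + token.termText());
        }
      }
    }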
@@ -17,9 +17,14 @@ package org.apache.lucene.analysis;
  */
 
 import junit.framework.TestCase;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.English;
 
-import java.io.StringReader;
 import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * tests for the TeeTokenFilter and SinkTokenizer
@@ -56,9 +61,9 @@ public class TeeSinkTokenTest extends TestCase {
 
   public void test() throws IOException {
 
-    SinkTokenizer sink1 = new SinkTokenizer(null){
+    SinkTokenizer sink1 = new SinkTokenizer(null) {
       public void add(Token t) {
-        if (t != null && t.termText().equalsIgnoreCase("The")){
+        if (t != null && t.termText().equalsIgnoreCase("The")) {
           super.add(t);
         }
       }
@@ -66,14 +71,14 @@ public class TeeSinkTokenTest extends TestCase {
     TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
     Token token = null;
     int i = 0;
-    while ((token = source.next()) != null){
+    while ((token = source.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
       i++;
     }
     assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
     assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
     i = 0;
-    while ((token = sink1.next()) != null){
+    while ((token = sink1.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
       i++;
     }
@@ -81,16 +86,16 @@ public class TeeSinkTokenTest extends TestCase {
   }
 
   public void testMultipleSources() throws Exception {
-    SinkTokenizer theDetector = new SinkTokenizer(null){
+    SinkTokenizer theDetector = new SinkTokenizer(null) {
       public void add(Token t) {
-        if (t != null && t.termText().equalsIgnoreCase("The")){
+        if (t != null && t.termText().equalsIgnoreCase("The")) {
           super.add(t);
         }
       }
     };
-    SinkTokenizer dogDetector = new SinkTokenizer(null){
+    SinkTokenizer dogDetector = new SinkTokenizer(null) {
       public void add(Token t) {
-        if (t != null && t.termText().equalsIgnoreCase("Dogs")){
+        if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
           super.add(t);
         }
       }
@@ -99,7 +104,7 @@ public class TeeSinkTokenTest extends TestCase {
     TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
     Token token = null;
     int i = 0;
-    while ((token = source1.next()) != null){
+    while ((token = source1.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
       i++;
     }
@@ -107,7 +112,7 @@ public class TeeSinkTokenTest extends TestCase {
     assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
     assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
     i = 0;
-    while ((token = source2.next()) != null){
+    while ((token = source2.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]) == true);
       i++;
     }
@@ -115,13 +120,13 @@ public class TeeSinkTokenTest extends TestCase {
     assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
     assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
     i = 0;
-    while ((token = theDetector.next()) != null){
+    while ((token = theDetector.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
       i++;
     }
     assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
     i = 0;
-    while ((token = dogDetector.next()) != null){
+    while ((token = dogDetector.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs") == true);
       i++;
     }
@@ -129,10 +134,130 @@ public class TeeSinkTokenTest extends TestCase {
     source1.reset();
     TokenStream lowerCasing = new LowerCaseFilter(source1);
     i = 0;
-    while ((token = lowerCasing.next()) != null){
+    while ((token = lowerCasing.next()) != null) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()) == true);
       i++;
     }
     assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
   }
-}
+  /**
+   * Not an explicit test, just useful to print out some info on performance
+   *
+   * @throws Exception
+   */
+  public void testPerformance() throws Exception {
+    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
+    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
+    for (int k = 0; k < tokCount.length; k++) {
+      StringBuffer buffer = new StringBuffer();
+      System.out.println("-----Tokens: " + tokCount[k] + "-----");
+      for (int i = 0; i < tokCount[k]; i++) {
+        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
+      }
+      //make sure we produce the same tokens
+      ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
+      Token next = new Token();
+      TokenStream result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
+      while ((next = result.next(next)) != null) {
+      }
+      result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
+      next = new Token();
+      List tmp = new ArrayList();
+      while ((next = result.next(next)) != null) {
+        tmp.add(next.clone());
+      }
+      List sinkList = sink.getTokens();
+      assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
+      for (int i = 0; i < tmp.size(); i++) {
+        Token tfTok = (Token) tmp.get(i);
+        Token sinkTok = (Token) sinkList.get(i);
+        assertTrue(tfTok.termText() + " is not equal to " + sinkTok.termText() + " at token: " + i, tfTok.termText().equals(sinkTok.termText()) == true);
+      }
+      //simulate two fields, each being analyzed once, for 20 documents
+
+      for (int j = 0; j < modCounts.length; j++) {
+        int tfPos = 0;
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < 20; i++) {
+          next = new Token();
+          result = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
+          while ((next = result.next(next)) != null) {
+            tfPos += next.getPositionIncrement();
+          }
+          next = new Token();
+          result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
+          while ((next = result.next(next)) != null) {
+            tfPos += next.getPositionIncrement();
+          }
+        }
+        long finish = System.currentTimeMillis();
+        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
+        int sinkPos = 0;
+        //simulate one field with one sink
+        start = System.currentTimeMillis();
+        for (int i = 0; i < 20; i++) {
+          sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
+          next = new Token();
+          result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
+          while ((next = result.next(next)) != null) {
+            sinkPos += next.getPositionIncrement();
+          }
+          //System.out.println("Modulo--------");
+          result = sink;
+          while ((next = result.next(next)) != null) {
+            sinkPos += next.getPositionIncrement();
+          }
+        }
+        finish = System.currentTimeMillis();
+        System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
+        assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
+
+      }
+      System.out.println("- End Tokens: " + tokCount[k] + "-----");
+    }
+
+  }
+
+
+  class ModuloTokenFilter extends TokenFilter {
+
+    int modCount;
+
+    ModuloTokenFilter(TokenStream input, int mc) {
+      super(input);
+      modCount = mc;
+    }
+
+    int count = 0;
+
+    //return every 100 tokens
+    public Token next(Token result) throws IOException {
+
+      while ((result = input.next(result)) != null && count % modCount != 0) {
+        count++;
+      }
+      count++;
+      return result;
+    }
+  }
+
+  class ModuloSinkTokenizer extends SinkTokenizer {
+    int count = 0;
+    int modCount;
+
+
+    ModuloSinkTokenizer(int numToks, int mc) {
+      modCount = mc;
+      lst = new ArrayList(numToks % mc);
+    }
+
+    public void add(Token t) {
+      if (t != null && count % modCount == 0) {
+        lst.add(t.clone());
+      }
+      count++;
+    }
+  }
+}
+
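Both helper classes above keep every modCount-th token via the same count % modCount == 0 rule, so the tee'd sink and the second-pass ModuloTokenFilter should replay identical token sequences; the tfPos/sinkPos assertions verify this through position increments. (The "//return every 100 tokens" comment reflects the hard-coded 100 used in the equality check; the filter actually returns every modCount-th token. Note also that the sink's ArrayList is sized with numToks % mc, where numToks / mc would match the expected number of retained tokens; ArrayList grows on demand, so only the initial capacity is affected.) A standalone illustration of the sampling rule, using a hypothetical ModuloSampleSketch class that is not part of the commit:

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative sketch, not part of this commit: the sampling rule shared by
    // ModuloTokenFilter and ModuloSinkTokenizer keeps items 0, mc, 2*mc, ...
    public class ModuloSampleSketch {
      static List sample(String[] tokens, int modCount) {
        List kept = new ArrayList();
        for (int count = 0; count < tokens.length; count++) {
          if (count % modCount == 0) {
            kept.add(tokens[count]);
          }
        }
        return kept;
      }

      public static void main(String[] args) {
        String[] toks = {"one", "two", "three", "four", "five", "six"};
        // Keeps indices 0 and 3, printing [one, four].
        System.out.println(sample(toks, 3));
      }
    }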