diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat index 2b1cedb59e2..538cd4c7bf2 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 6a3d104d178..bd98ef08711 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -69,6 +69,14 @@ class BytesStore extends DataOutput { nextWrite = blocks.get(blocks.size()-1).length; } + /** Absolute write byte; you must ensure dest is < max + * position written so far. */ + public void writeByte(int dest, byte b) { + int blockIndex = dest >> blockBits; + byte[] block = blocks.get(blockIndex); + block[dest & blockMask] = b; + } + @Override public void writeByte(byte b) { if (nextWrite == blockSize) { @@ -237,9 +245,10 @@ class BytesStore extends DataOutput { } } - /** Reverse the last numBytes. */ + /** Reverse from srcPos, inclusive, to destPos, inclusive. */ public void reverse(int srcPos, int destPos) { assert srcPos < destPos; + assert destPos < getPosition(); //System.out.println("reverse src=" + srcPos + " dest=" + destPos); int srcBlockIndex = srcPos >> blockBits; @@ -275,7 +284,7 @@ class BytesStore extends DataOutput { } } - public void skip(int len) { + public void skipBytes(int len) { while (len > 0) { int chunk = blockSize - nextWrite; if (len <= chunk) { @@ -294,6 +303,26 @@ class BytesStore extends DataOutput { return (blocks.size()-1) * blockSize + nextWrite; } + /** Pos must be less than the max position written so far! + * Ie, you cannot "grow" the file with this! */ + public void truncate(int newLen) { + assert newLen <= getPosition(); + assert newLen >= 0; + int blockIndex = newLen >> blockBits; + nextWrite = newLen & blockMask; + if (nextWrite == 0) { + blockIndex--; + nextWrite = blockSize; + } + blocks.subList(blockIndex+1, blocks.size()).clear(); + if (newLen == 0) { + current = null; + } else { + current = blocks.get(blockIndex); + } + assert newLen == getPosition(); + } + public void finish() { if (current != null) { byte[] lastBuffer = new byte[nextWrite]; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 97beefe6274..7d687487a44 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -730,7 +730,7 @@ public final class FST { int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { - bytes.skip(destPos - srcPos); + bytes.skipBytes(destPos - srcPos); for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; @@ -1444,9 +1444,6 @@ public final class FST { throw new IllegalArgumentException("this FST was not built with willPackFST=true"); } - final RAMOutputStream buffer = new RAMOutputStream(); - byte[] bufferBytes = new byte[64]; - Arc arc = new Arc(); final BytesReader r = getBytesReader(0); @@ -1553,7 +1550,7 @@ public final class FST { // this is an array'd node and bytesPerArc changes: writeNode: while(true) { // retry writing this node - assert buffer.getFilePointer() == 0; + //System.out.println(" cycle: retry"); readFirstRealTargetArc(node, arc, r); @@ -1563,9 +1560,9 @@ public final class FST { if (bytesPerArc == 0) { bytesPerArc = arc.bytesPerArc; } - buffer.writeByte(ARCS_AS_FIXED_ARRAY); - buffer.writeVInt(arc.numArcs); - buffer.writeVInt(bytesPerArc); + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(arc.numArcs); + writer.writeVInt(bytesPerArc); //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); } @@ -1574,7 +1571,7 @@ public final class FST { while(true) { // iterate over all arcs for this node //System.out.println(" cycle next arc"); - final int arcStartPos = (int) buffer.getFilePointer(); + final int arcStartPos = writer.getPosition(); nodeArcCount++; byte flags = 0; @@ -1621,7 +1618,7 @@ public final class FST { absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2); if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1637,23 +1634,23 @@ public final class FST { } assert flags != ARCS_AS_FIXED_ARRAY; - buffer.writeByte(flags); + writer.writeByte(flags); - fst.writeLabel(buffer, arc.label); + fst.writeLabel(writer, arc.label); if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, buffer); + outputs.write(arc.output, writer); if (!retry) { fst.arcWithOutputCount++; } } if (arc.nextFinalOutput != NO_OUTPUT) { - outputs.writeFinalOutput(arc.nextFinalOutput, buffer); + outputs.writeFinalOutput(arc.nextFinalOutput, writer); } if (doWriteTarget) { - int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address); + int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition()); if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1662,7 +1659,7 @@ public final class FST { if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); - buffer.writeVInt(delta); + writer.writeVInt(delta); if (!retry) { deltaCount++; } @@ -1674,7 +1671,7 @@ public final class FST { System.out.println(" abs"); } */ - buffer.writeVInt(absPtr); + writer.writeVInt(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; @@ -1686,7 +1683,7 @@ public final class FST { } if (useArcArray) { - final int arcBytes = (int) (buffer.getFilePointer() - arcStartPos); + final int arcBytes = writer.getPosition() - arcStartPos; //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if @@ -1696,11 +1693,7 @@ public final class FST { // will retry (below) so it's OK to ovewrite // bytes: //wasted += bytesPerArc - arcBytes; - int skip = (int) (arcStartPos + bytesPerArc - buffer.getFilePointer()); - while(skip > 0) { - buffer.writeByte((byte) 0); - skip--; - } + writer.skipBytes(arcStartPos + bytesPerArc - writer.getPosition()); } if (arc.isLast()) { @@ -1725,19 +1718,12 @@ public final class FST { // Retry: bytesPerArc = maxBytesPerArc; - buffer.reset(); + writer.truncate(address); nodeArcCount = 0; retry = true; anyNegDelta = false; } - if (bufferBytes.length < (int) buffer.getFilePointer()) { - bufferBytes = ArrayUtil.grow(bufferBytes, (int) buffer.getFilePointer()); - } - buffer.writeTo(bufferBytes, 0); - writer.writeBytes(bufferBytes, 0, (int) buffer.getFilePointer()); - buffer.reset(); - negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java index f3933d5a0fb..d6c504a7ff8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java @@ -42,7 +42,7 @@ public class TestBytesStore extends LuceneTestCase { int pos = 0; while(pos < numBytes) { - int op = random().nextInt(7); + int op = random().nextInt(8); if (VERBOSE) { System.out.println(" cycle pos=" + pos); } @@ -97,21 +97,21 @@ public class TestBytesStore extends LuceneTestCase { case 3: { // reverse bytes - if (pos > 0) { - int len = _TestUtil.nextInt(random(), 1, Math.min(100, pos)); + if (pos > 1) { + int len = _TestUtil.nextInt(random(), 2, Math.min(100, pos)); int start; if (len == pos) { start = 0; } else { start = random().nextInt(pos - len); } - int end = start + len; + int end = start + len - 1; if (VERBOSE) { - System.out.println(" reverse start=" + start + " end=" + end + " len=" + len); + System.out.println(" reverse start=" + start + " end=" + end + " len=" + len + " pos=" + pos); } bytes.reverse(start, end); - while(start < end) { + while(start <= end) { byte b = expected[end]; expected[end] = expected[start]; expected[start] = b; @@ -159,17 +159,49 @@ public class TestBytesStore extends LuceneTestCase { { // skip int len = random().nextInt(Math.min(100, numBytes - pos)); - pos += len; - bytes.skip(len); + if (VERBOSE) { System.out.println(" skip len=" + len); } + + pos += len; + bytes.skipBytes(len); + + // NOTE: must fill in zeros in case truncate was + // used, else we get false fails: + if (len > 0) { + byte[] zeros = new byte[len]; + bytes.writeBytes(pos-len, zeros, 0, len); + } } break; + + case 7: + { + // absWriteByte + if (pos > 0) { + int dest = random().nextInt(pos); + byte b = (byte) random().nextInt(256); + expected[dest] = b; + bytes.writeByte(dest, b); + } + break; + } } assertEquals(pos, bytes.getPosition()); + if (pos > 0 && random().nextInt(50) == 17) { + // truncate + int len = _TestUtil.nextInt(random(), 1, Math.min(pos, 100)); + bytes.truncate(pos - len); + pos -= len; + Arrays.fill(expected, pos, pos+len, (byte) 0); + if (VERBOSE) { + System.out.println(" truncate len=" + len + " newPos=" + pos); + } + } + if ((pos > 0 && random().nextInt(200) == 17)) { verify(bytes, expected, pos); } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index bed5bbf48c2..013557a50c9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -483,11 +483,14 @@ public class TestFSTs extends LuceneTestCase { break; } } - long t = System.currentTimeMillis() - tStart; - System.out.println((t / 1000.0) + " sec to build"); + + long tMid = System.currentTimeMillis(); + System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); assert builder.getTermCount() == ord; FST fst = builder.finish(); + long tEnd = System.currentTimeMillis(); + System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); if (fst == null) { System.out.println("FST was fully pruned!"); System.exit(0);