fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;

Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-09-07 12:12:17 +00:00
parent 36bd0ba1a3
commit a45d61c740
8 changed files with 188 additions and 57 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta5" date="2011-??-??">
<action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>

View File

@ -101,18 +101,21 @@ public class CHPBinTable
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
pageOffset, translator);
int fkpSize = cfkp.size();
for (int y = 0; y < fkpSize; y++)
{
final CHPX chpx = cfkp.getCHPX(y);
if (chpx != null)
_textRuns.add(chpx);
}
for ( CHPX chpx : cfkp.getCHPXs() )
{
if ( chpx != null )
_textRuns.add( chpx );
}
}
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _textRuns.size() ), " elements)" );
if ( _textRuns.isEmpty() )
{
logger.log( POILogger.WARN, "CHPX FKPs are empty" );
_textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
}
}
public void rebuild( ComplexFileTable complexFileTable )

View File

@ -18,6 +18,7 @@
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
int charStartAt = translator.getCharIndex( bytesStartAt );
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
// int charStartAt = translator.getCharIndex( bytesStartAt );
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
// );
// TODO: CHECK!
// CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
// ) );
CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
getGrpprl( x ), 0 ) );
_chpxList.add( chpx );
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
bytesEndAt ) )
{
CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
getGrpprl( x ), 0 ) );
_chpxList.add( chpx );
}
}
}
@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
return _chpxList.get(index);
}
/**
 * Exposes the CHPX entries held by this page without allowing callers to
 * modify the underlying list.
 *
 * @return an unmodifiable list backed by this page's CHPX list
 */
public List<CHPX> getCHPXs()
{
    final List<CHPX> readOnlyChpxs = Collections
            .unmodifiableList( _chpxList );
    return readOnlyChpxs;
}
public void fill(List<CHPX> filler)
{
_chpxList.addAll(filler);

View File

@ -31,12 +31,16 @@ public interface CharIndexTranslator {
int getByteIndex( int charPos );
/**
* Calculates the char index of the given byte index.
* Look forward if index is not in table
*
* @param bytePos The character offset to check
* Calculates the char index of the given byte index. Look forward if index
* is not in table
*
* @param bytePos
* The byte offset to check
* @return the char index
* @deprecated This API was based on the incorrect assumption that a single
* byte offset corresponds to a single char offset
*/
@Deprecated
int getCharIndex(int bytePos);
/**
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
* @param bytePos The byte offset to check
* @param startCP look from this character position
* @return the char index
* @deprecated This API was based on the incorrect assumption that a single
* byte offset corresponds to a single char offset
*/
@Deprecated
int getCharIndex(int bytePos, int startCP);
/**
* Finds the character ranges that correspond to the specified byte range.
* The result is an array of ranges rather than a single pair, so one byte
* range may be reported as several character ranges.
*
* @param startBytePosInclusive
* first byte offset of the range (inclusive)
* @param endBytePosExclusive
* byte offset just past the end of the range (exclusive)
* @return the character ranges, one int[] per range. NOTE(review): the
* TextPieceTable implementation fills each element as
* { startCp, endCp } - confirm that endCp is meant to be exclusive
*/
int[][] getCharIndexRanges( int startBytePosInclusive,
int endBytePosExclusive );
/**
* Check if index is in table
*
*
* @param bytePos
* @return true if index in table, false if not
*/
boolean isIndexInTable(int bytePos);
/**

View File

@ -92,12 +92,8 @@ public class PAPBinTable
documentStream, dataStream, pageOffset,
charIndexTranslator );
int fkpSize = pfkp.size();
for ( int y = 0; y < fkpSize; y++ )
for ( PAPX papx : pfkp.getPAPXs() )
{
PAPX papx = pfkp.getPAPX( y );
if ( papx != null )
_paragraphs.add( papx );
}
@ -107,6 +103,12 @@ public class PAPBinTable
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" );
if ( _paragraphs.isEmpty() )
{
logger.log( POILogger.WARN, "PAPX FKPs are empty" );
_paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
}
}
public void rebuild( final StringBuilder docText,

View File

@ -23,6 +23,8 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
int charStartAt = translator.getCharIndex( bytesStartAt );
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
// int charStartAt = translator.getCharIndex( bytesStartAt );
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
// );
// PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
// getParagraphHeight( x ), dataStream );
// _papxList.add( papx );
PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
getParagraphHeight( x ), dataStream );
_papxList.add( papx );
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
bytesEndAt ) )
{
PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
getParagraphHeight( x ), dataStream );
_papxList.add( papx );
}
}
_fkp = null;
}

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
// And now build the piece
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
pieces[x] ) );
final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
pieces[x] );
_textPieces.add( newTextPiece );
}
// In the interest of our sanity, now sort the text pieces
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
return byteCount;
}
/**
 * Translates a byte offset into a character index by delegating to
 * {@link #getCharIndex(int, int)} with a start character position of 0.
 *
 * @param bytePos
 *            byte offset to translate
 * @return the character index reported by the two-argument overload
 */
@Deprecated
public int getCharIndex( int bytePos )
{
    final int searchFromCp = 0;
    return getCharIndex( bytePos, searchFromCp );
}
@Deprecated
public int getCharIndex( int startBytePos, int startCP )
{
int charCount = 0;
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
return charCount;
}
/**
 * Translates a byte range into the character ranges it covers, producing
 * one range for every text piece that overlaps the requested bytes.
 * <p>
 * Both the requested range and each piece's byte extent are half-open
 * (start inclusive, end exclusive). A piece that merely touches the
 * requested range at a boundary therefore shares no byte with it and
 * contributes no range.
 * <p>
 * Pieces are visited in the order of {@code _textPiecesFCOrder}; the early
 * exit below assumes that list is sorted by ascending file position
 * (presumably what "FC order" means - verify against where it is built).
 *
 * @param startBytePosInclusive
 *            first byte offset of the range (inclusive)
 * @param endBytePosExclusive
 *            byte offset just past the end of the range (exclusive)
 * @return one { startCp, endCp } pair per overlapping text piece, in
 *         iteration order; an empty array when no piece overlaps
 */
public int[][] getCharIndexRanges( int startBytePosInclusive,
        int endBytePosExclusive )
{
    // only appended to and then copied out, so a plain array list suffices
    List<int[]> result = new ArrayList<int[]>();
    for ( TextPiece textPiece : _textPiecesFCOrder )
    {
        final int tpStart = textPiece.getPieceDescriptor()
                .getFilePosition();
        final int tpEnd = tpStart + textPiece.bytesLength();

        // tpEnd is exclusive: a range starting exactly at tpEnd lies
        // entirely after this piece
        if ( startBytePosInclusive >= tpEnd )
            continue;

        // the requested end is exclusive: a range ending exactly at tpStart
        // lies entirely before this piece and every later one
        if ( endBytePosExclusive <= tpStart )
            break;

        final int rangeStartBytes = Math.max( tpStart,
                startBytePosInclusive );
        final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
        final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;

        // guards against an inverted request (start after end)
        if ( rangeStartBytes > rangeEndBytes )
            continue;

        // unicode pieces use two bytes per character, others one
        final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;

        final int rangeStartCp = textPiece.getStart()
                + ( rangeStartBytes - tpStart ) / encodingMultiplier;
        final int rangeEndCp = rangeStartCp + rangeLengthBytes
                / encodingMultiplier;

        result.add( new int[] { rangeStartCp, rangeEndCp } );
    }

    return result.toArray( new int[result.size()][] );
}
public int getCpMin()
{
return _cpMin;
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
/**
 * Moves a byte position forward to the start of a text piece when needed.
 * Returns the first piece's file position when it lies after
 * startBytePos, returns startBytePos unchanged when the last piece starts
 * at or before it, and otherwise searches the pieces by file position.
 *
 * NOTE(review): this listing appears to interleave the statements of two
 * versions of the body - a for-each scan using tp / pieceStart / bytePos
 * and a binary search using low / high / mid. Confirm which lines belong to
 * the current version before changing any code here.
 *
 * @param startBytePos
 *            byte position to start looking from
 * @return a byte position at or after startBytePos, per the rules above
 * @throws IllegalStateException
 *             if the text pieces table is empty
 */
public int lookIndexForward( final int startBytePos )
{
int bytePos = startBytePos;
for ( TextPiece tp : _textPiecesFCOrder )
if ( _textPiecesFCOrder.isEmpty() )
throw new IllegalStateException( "Text pieces table is empty" );
// before the first piece: snap forward to the first piece start
if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
// the last piece starts at or before the position: nothing later to snap to
if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
.getPieceDescriptor().getFilePosition() <= startBytePos )
return startBytePos;
// binary search over piece start positions
int low = 0;
int high = _textPiecesFCOrder.size() - 1;
while ( low <= high )
{
int pieceStart = tp.getPieceDescriptor().getFilePosition();
// unsigned shift keeps the midpoint correct even if low + high overflows
int mid = ( low + high ) >>> 1;
final TextPiece textPiece = _textPiecesFCOrder.get( mid );
int midVal = textPiece.getPieceDescriptor().getFilePosition();
if ( bytePos >= pieceStart + tp.bytesLength() )
{
continue;
}
if ( pieceStart > bytePos )
{
bytePos = pieceStart;
}
break;
if ( midVal < startBytePos )
low = mid + 1;
else if ( midVal > startBytePos )
high = mid - 1;
else
// found piece with exact start
return textPiece.getPieceDescriptor().getFilePosition();
}
return bytePos;
// NOTE(review): if the while loop above is left because its condition
// low <= high became false, then low > high here, so the next assertion
// cannot hold when assertions are enabled. In that case the piece at index
// high (not low) would be the one starting before startBytePos - confirm
// the intended invariant and which index the final return should use.
assert low == high;
assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
.getFilePosition() < startBytePos;
// last line can't be current, can it?
assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
.getFilePosition() > startBytePos;
// shifting to next piece start
return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
}
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException

View File

@ -226,6 +226,36 @@ public class TestBugs extends TestCase
assertEquals( extractor1.getText(), extractor2.getText() );
}
/**
 * Bug 44431 - HWPFDocument.write destroys fields. Opens the Bug44431.doc
 * sample and compares the text returned by WordExtractor with the expected
 * text, line by line.
 */
public void test44431_2()
{
    final String expectedText = "File name=FieldsTest.doc\n" +
            "\n" +
            "\n" +
            "STYLEREF test\n" +
            "\n" +
            "\n" +
            "\n" +
            "TEST TABLE OF CONTENTS\n" +
            "\n" +
            "Heading paragraph in next page\t2\n" +
            "Another heading paragraph in further page\t3\n" +
            "Another heading paragraph in further page\t3\n" +
            "\n" +
            "\n" +
            "Heading paragraph in next page\n" +
            "Another heading paragraph in further page\n" +
            "\n" +
            "\n" +
            "\n" +
            "Page 3 of 3";

    final HWPFDocument sampleDocument = HWPFTestDataSamples
            .openSampleFile( "Bug44431.doc" );
    final WordExtractor textExtractor = new WordExtractor( sampleDocument );
    final String actualText = textExtractor.getText();

    assertEquals( expectedText, actualText );
}
/**
* Bug 45473 - HWPF cannot read file after save
*/
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
hwpfDocument.write( new ByteArrayOutputStream() );
}
/**
* Bug 51678 - Extracting text from Bug51524.zip is slow
* Bug 51524 - PapBinTable constructor is slow
* Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
* PapBinTable constructor is slow
*/
public void test51678And51524()
{
// YK: the test will run only if the poi.test.remote system property is set.
// YK: the test will run only if the poi.test.remote system property is
// set.
// TODO: refactor into something nicer!
if(System.getProperty("poi.test.remote") != null) {
if ( System.getProperty( "poi.test.remote" ) != null )
{
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
HWPFDocument hwpfDocument = HWPFTestDataSamples
.openRemoteFile( href );
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
wordExtractor.getText();