Big big unicode rationalisation in text piece code

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684319 13f79535-47bb-0310-9956-ffa450edef68
2008-08-09 19:34:38 +00:00 · 2008-08-09 19:34:38 +00:00 · c71c0851d5
parent 69a90eb5e9
commit c71c0851d5
6 changed files with 269 additions and 113 deletions
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java
@ -22,20 +22,22 @@ import java.util.Arrays;
 /**
 * Represents a lightweight node in the Trees used to store content
- * properties.
+ * properties. Works only in characters.
 *
 * @author Ryan Ackley
 */
 public abstract class PropertyNode implements Comparable, Cloneable
 {
  protected Object _buf;
  /** The start, in characters */
  private int _cpStart;
  /** The end, in characters */
  private int _cpEnd;
  /**
-   * @param fcStart The start of the text for this property.
+   * @param fcStart The start of the text for this property, in characters.
-   * @param fcEnd The end of the text for this property.
+   * @param fcEnd The end of the text for this property, in characters.
   * @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
   */
  protected PropertyNode(int fcStart, int fcEnd, Object buf)
@ -43,11 +45,10 @@ public abstract class PropertyNode implements Comparable, Cloneable
      _cpStart = fcStart;
      _cpEnd = fcEnd;
      _buf = buf;
  }
  /**
-   * @return The offset of this property's text.
+   * @return The start offset of this property's text.
   */
  public int getStart()
  {
@ -142,9 +143,4 @@ public abstract class PropertyNode implements Comparable, Cloneable
        return 1;
      }
  }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
@ -22,6 +22,9 @@ package org.apache.poi.hwpf.model;
 import java.io.UnsupportedEncodingException;
 /**
 * Lightweight representation of a text piece.
 * Works in the character domain, not the byte domain, so you
 *  need to have turned byte references into character
 *  references before getting here.
 *
 * @author Ryan Ackley
 */
@ -32,21 +35,43 @@ public class TextPiece extends PropertyNode implements Comparable
  private PieceDescriptor _pd;
-  private int _cpStart;
+  /**
   * @param start Beginning offset in main document stream, in characters.
   * @param end Ending offset in main document stream, in characters.
   * @param text The raw bytes of our text
   */
  public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
 	  // Decode the raw bytes into the character buffer held by the parent.
 	  // Note that cpStart is not read anywhere in this constructor body.
 	  super(start, end, buildInitSB(text, pd));
 	  _usesUnicode = pd.isUnicode();
 	  _pd = pd;
 	  // Validate. The negative size check has to come first: a buffer
 	  //  length is never negative, so a negative (end-start) would
 	  //  always trip the length comparison below and the dedicated
 	  //  "negative size" message could never be reported.
 	  if(end < start) {
 		  throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end);
 	  }
 	  // The range we were given must match the decoded text exactly
 	  int textLength = ((StringBuffer)_buf).length();
 	  if(end-start != textLength) {
 		  throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
 	  }
  }
  /**
-   * @param start Offset in main document stream.
+   * Create the StringBuffer from the text and unicode flag
   */
-  public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart)
+  private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) {
-    throws UnsupportedEncodingException
+	  String str;
-  {
+	  try {
-     /** start - end is length on file. This is double the expected when its
+		  if(pd.isUnicode()) {
-     * unicode.*/
+			  str = new String(text, "UTF-16LE");
-    super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? "UTF-16LE" : "Cp1252")));
+		  } else {
-    _usesUnicode = pd.isUnicode();
+			  str = new String(text, "Cp1252");
-    _pd = pd;
+		  }
-    _cpStart = cpStart;
+	  } catch(UnsupportedEncodingException e) {
 		  throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
 	  }
 	  return new StringBuffer(str);
  }
  /**
   * @return If this text piece uses unicode
   */
@ -67,38 +92,43 @@ public class TextPiece extends PropertyNode implements Comparable
   public byte[] getRawBytes()
   {
-     try
+     try {
     {
       return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
           "UTF-16LE" : "Cp1252");
     } catch (UnsupportedEncodingException ignore) {
 		  throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
     }
     catch (UnsupportedEncodingException ignore)
     {
       // shouldn't ever happen considering we wouldn't have been able to
       // create the StringBuffer w/o getting this exception
       return ((StringBuffer)_buf).toString().getBytes();
     }
   }
   /**
    * Returns part of the string. 
    * Works only in characters, not in bytes!
    * @param start Local start position, in characters
    * @param end Local end position, in characters
    * @return The requested part of the string
    */
   public String substring(int start, int end)
   {
-     int denominator = _usesUnicode ? 2 : 1;
+	   StringBuffer buf = (StringBuffer)_buf;
-     return ((StringBuffer)_buf).substring(start/denominator, end/denominator);
+	   // Validate
 	   if(start < 0) {
 		   throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start);
 	   }
 	   if(end > buf.length()) {
 		   throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length());
 	   }
 	   return buf.substring(start, end);
   }
-   public void adjustForDelete(int start, int length)
+   /**
-   {
+    * Adjusts the internal string for deleting
-
+    *  some characters within this.
-	   // length is expected to be the number of code-points,
+    * @param start The start position for the delete, in characters
-	   // not the number of characters
+    * @param length The number of characters to delete
    */
   public void adjustForDelete(int start, int length) {
 	   int numChars = length;
 	   if (usesUnicode()) {
 		   start /= 2;
 		   numChars = (length / 2);
 	   }
 	   int myStart = getStart();
 	   int myEnd = getEnd();
@ -121,9 +151,18 @@ public class TextPiece extends PropertyNode implements Comparable
 	   super.adjustForDelete(start, length);
   }
   /**
    * Returns the length, in characters
    */
   public int characterLength()
   {
-     return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1);
+     return (getEnd() - getStart());
   }
   /**
    * Works out how many bytes the text of this piece takes up:
    *  two bytes per character when the piece uses unicode,
    *  otherwise one byte per character.
    */
   public int bytesLength() {
 	   int lengthInChars = getEnd() - getStart();
 	   if (_usesUnicode) {
 		   return lengthInChars * 2;
 	   }
 	   return lengthInChars;
   }
   public boolean equals(Object o)
@ -138,9 +177,11 @@ public class TextPiece extends PropertyNode implements Comparable
   }
   /**
    * Returns the character position we start at.
    */
   public int getCP()
   {
-     return _cpStart;
+     return getStart();
   }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@ -28,6 +28,11 @@ import java.util.ArrayList;
 import java.util.List;
 /**
 * The piece table for matching up character positions
 *  to bits of text.
 * This mostly works in bytes, but the TextPieces
 *  themselves work in characters. This does the icky
 *  conversion.
 * @author Ryan Ackley
 */
 public class TextPieceTable
@ -36,8 +41,7 @@ public class TextPieceTable
  //int _multiple;
  int _cpMin;
-  public TextPieceTable()
+  public TextPieceTable() {
  {
  }
  public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
@ -47,7 +51,6 @@ public class TextPieceTable
    // get our plex of PieceDescriptors
    PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
    //_multiple = 2;
    int length = pieceTable.length();
    PieceDescriptor[] pieces = new PieceDescriptor[length];
@ -57,11 +60,6 @@ public class TextPieceTable
    {
      GenericPropertyNode node = pieceTable.getProperty(x);
      pieces[x] = new PieceDescriptor(node.getBytes(), 0);
 //      if (!pieces[x].isUnicode())
 //      {
 //        _multiple = 1;
 //      }
    }
    int firstPieceFilePosition = pieces[0].getFilePosition();
@ -72,26 +70,28 @@ public class TextPieceTable
    {
      int start = pieces[x].getFilePosition();
      PropertyNode node = pieceTable.getProperty(x);
      int nodeStart = node.getStart();
-      // multiple will be 2 if there is only one piece and its unicode. Some
+      // Grab the start and end, which are in characters
-      // type of optimization.
+      int nodeStartChars = node.getStart();
      int nodeEndChars = node.getEnd();
      // What's the relationship between bytes and characters?
      boolean unicode = pieces[x].isUnicode();
      int multiple = 1;
-      if (unicode)
+      if (unicode) {
      {
        multiple = 2;
      }
      int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart;
      int textSize = nodeEnd - nodeStart;
      // Figure out the length, in bytes and chars
      int textSizeChars = (nodeEndChars - nodeStartChars);
      int textSizeBytes = textSizeChars * multiple;
-      byte[] buf = new byte[textSize];
+      // Grab the data that makes up the piece
-      System.arraycopy(documentStream, start, buf, 0, textSize);
+      byte[] buf = new byte[textSizeBytes];
      System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
-      int startFilePosition = start - firstPieceFilePosition;
+      // And now build the piece
-      _textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart()));
+      _textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
    }
  }
@ -113,7 +113,6 @@ public class TextPieceTable
    //int fcMin = docStream.getOffset();
    int size = _textPieces.size();
    int bumpDown = 0;
    for (int x = 0; x < size; x++)
    {
      TextPiece next = (TextPiece)_textPieces.get(x);
@ -134,47 +133,43 @@ public class TextPieceTable
      // write the text to the docstream and save the piece descriptor to the
      // plex which will be written later to the tableStream.
      //if (_multiple == 1 && pd.isUnicode() &&
      docStream.write(next.getRawBytes());
      // The TextPiece is already in characters, which
      //  makes our life much easier
      int nodeStart = next.getStart();
-      int multiple = 1;
+      int nodeEnd = next.getEnd();
-      if (pd.isUnicode())
+      textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
      {
        multiple = 2;
      }
      textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown,
        ((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown,
        pd.toByteArray()));
      if (pd.isUnicode())
      {
        bumpDown += ((next.getEnd() - nodeStart)/multiple);
      }
    }
    return textPlex.toByteArray();
  }
-
+  /**
-  public int adjustForInsert(int listIndex, int length)
+   * Adjust all the text pieces after inserting
-  {
+   *  some text into one of them
   * @param listIndex The TextPiece that had characters inserted into
   * @param length The number of characters inserted
   */
  public int adjustForInsert(int listIndex, int length) {
    int size = _textPieces.size();
    TextPiece tp = (TextPiece)_textPieces.get(listIndex);
-    //The text piece stores the length on file.
+    // Update with the new end
    length = length * (tp.usesUnicode() ? 2 : 1);
    tp.setEnd(tp.getEnd() + length);
    // Now change all subsequent ones
    for (int x = listIndex + 1; x < size; x++)
    {
      tp = (TextPiece)_textPieces.get(x);
      tp.setStart(tp.getStart() + length);
      tp.setEnd(tp.getEnd() + length);
    }
    // All done
    return length;
  }
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@ -137,7 +137,8 @@ public class Range
  /**
   * Used to construct a Range from a document. This is generally used to
-   * create a Range that spans the whole document.
+   * create a Range that spans the whole document, or at least one
   * whole part of the document (eg main text, header, comment)
   *
   * @param start Starting character offset of the range.
   * @param end Ending character offset of the range.
@ -259,15 +260,21 @@ public class Range
    for (int x = _textStart; x < _textEnd; x++)
    {
      TextPiece piece = (TextPiece)_text.get(x);
      int start = _start > piece.getStart() ? _start - piece.getStart() : 0;
      int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart();
-      if(piece.usesUnicode()) // convert the byte pointers to char pointers
+      // Figure out where in this piece the text
-      {
+      //  we're after lives
-        start/=2;
+      int rStart = 0;
-        end/=2;
+      int rEnd = piece.characterLength();
      if(_start > piece.getStart()) {
    	  rStart = _start - piece.getStart();
      }
-      sb.append(piece.getStringBuffer().substring(start, end));
+      if(_end < piece.getEnd()) {
    	  rEnd -= (piece.getEnd() - _end);
      }
      // Luckily TextPieces work in characters, so we don't
      //  need to worry about unicode here
      sb.append(piece.substring(rStart, rEnd));
    }
    return sb.toString();
  }
@ -929,9 +936,11 @@ public class Range
  }
  /**
-   *	Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
+   * Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
   *
-   *	@param	adjustment	The (signed) value that should be added to <code>FIB.CCPText</code>
+   * TODO - handle other kinds of text, eg Headers
   *
   * @param	adjustment	The (signed) value that should be added to <code>FIB.CCPText</code>
   */
  protected void adjustFIB(int adjustment) {
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java
@ -78,10 +78,13 @@ public class TestHWPFRangeParts extends TestCase {
 	;
 	private static final String u_header =
 		"\r\r" +
 		"This is a simple header, with a \u20ac euro symbol in it.\r"
 	;
 	private static final String u_footer =
-		"The footer, with Moli\u00e8re, has Unicode in it.\r"
+		"\r\r\r" +
 		"The footer, with Moli\u00e8re, has Unicode in it.\r" +
 		"\r\r\r\r"
 	;
 	/**
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
@ -18,19 +18,21 @@
 package org.apache.poi.hwpf.model;
-import junit.framework.*;
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
-import java.util.ArrayList;
+import java.io.File;
 import java.io.FileInputStream;
-import org.apache.poi.hwpf.*;
+import junit.framework.TestCase;
-import org.apache.poi.hwpf.model.io.*;
+
 import org.apache.poi.hwpf.HWPFDocFixture;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-public class TestTextPieceTable
+public class TestTextPieceTable extends TestCase {
  extends TestCase
 {
  private HWPFDocFixture _hWPFDocFixture;
  private String dirname;
  public TestTextPieceTable(String name)
  {
@ -63,9 +65,117 @@ public class TestTextPieceTable
    TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
    assertEquals(oldTextPieceTable, newTextPieceTable);
  }
 	/**
 	 * Check that we do the positions correctly when
 	 *  working with pure-ascii, both as first loaded
 	 *  and again after a save and re-load.
 	 */
 	public void testAsciiParts() throws Exception {
 		// Close the file stream ourselves once the document has
 		//  been built, rather than leaving it open
 		FileInputStream in = new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc"));
 		HWPFDocument doc;
 		try {
 			doc = new HWPFDocument(in);
 		} finally {
 			in.close();
 		}
 		// As loaded
 		assertSingleAsciiPiece(doc.getTextTable());
 		// Save and re-load
 		HWPFDocument docB = saveAndReload(doc);
 		assertSingleAsciiPiece(docB.getTextTable());
 	}
 	/**
 	 * Asserts that the table holds exactly one text piece,
 	 *  covering characters 0 to 339, with equal character
 	 *  and byte lengths, and starting with the expected text.
 	 * @param tbl The text piece table to check
 	 */
 	private void assertSingleAsciiPiece(TextPieceTable tbl) {
 		// All ascii, so stored in one big lump
 		assertEquals(1, tbl.getTextPieces().size());
 		TextPiece tp = (TextPiece)tbl.getTextPieces().get(0);
 		assertEquals(0, tp.getStart());
 		assertEquals(339, tp.getEnd());
 		assertEquals(339, tp.characterLength());
 		assertEquals(339, tp.bytesLength());
 		assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
 	}
 	/**
 	 * Check that we do the positions correctly when
 	 *  working with a mixed ascii and unicode file, both
 	 *  as first loaded and again after a save and re-load.
 	 */
 	public void testUnicodeParts() throws Exception {
 		// Close the file stream ourselves once the document has
 		//  been built, rather than leaving it open
 		FileInputStream in = new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"));
 		HWPFDocument doc;
 		try {
 			doc = new HWPFDocument(in);
 		} finally {
 			in.close();
 		}
 		// As loaded
 		assertThreeUnicodePieces(doc.getTextTable());
 		// Save and re-load
 		HWPFDocument docB = saveAndReload(doc);
 		assertThreeUnicodePieces(docB.getTextTable());
 	}
 	/**
 	 * Asserts that the table holds three unicode text pieces
 	 *  of 256, 256 and 19 characters (512, 512 and 38 bytes),
 	 *  running back to back from character 0 to character 531.
 	 * @param tbl The text piece table to check
 	 */
 	private void assertThreeUnicodePieces(TextPieceTable tbl) {
 		// In three bits, split every 512 bytes
 		assertEquals(3, tbl.getTextPieces().size());
 		TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0);
 		TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1);
 		TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2);
 		assertTrue(tpA.usesUnicode());
 		assertTrue(tpB.usesUnicode());
 		assertTrue(tpC.usesUnicode());
 		// Lengths, in characters
 		assertEquals(256, tpA.characterLength());
 		assertEquals(256, tpB.characterLength());
 		assertEquals(19, tpC.characterLength());
 		// Lengths, in bytes
 		assertEquals(512, tpA.bytesLength());
 		assertEquals(512, tpB.bytesLength());
 		assertEquals(38, tpC.bytesLength());
 		// Positions, in characters
 		assertEquals(0, tpA.getStart());
 		assertEquals(256, tpA.getEnd());
 		assertEquals(256, tpB.getStart());
 		assertEquals(512, tpB.getEnd());
 		assertEquals(512, tpC.getStart());
 		assertEquals(531, tpC.getEnd());
 	}
 	/**
 	 * Writes the document out to an in-memory buffer, then
 	 *  builds a new document by reading that buffer back in.
 	 * @param doc The document to write out
 	 * @return The document built from the written bytes
 	 */
 	protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception {
 		ByteArrayOutputStream written = new ByteArrayOutputStream();
 		doc.write(written);
 		byte[] savedBytes = written.toByteArray();
 		ByteArrayInputStream reread = new ByteArrayInputStream(savedBytes);
 		return new HWPFDocument(reread);
 	}
  protected void setUp()
    throws Exception
  {
@ -73,6 +183,8 @@ public class TestTextPieceTable
    _hWPFDocFixture = new HWPFDocFixture(this);
    _hWPFDocFixture.setUp();
    dirname = System.getProperty("HWPF.testdata.path");
  }
  protected void tearDown()