Implement the ExtRst part of a UnicodeString

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@900746 13f79535-47bb-0310-9956-ffa450edef68
2010-01-19 12:04:14 +00:00 · 2010-01-19 12:04:14 +00:00 · 054f1bc289
parent d7470746e1
commit 054f1bc289
5 changed files with 459 additions and 67 deletions
--- a/src/java/org/apache/poi/hssf/record/common/UnicodeString.java
+++ b/src/java/org/apache/poi/hssf/record/common/UnicodeString.java
@ -26,9 +26,9 @@ import org.apache.poi.hssf.record.RecordInputStream;
 import org.apache.poi.hssf.record.cont.ContinuableRecordOutput;
 import org.apache.poi.util.BitField;
 import org.apache.poi.util.BitFieldFactory;
-import org.apache.poi.util.HexDump;
 import org.apache.poi.util.LittleEndianInput;
 import org.apache.poi.util.LittleEndianOutput;
+import org.apache.poi.util.StringUtil;

 /**
 * Title: Unicode String<p/>
@ -42,8 +42,8 @@ public final class UnicodeString implements Comparable<UnicodeString> {
    private short             field_1_charCount;
    private byte              field_2_optionflags;
    private String            field_3_string;
-    private List<FormatRun> field_4_format_runs;
-    private byte[] field_5_ext_rst;
+    private List<FormatRun>   field_4_format_runs;
+    private ExtRst            field_5_ext_rst;
    private static final BitField   highByte  = BitFieldFactory.getInstance(0x1);
    // 0x2 is reserved
    private static final BitField   extBit    = BitFieldFactory.getInstance(0x4);
@ -98,6 +98,225 @@ public final class UnicodeString implements Comparable<UnicodeString> {
            out.writeShort(_fontIndex);
        }
    }
+    
+    /**
+     * The extended (phonetic) data that may follow the text and the
+     *  format runs of a UnicodeString. See page 681.
+     * <p>
+     * Equality is defined by {@link #compareTo(ExtRst)}: two instances
+     *  are equal when every field, every phonetic run and the trailing
+     *  extra bytes match.
+     */
+    public static class ExtRst implements Comparable<ExtRst> {
+       private short reserved;
+       
+       // This is a Phs (see page 881)
+       private short formattingFontIndex;
+       private short formattingOptions;
+       
+       // This is a RPHSSub (see page 894)
+       private int numberOfRuns;
+       private String phoneticText;
+       
+       // This is an array of PhRuns (see page 881)
+       private PhRun[] phRuns;
+       // Sometimes there's some cruft at the end
+       private byte[] extraData;
+
+       /**
+        * Sets the marker to 1 and makes the phonetic text, the runs
+        *  and the trailing data empty. The other fields are not touched.
+        */
+       private void populateEmpty() {
+          reserved = 1;
+          phoneticText = "";
+          phRuns = new PhRun[0];
+          extraData = new byte[0];
+       }
+       
+       /**
+        * Creates an empty ExtRst (no phonetic text, no runs).
+        */
+       protected ExtRst() {
+          populateEmpty();
+       }
+       /**
+        * Reads an ExtRst from the stream.
+        *
+        * @param in the stream, positioned at the start of the ExtRst
+        * @param expectedLength the number of bytes the ExtRst is expected
+        *  to occupy, including its 4 byte header. Only used to skip over
+        *  the rest of a record whose marker is wrong
+        * @throws IllegalStateException if the two length fields of the
+        *  phonetic text disagree
+        */
+       protected ExtRst(LittleEndianInput in, int expectedLength) {
+          reserved = in.readShort();
+          
+          // Old style detection (Reserved = 0xFF)
+          if(reserved == -1) {
+             populateEmpty();
+             return;
+          }
+          
+          // Spot corrupt records
+          if(reserved != 1) {
+             System.err.println("Warning - ExtRst has wrong magic marker, expecting 1 but found " + reserved + " - ignoring");
+             // Grab all the remaining data, and ignore it
+             // (the marker already used up 2 of the expected bytes)
+             for(int i=0; i<expectedLength-2; i++) {
+                in.readByte();
+             }
+             // And make us be empty
+             populateEmpty();
+             return;
+          }
+          
+          // Carry on reading in as normal
+          short stringDataSize = in.readShort();
+          
+          formattingFontIndex = in.readShort();
+          formattingOptions   = in.readShort();
+          
+          // RPHSSub
+          numberOfRuns = in.readUShort();
+          short length1 = in.readShort();
+          // No really. Someone clearly forgot to read
+          //  the docs on their datastructure...
+          short length2 = in.readShort();
+          // And sometimes they write out garbage :(
+          if(length1 == 0 && length2 > 0) {
+             length2 = 0;
+          }
+          if(length1 != length2) {
+             throw new IllegalStateException(
+                   "The two length fields of the Phonetic Text don't agree! " +
+                   length1 + " vs " + length2
+             );
+          }
+          phoneticText = StringUtil.readUnicodeLE(in, length1);
+          
+          // Whatever the declared size leaves over after the Phs (4 bytes),
+          //  the RPHSSub header (6 bytes) and the text holds the runs
+          int runData = stringDataSize - 4 - 6 - (2*phoneticText.length());
+          // A declared size that is too small would give a negative run
+          //  count. Clamp it, so the overrun gets reported below instead
+          //  of failing on a negative array size
+          int numRuns = Math.max(0, runData / 6);
+          phRuns = new PhRun[numRuns];
+          for(int i=0; i<phRuns.length; i++) {
+             phRuns[i] = new PhRun(in);
+          }
+
+          // Anything that isn't a whole 6 byte run is kept as-is
+          int extraDataLength = runData - (numRuns*6);
+          if(extraDataLength < 0) {
+             System.err.println("Warning - ExtRst overran by " + (0-extraDataLength) + " bytes");
+             extraDataLength = 0;
+          }
+          extraData = new byte[extraDataLength];
+          for(int i=0; i<extraData.length; i++) {
+             extraData[i] = in.readByte();
+          }
+       }
+       /**
+        * Returns our size, excluding our 
+        *  4 byte header
+        */
+       protected int getDataSize() {
+          return 4 + 6 + (2*phoneticText.length()) + 
+             (6*phRuns.length) + extraData.length;
+       }
+       /**
+        * Writes out the ExtRst, including its 4 byte header (the
+        *  marker and the data size). Before each fixed size section
+        *  the output is asked to continue if required.
+        *
+        * @param out the output to write to
+        */
+       protected void serialize(ContinuableRecordOutput out) {
+          int dataSize = getDataSize();
+          
+          out.writeContinueIfRequired(8);
+          out.writeShort(reserved);
+          out.writeShort(dataSize);
+          out.writeShort(formattingFontIndex);
+          out.writeShort(formattingOptions);
+          
+          out.writeContinueIfRequired(6);
+          out.writeShort(numberOfRuns);
+          // The text length is stored twice
+          out.writeShort(phoneticText.length());
+          out.writeShort(phoneticText.length());
+          
+          out.writeContinueIfRequired(phoneticText.length()*2);
+          StringUtil.putUnicodeLE(phoneticText, out);
+          
+          for(int i=0; i<phRuns.length; i++) {
+             phRuns[i].serialize(out);
+          }
+          
+          out.write(extraData);
+       }
+
+       /**
+        * @return true if the other object is an ExtRst which
+        *  compares as identical to this one
+        */
+       public boolean equals(Object obj) {
+          if(! (obj instanceof ExtRst)) {
+             return false;
+          }
+          ExtRst other = (ExtRst)obj;
+          return (compareTo(other) == 0);
+       }
+       /**
+        * Built only from values which {@link #compareTo(ExtRst)}
+        *  checks, so that equal instances share a hash code.
+        */
+       public int hashCode() {
+          int hash = phoneticText.hashCode();
+          hash = 31*hash + formattingFontIndex;
+          hash = 31*hash + formattingOptions;
+          hash = 31*hash + numberOfRuns;
+          hash = 31*hash + phRuns.length;
+          return hash;
+       }
+       /**
+        * Compares field by field, in this order: the marker, the
+        *  formatting, the run count, the phonetic text, the runs
+        *  and finally the trailing extra data.
+        *
+        * @return 0 if identical, otherwise the difference of the
+        *  first values that don't match
+        */
+       public int compareTo(ExtRst o) {
+          int result;
+          
+          result = reserved - o.reserved;
+          if(result != 0) return result;
+          result = formattingFontIndex - o.formattingFontIndex;
+          if(result != 0) return result;
+          result = formattingOptions - o.formattingOptions;
+          if(result != 0) return result;
+          result = numberOfRuns - o.numberOfRuns;
+          if(result != 0) return result;
+          
+          result = phoneticText.compareTo(o.phoneticText);
+          if(result != 0) return result;
+          
+          result = phRuns.length - o.phRuns.length;
+          if(result != 0) return result;
+          for(int i=0; i<phRuns.length; i++) {
+             result = phRuns[i].phoneticTextFirstCharacterOffset - o.phRuns[i].phoneticTextFirstCharacterOffset;
+             if(result != 0) return result;
+             result = phRuns[i].realTextFirstCharacterOffset - o.phRuns[i].realTextFirstCharacterOffset;
+             if(result != 0) return result;
+             // Length against length (not offset against length)
+             result = phRuns[i].realTextLength - o.phRuns[i].realTextLength;
+             if(result != 0) return result;
+          }
+          
+          result = extraData.length - o.extraData.length;
+          if(result != 0) return result;
+          // Same amount of trailing data, so check the bytes too
+          for(int i=0; i<extraData.length; i++) {
+             result = extraData[i] - o.extraData[i];
+             if(result != 0) return result;
+          }
+          
+          // If we get here, it's the same
+          return 0;
+       }
+       
+       /**
+        * @return an independent copy, with its own runs
+        *  and its own trailing extra data
+        */
+       protected ExtRst clone() {
+          ExtRst ext = new ExtRst();
+          ext.reserved = reserved;
+          ext.formattingFontIndex = formattingFontIndex;
+          ext.formattingOptions = formattingOptions;
+          ext.numberOfRuns = numberOfRuns;
+          // Strings are immutable, so the text can be shared
+          ext.phoneticText = phoneticText;
+          ext.phRuns = new PhRun[phRuns.length];
+          for(int i=0; i<ext.phRuns.length; i++) {
+             ext.phRuns[i] = new PhRun(
+                   phRuns[i].phoneticTextFirstCharacterOffset,
+                   phRuns[i].realTextFirstCharacterOffset,
+                   phRuns[i].realTextLength
+             );
+          }
+          // Without this the copy would report a smaller data
+          //  size, and wouldn't be equal to the original
+          ext.extraData = extraData.clone();
+          return ext;
+       }
+       
+       /** @return the font index from the Phs part */
+       public short getFormattingFontIndex() {
+         return formattingFontIndex;
+       }
+       /** @return the options from the Phs part */
+       public short getFormattingOptions() {
+         return formattingOptions;
+       }
+       /** @return the run count, as stored in the RPHSSub part */
+       public int getNumberOfRuns() {
+         return numberOfRuns;
+       }
+       /** @return the phonetic text, never null */
+       public String getPhoneticText() {
+         return phoneticText;
+       }
+       /** @return the phonetic runs (the internal array, not a copy) */
+       public PhRun[] getPhRuns() {
+         return phRuns;
+       }
+    }
+    /**
+     * One phonetic run of an ExtRst: three values, each read
+     *  and written as a 16 bit number, 6 bytes in total.
+     */
+    public static class PhRun {
+       private int phoneticTextFirstCharacterOffset;
+       private int realTextFirstCharacterOffset;
+       private int realTextLength;
+
+       /**
+        * Creates a run from its three values.
+        */
+       public PhRun(int phoneticTextFirstCharacterOffset,
+             int realTextFirstCharacterOffset, int realTextLength) {
+          this.phoneticTextFirstCharacterOffset = phoneticTextFirstCharacterOffset;
+          this.realTextFirstCharacterOffset = realTextFirstCharacterOffset;
+          this.realTextLength = realTextLength;
+       }
+       /**
+        * Reads a run from the stream. The arguments are evaluated
+        *  left to right, so the values are read in field order.
+        */
+       private PhRun(LittleEndianInput in) {
+          this(in.readUShort(), in.readUShort(), in.readUShort());
+       }
+       /**
+        * Writes the run out, first asking the output to
+        *  continue if the 6 bytes wouldn't fit.
+        */
+       private void serialize(ContinuableRecordOutput out) {
+          out.writeContinueIfRequired(6);
+          int[] values = {
+                phoneticTextFirstCharacterOffset,
+                realTextFirstCharacterOffset,
+                realTextLength
+          };
+          for(int i=0; i<values.length; i++) {
+             out.writeShort(values[i]);
+          }
+       }
+    }

    private UnicodeString() {
     //Used for clone method.
@ -160,22 +379,20 @@ public final class UnicodeString implements Comparable<UnicodeString> {
            return false;
        }

-        //Well the format runs are equal as well!, better check the ExtRst data
-        //Which by the way we dont know how to decode!
-        if ((field_5_ext_rst == null) && (other.field_5_ext_rst == null))
-          return true;
-        if (((field_5_ext_rst == null) && (other.field_5_ext_rst != null)) ||
-            ((field_5_ext_rst != null) && (other.field_5_ext_rst == null)))
-          return false;
-        size = field_5_ext_rst.length;
-        if (size != field_5_ext_rst.length)
-          return false;
-
-        //Check individual bytes!
-        for (int i=0;i<size;i++) {
-          if (field_5_ext_rst[i] != other.field_5_ext_rst[i])
-            return false;
+        // Well the format runs are equal as well!, better check the ExtRst data
+        if(field_5_ext_rst == null && other.field_5_ext_rst == null) {
+           // Good
+        } else if(field_5_ext_rst != null && other.field_5_ext_rst != null) {
+           int extCmp = field_5_ext_rst.compareTo(other.field_5_ext_rst);
+           if(extCmp == 0) {
+              // Good
+           } else {
+              return false;
+           }
+        } else {
+           return false;
        }
+
        //Phew!! After all of that we have finally worked out that the strings
        //are identical.
        return true;
@ -218,10 +435,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
        }

        if (isExtendedText() && (extensionLength > 0)) {
-          field_5_ext_rst = new byte[extensionLength];
-          for (int i=0;i<extensionLength;i++) {
-            field_5_ext_rst[i] = in.readByte();
-            }
+          field_5_ext_rst = new ExtRst(in, extensionLength);
+          if(field_5_ext_rst.getDataSize()+4 != extensionLength) {
+             System.err.println("ExtRst was supposed to be " + extensionLength + " bytes long, but seems to actually be " + (field_5_ext_rst.getDataSize()+4));
+          }
        }
    }

@ -395,10 +612,15 @@ public final class UnicodeString implements Comparable<UnicodeString> {
    }


-    void setExtendedRst(byte[] ext_rst) {
-      if (ext_rst != null)
-        field_2_optionflags = extBit.setByte(field_2_optionflags);
-      else field_2_optionflags = extBit.clearByte(field_2_optionflags);
+    /**
+     * @return the extended (phonetic) data of this string,
+     *  or null if none has been set
+     */
+    public ExtRst getExtendedRst() {
+       return field_5_ext_rst;
+    }
+    /**
+     * Stores the extended (phonetic) data, and keeps the extBit
+     *  (0x4) option flag in step with it: set for a non-null
+     *  value, cleared for null.
+     */
+    void setExtendedRst(ExtRst ext_rst) {
+      field_2_optionflags = (ext_rst == null)
+            ? extBit.clearByte(field_2_optionflags)
+            : extBit.setByte(field_2_optionflags);
+      this.field_5_ext_rst = ext_rst;
+    }

@ -452,12 +674,18 @@ public final class UnicodeString implements Comparable<UnicodeString> {
          }
        }
        if (field_5_ext_rst != null) {
-          buffer.append("    .field_5_ext_rst          = ").append("\n").append(HexDump.toHex(field_5_ext_rst)).append("\n");
+          buffer.append("    .field_5_ext_rst          = ").append("\n");
+          buffer.append( field_5_ext_rst.toString() ).append("\n");
        }
        buffer.append("[/UNICODESTRING]\n");
        return buffer.toString();
    }

+    /**
+     * Serialises out the String. There are special rules
+     *  about where we can and can't split onto
+     *  Continue records.
+     */
    public void serialize(ContinuableRecordOutput out) {
        int numberOfRichTextRuns = 0;
        int extendedDataSize = 0;
@ -465,9 +693,11 @@ public final class UnicodeString implements Comparable<UnicodeString> {
            numberOfRichTextRuns = field_4_format_runs.size();
        }
        if (isExtendedText() && field_5_ext_rst != null) {
-            extendedDataSize = field_5_ext_rst.length;
+            extendedDataSize = 4 + field_5_ext_rst.getDataSize();
        }
-
+       
+        // Serialise the bulk of the String
+        // The writeString handles tricky continue stuff for us
        out.writeString(field_3_string, numberOfRichTextRuns, extendedDataSize);

        if (numberOfRichTextRuns > 0) {
@ -477,25 +707,13 @@ public final class UnicodeString implements Comparable<UnicodeString> {
              if (out.getAvailableSpace() < 4) {
                  out.writeContinue();
              }
-                FormatRun r = field_4_format_runs.get(i);
-                r.serialize(out);
+              FormatRun r = field_4_format_runs.get(i);
+              r.serialize(out);
          }
        }

        if (extendedDataSize > 0) {
-            // OK ExtRst is actually not documented, so i am going to hope
-            // that we can actually continue on byte boundaries
-
-            int extPos = 0;
-            while (true) {
-                int nBytesToWrite = Math.min(extendedDataSize - extPos, out.getAvailableSpace());
-                out.write(field_5_ext_rst, extPos, nBytesToWrite);
-                extPos += nBytesToWrite;
-                if (extPos >= extendedDataSize) {
-                    break;
-                }
-                out.writeContinue();
-            }
+           field_5_ext_rst.serialize(out);
        }
    }

@ -534,7 +752,6 @@ public final class UnicodeString implements Comparable<UnicodeString> {
        }

        //Well the format runs are equal as well!, better check the ExtRst data
-        //Which by the way we don't know how to decode!
        if ((field_5_ext_rst == null) && (str.field_5_ext_rst == null))
          return 0;
        if ((field_5_ext_rst == null) && (str.field_5_ext_rst != null))
@ -542,15 +759,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
        if ((field_5_ext_rst != null) && (str.field_5_ext_rst == null))
          return -1;

-        size = field_5_ext_rst.length;
-        if (size != field_5_ext_rst.length)
-          return size - field_5_ext_rst.length;
+        result = field_5_ext_rst.compareTo(str.field_5_ext_rst); 
+        if (result != 0)
+           return result;

-        //Check individual bytes!
-        for (int i=0;i<size;i++) {
-          if (field_5_ext_rst[i] != str.field_5_ext_rst[i])
-            return field_5_ext_rst[i] - str.field_5_ext_rst[i];
-        }
        //Phew!! After all of that we have finally worked out that the strings
        //are identical.
        return 0;
@ -575,12 +787,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
          str.field_4_format_runs = new ArrayList<FormatRun>();
          for (FormatRun r : field_4_format_runs) {
            str.field_4_format_runs.add(new FormatRun(r._character, r._fontIndex));
-            }
+          }
        }
        if (field_5_ext_rst != null) {
-          str.field_5_ext_rst = new byte[field_5_ext_rst.length];
-          System.arraycopy(field_5_ext_rst, 0, str.field_5_ext_rst, 0,
-                           field_5_ext_rst.length);
+           str.field_5_ext_rst = field_5_ext_rst.clone();
        }

        return str;
--- a/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java
+++ b/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java
@ -33,9 +33,8 @@ public final class TestSSTRecordSizeCalculator extends TestCase {
 	private static final int COMPRESSED_PLAIN_STRING_OVERHEAD = 3;
 	private static final int OPTION_FIELD_SIZE = 1;
 	
-	private final IntMapper strings = new IntMapper();
+	private final IntMapper<UnicodeString> strings = new IntMapper<UnicodeString>();

-	
 	private void confirmSize(int expectedSize) {
 		ContinuableRecordOutput cro = ContinuableRecordOutput.createForCountingOnly();
 		SSTSerializer ss = new SSTSerializer(strings, 0, 0);
--- a/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java
+++ b/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java
@ -17,12 +17,19 @@

 package org.apache.poi.hssf.record.common;

+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+
 import junit.framework.TestCase;

 import org.apache.poi.hssf.record.ContinueRecord;
 import org.apache.poi.hssf.record.RecordInputStream;
 import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.common.UnicodeString.ExtRst;
+import org.apache.poi.hssf.record.common.UnicodeString.FormatRun;
 import org.apache.poi.hssf.record.cont.ContinuableRecordOutput;
+import org.apache.poi.util.LittleEndianInputStream;
+import org.apache.poi.util.LittleEndianOutputStream;

 /**
 * Tests that {@link UnicodeString} record size calculates correctly.  The record size
@ -85,13 +92,23 @@ public final class TestUnicodeString extends TestCase {
        //Test a compressed small string that has rich text and extended text
        s.setString("Test");
        s.setOptionFlags((byte)0xC);
-        s.setExtendedRst(new byte[]{(byte)0x1,(byte)0x2,(byte)0x3,(byte)0x4,(byte)0x5});
-        confirmSize(26, s);
+        confirmSize(17, s);
+        
+        // Extended phonetics data
+        // Minimum size is 14
+        // Also adds 4 bytes to hold the length
+        s.setExtendedRst(
+              new ExtRst()
+        );
+        confirmSize(35, s);

        //Test a uncompressed small string that has rich text and extended text
        s.setString(STR_16_BIT);
        s.setOptionFlags((byte)0xD);
-        confirmSize(30, s);
+        confirmSize(39, s);
+        
+        s.setExtendedRst(null);
+        confirmSize(21, s);
    }

    public void testPerfectStringSize() {
@ -144,6 +161,146 @@ public final class TestUnicodeString extends TestCase {
      UnicodeString s = makeUnicodeString(strSize);
      confirmSize(MAX_DATA_SIZE*2, s);
    }
+    
+    /**
+     * A FormatRun should serialise to 4 little endian bytes
+     *  (character position, then font index) and read back
+     *  with the same values.
+     */
+    public void testFormatRun() throws Exception {
+       FormatRun run = new FormatRun((short)4, (short)0x15c);
+       assertEquals(4, run.getCharacterPos());
+       assertEquals(0x15c, run.getFontIndex());
+       
+       // Write it out
+       ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
+       LittleEndianOutputStream leOut = new LittleEndianOutputStream(bytesOut);
+       run.serialize(leOut);
+       
+       // Check what was written, byte by byte
+       byte[] written = bytesOut.toByteArray();
+       int[] expected = { 4, 0, 0x5c, 0x01 };
+       assertEquals(4, written.length);
+       for(int i=0; i<expected.length; i++) {
+          assertEquals(expected[i], written[i]);
+       }
+       
+       // Read it back in again
+       run = new FormatRun(new LittleEndianInputStream(
+             new ByteArrayInputStream(written)
+       ));
+       assertEquals(4, run.getCharacterPos());
+       assertEquals(0x15c, run.getFontIndex());
+    }
+    
+    /**
+     * An empty ExtRst should have a data size of 10, serialise
+     *  to a 4 byte header plus those 10 bytes, and be empty
+     *  again when read back in.
+     */
+    public void testExtRstFromEmpty() throws Exception {
+       ExtRst ext = new ExtRst();
+       
+       assertEquals(0, ext.getNumberOfRuns());
+       assertEquals(0, ext.getFormattingFontIndex());
+       assertEquals(0, ext.getFormattingOptions());
+       assertEquals("", ext.getPhoneticText());
+       assertEquals(0, ext.getPhRuns().length);
+       assertEquals(10, ext.getDataSize()); // Excludes 4 byte header
+       
+       ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
+       ContinuableRecordOutput cout = new ContinuableRecordOutput(
+             new LittleEndianOutputStream(bytesOut), 0xffff
+       );
+       
+       ext.serialize(cout);
+       cout.writeContinue();
+       
+       byte[] written = bytesOut.toByteArray();
+       assertEquals(20, written.length);
+       
+       int[] expected = {
+             -1, -1, 14, 0, // First 4 bytes from the outputstream
+             1, 0,          // Reserved
+             10, 0,         // Data size
+             0, 0, 0, 0,    // Font*2
+             0, 0,          // 0 Runs
+             0, 0, 0, 0,    // Size=0, *2
+       };
+       for(int i=0; i<expected.length; i++) {
+          assertEquals(expected[i], written[i]);
+       }
+       
+       // Last 2 bytes from the outputstream
+       assertEquals(ContinueRecord.sid, written[18]);
+       assertEquals(0, written[19]);
+       
+       
+       // Load in again and re-test, leaving out the 4 bytes
+       //  that the outputstream added at the start
+       byte[] extOnly = new byte[14];
+       System.arraycopy(written, 4, extOnly, 0, extOnly.length);
+       ext = new ExtRst(
+             new LittleEndianInputStream(new ByteArrayInputStream(extOnly)),
+             extOnly.length
+       );
+       
+       assertEquals(0, ext.getNumberOfRuns());
+       assertEquals(0, ext.getFormattingFontIndex());
+       assertEquals(0, ext.getFormattingOptions());
+       assertEquals("", ext.getPhoneticText());
+       assertEquals(0, ext.getPhRuns().length);
+    }
+    
+    /**
+     * Reads an ExtRst with no text and no runs, but with 2
+     *  spare bytes at the end, which should count towards
+     *  the data size.
+     */
+    public void testExtRstFromData() throws Exception {
+       byte[] raw = new byte[] {
+             01, 00, 0x0C, 00, // Marker, then data size
+             00, 00, 0x37, 00, // Font index, then options
+             00, 00,           // Number of runs
+             00, 00, 00, 00,   // Text length, twice
+             00, 00 // Cruft at the end, as found from real files
+       };
+       assertEquals(16, raw.length);
+       
+       ExtRst ext = new ExtRst(
+             new LittleEndianInputStream(new ByteArrayInputStream(raw)),
+             raw.length
+       );
+       assertEquals(0x0c, ext.getDataSize()); // Excludes 4 byte header
+       
+       assertEquals(0, ext.getNumberOfRuns());
+       assertEquals(0x37, ext.getFormattingOptions());
+       assertEquals(0, ext.getFormattingFontIndex());
+       assertEquals("", ext.getPhoneticText());
+       assertEquals(0, ext.getPhRuns().length);
+    }
+    
+    /**
+     * Data whose first short isn't the expected marker of 1
+     *  should be skipped over, giving an empty ExtRst.
+     */
+    public void testCorruptExtRstDetection() throws Exception {
+       byte[] junk = new byte[] {
+             0x79, 0x79, 0x11, 0x11, 
+             0x22, 0x22, 0x33, 0x33, 
+       };
+       assertEquals(8, junk.length);
+       
+       ExtRst ext = new ExtRst(
+             new LittleEndianInputStream(new ByteArrayInputStream(junk)),
+             junk.length
+       );
+       
+       // Will be empty
+       assertEquals(ext, new ExtRst());
+
+       // If written, will be the usual size
+       assertEquals(10, ext.getDataSize()); // Excludes 4 byte header
+     
+       // Is empty
+       assertEquals(0, ext.getNumberOfRuns());
+       assertEquals(0, ext.getFormattingOptions());
+       assertEquals(0, ext.getFormattingFontIndex());
+       assertEquals("", ext.getPhoneticText());
+       assertEquals(0, ext.getPhRuns().length);
+    }


    private static UnicodeString makeUnicodeString(String s) {
--- a/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java
+++ b/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java
@ -36,6 +36,7 @@ import org.apache.poi.hssf.record.CellValueRecordInterface;
 import org.apache.poi.hssf.record.EmbeddedObjectRefSubRecord;
 import org.apache.poi.hssf.record.NameRecord;
 import org.apache.poi.hssf.record.aggregates.FormulaRecordAggregate;
+import org.apache.poi.hssf.record.common.UnicodeString;
 import org.apache.poi.hssf.record.formula.DeletedArea3DPtg;
 import org.apache.poi.hssf.record.formula.Ptg;
 import org.apache.poi.ss.usermodel.*;
@ -1538,12 +1539,37 @@ public final class TestBugs extends BaseTestBugzillaIssues {
    }
    
    /**
-     * Round trip a file with an unusual ExtRst record
+     * Round trip a file with an unusual UnicodeString/ExtRst record parts
     */
-    public void test47847() {
-       HSSFWorkbook wb = openSample("47251.xls");
-       assertEquals(1, wb.getNumberOfSheets());
+    public void test47847() throws Exception {
+       HSSFWorkbook wb = openSample("47847.xls");
+       assertEquals(3, wb.getNumberOfSheets());
+       
+       // Find the SST record
+       UnicodeString withExt = wb.getWorkbook().getSSTString(0);
+       UnicodeString withoutExt = wb.getWorkbook().getSSTString(31);
+       
+       assertEquals("O:Alloc:Qty", withExt.getString());
+       assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004);
+       
+       assertEquals("RT", withoutExt.getString());
+       assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000);
+       
+       // Something about continues...
+
+       
+       // Write out and re-read
       wb = writeOutAndReadBack(wb);
-       assertEquals(1, wb.getNumberOfSheets());
+       assertEquals(3, wb.getNumberOfSheets());
+       
+       // Check it's the same now
+       withExt = wb.getWorkbook().getSSTString(0);
+       withoutExt = wb.getWorkbook().getSSTString(31);
+       
+       assertEquals("O:Alloc:Qty", withExt.getString());
+       assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004);
+       
+       assertEquals("RT", withoutExt.getString());
+       assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000);
    }
 }
--- a/test-data/spreadsheet/47847.xls
+++ b/test-data/spreadsheet/47847.xls