code improvements to RecordFactoryInputStream

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@801850 13f79535-47bb-0310-9956-ffa450edef68
2009-08-07 00:21:00 +00:00 · 2009-08-07 00:21:00 +00:00 · 54f17d41da
parent 5f224a73e7
commit 54f17d41da
3 changed files with 252 additions and 268 deletions
--- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java
+++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java
@ -31,7 +31,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * processWorkbookEvents along with a request.
 *
 * This will cause your file to be processed a record at a time.  Each record with
- * a static id matching one that you have registed in your HSSFRequest will be passed
+ * a static id matching one that you have registered in your HSSFRequest will be passed
 * to your associated HSSFListener.
 *
 * @see org.apache.poi.hssf.dev.EFHSSF
@ -39,115 +39,98 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * @author Andrew C. Oliver (acoliver at apache dot org)
 * @author Carey Sublette  (careysub@earthling.net)
 */
+public class HSSFEventFactory {
+	/** Creates a new instance of HSSFEventFactory */
+	public HSSFEventFactory() {
+		// no instance fields
+	}

-public class HSSFEventFactory
-{
-    /** Creates a new instance of HSSFEventFactory */
-
-    public HSSFEventFactory()
-    {
-    }
-
-    /**
-     * Processes a file into essentially record events.
-     *
-     * @param req       an Instance of HSSFRequest which has your registered listeners
-     * @param fs        a POIFS filesystem containing your workbook
-     */
-
-    public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
-        throws IOException
-    {
-        InputStream in = fs.createDocumentInputStream("Workbook");
-
-        processEvents(req, in);
-    }
-
-    /**
+	/**
 	 * Processes a file into essentially record events.
 	 *
-	 * @param req       an Instance of HSSFRequest which has your registered listeners
-	 * @param fs        a POIFS filesystem containing your workbook
-	 * @return 			numeric user-specified result code.
+	 * @param req an Instance of HSSFRequest which has your registered listeners
+	 * @param fs  a POIFS filesystem containing your workbook
 	 */
+	public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
+		InputStream in = fs.createDocumentInputStream("Workbook");

+		processEvents(req, in);
+	}
+
+	/**
+	 * Processes a file into essentially record events.
+	 *
+	 * @param req an Instance of HSSFRequest which has your registered listeners
+	 * @param fs  a POIFS filesystem containing your workbook
+	 * @return    numeric user-specified result code.
+	 */
 	public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
-		throws IOException, HSSFUserException
-	{
+		throws IOException, HSSFUserException {
 		InputStream in = fs.createDocumentInputStream("Workbook");
 		return abortableProcessEvents(req, in);
-    }
+	}

-    /**
-     * Processes a DocumentInputStream into essentially Record events.
-     *
-     * If an <code>AbortableHSSFListener</code> causes a halt to processing during this call
-     * the method will return just as with <code>abortableProcessEvents</code>, but no
-     * user code or <code>HSSFUserException</code> will be passed back.
-     *
-     * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
-     * @param req       an Instance of HSSFRequest which has your registered listeners
-     * @param in        a DocumentInputStream obtained from POIFS's POIFSFileSystem object
-     */
-
-    public void processEvents(HSSFRequest req, InputStream in)
-        throws IOException
-	{
-		try
-		{
+	/**
+	 * Processes a DocumentInputStream into essentially Record events.
+	 *
+	 * If an <code>AbortableHSSFListener</code> causes a halt to processing during this call
+	 * the method will return just as with <code>abortableProcessEvents</code>, but no
+	 * user code or <code>HSSFUserException</code> will be passed back.
+	 *
+	 * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
+	 * @param req an Instance of HSSFRequest which has your registered listeners
+	 * @param in  a DocumentInputStream obtained from POIFS's POIFSFileSystem object
+	 */
+	public void processEvents(HSSFRequest req, InputStream in) {
+		try {
 			genericProcessEvents(req, new RecordInputStream(in));
+		} catch (HSSFUserException hue) {
+			/* If an HSSFUserException is thrown, ignore it. */
 		}
-		catch (HSSFUserException hue)
-		{/*If an HSSFUserException user exception is thrown, ignore it.*/ }
 	}


-    /**
-     * Processes a DocumentInputStream into essentially Record events.
-     *
-     * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
-     * @param req       an Instance of HSSFRequest which has your registered listeners
-     * @param in        a DocumentInputStream obtained from POIFS's POIFSFileSystem object
-	 * @return 			numeric user-specified result code.
-     */
-
-    public short abortableProcessEvents(HSSFRequest req, InputStream in)
-        throws IOException, HSSFUserException
-    {
-		return genericProcessEvents(req, new RecordInputStream(in));
-    }
-
-     /**
+	/**
 	 * Processes a DocumentInputStream into essentially Record events.
 	 *
 	 * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
-	 * @param req       an Instance of HSSFRequest which has your registered listeners
-	 * @param in        a DocumentInputStream obtained from POIFS's POIFSFileSystem object
-	 * @return 			numeric user-specified result code.
+	 * @param req an Instance of HSSFRequest which has your registered listeners
+	 * @param in  a DocumentInputStream obtained from POIFS's POIFSFileSystem object
+	 * @return    numeric user-specified result code.
 	 */
+	public short abortableProcessEvents(HSSFRequest req, InputStream in)
+		throws HSSFUserException {
+		return genericProcessEvents(req, new RecordInputStream(in));
+	}

+	/**
+	 * Processes a DocumentInputStream into essentially Record events.
+	 *
+	 * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
+	 * @param req an Instance of HSSFRequest which has your registered listeners
+	 * @param in  a DocumentInputStream obtained from POIFS's POIFSFileSystem object
+	 * @return    numeric user-specified result code.
+	 */
 	protected short genericProcessEvents(HSSFRequest req, RecordInputStream in)
-		throws IOException, HSSFUserException
-	{
-		boolean going = true;
+		throws HSSFUserException {
 		short userCode = 0;
-		Record r = null;
-		
+
 		// Create a new RecordStream and use that
-		RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in);
-		
+		RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in, false);
+
 		// Process each record as they come in
-		while(going) {
-			r = recordStream.nextRecord();
-			if(r != null) {
-				userCode = req.processRecord(r);
-				if (userCode != 0) break;
-			} else {
-				going = false;
+		while(true) {
+			Record r = recordStream.nextRecord();
+			if(r == null) {
+				break;
+			}
+			userCode = req.processRecord(r);
+			if (userCode != 0) {
+				break;
 			}
 		}
-		
+
 		// All done, return our last code
 		return userCode;
-    }
+	}
 }
--- a/src/java/org/apache/poi/hssf/record/RecordFactory.java
+++ b/src/java/org/apache/poi/hssf/record/RecordFactory.java
@ -369,12 +369,11 @@ public final class RecordFactory {
 	public static List<Record> createRecords(InputStream in) throws RecordFormatException {
 		List<Record> records = new ArrayList<Record>(NUM_RECORDS);

-		RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in));
-                recStream.setIncludeContinueRecords(true);
+		RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in), true);

-        Record record;
+		Record record;
 		while ((record = recStream.nextRecord())!=null) {
-                        records.add(record);
+			records.add(record);
 		}

 		return records;
--- a/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java
+++ b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java
@ -19,10 +19,6 @@ package org.apache.poi.hssf.record;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;

-import java.util.Arrays;
-import java.util.LinkedList;
-import java.util.List;
-
 /**
 * A stream based way to get at complete records, with
 * as low a memory footprint as possible.
@ -34,203 +30,209 @@ import java.util.List;
 * them, but this does allow for a "pull" style of coding.
 */
 public class RecordFactoryInputStream {
-    private final RecordInputStream recStream;

-    /**
-     * Have we returned all the records there are?
-     */
-    private boolean complete = false;
+	private final RecordInputStream _recStream;
+	private final boolean _shouldIncludeContinueRecords;

-    /**
-     * Sometimes we end up with a bunch of
-     * records. When we do, these should
-     * be returned before the next normal
-     * record processing occurs (i.e. before
-     * we check for continue records and
-     * return rec)
-     */
-    private final LinkedList bonusRecords = new LinkedList();
+	/**
+	 * Temporarily stores a group of {@link NumberRecord}s.  This is used when the most
+	 * recently read underlying record is a {@link MulRKRecord}
+	 */
+	private NumberRecord[] _multipleNumberRecords;

-    /**
-     * The most recent record that we gave to the user
-     */
-    private Record lastRecord = null;
-    /**
-     * The most recent DrawingRecord seen
-     */
-    private DrawingRecord lastDrawingRecord = new DrawingRecord();
+	/**
+	 * Used to help iterate over multiple number records
+	 */
+	private int _multipleNumberRecordIndex = -1;

-    private int bofDepth = 0;
+	/**
+	 * The most recent record that we gave to the user
+	 */
+	private Record _lastRecord = null;
+	/**
+	 * The most recent DrawingRecord seen
+	 */
+	private DrawingRecord _lastDrawingRecord = new DrawingRecord();

-    private boolean lastRecordWasEOFLevelZero = false;
+	private int _bofDepth;

-    private boolean includeContinueRecords = false;
+	private boolean _lastRecordWasEOFLevelZero;

-    public RecordFactoryInputStream(RecordInputStream inp) {
-        recStream = inp;
-    }

-    /**
-     * Returns the next (complete) record from the
-     * stream, or null if there are no more.
-     */
-    public Record nextRecord() {
-        Record r = null;
+	/**
+	 * @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
+	 * {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
+	 * processing).
+	 */
+	public RecordFactoryInputStream(RecordInputStream inp, boolean shouldIncludeContinueRecords) {
+		_recStream = inp;
+		_shouldIncludeContinueRecords = shouldIncludeContinueRecords;

-        // Loop until we get something
-        while (r == null && !complete) {
-            // Are there any bonus records that we need to
-            //  return?
-            r = getBonusRecord();
+		/*
+		* How to recognise end of stream?
+		* In the best case, the underlying input stream (in) ends just after the last EOF record
+		* Usually however, the stream is padded with an arbitrary byte count.  Excel and most apps
+		* reliably use zeros for padding and if this were always the case, this code could just
+		* skip all the (zero sized) records with sid==0.  However, bug 46987 shows a file with
+		* non-zero padding that is read OK by Excel (Excel also fixes the padding).
+		*
+		* So to properly detect the workbook end of stream, this code has to identify the last
+		* EOF record.  This is not so easy because the workbook bof+eof pair do not bracket the
+		* whole stream.  The worksheets follow the workbook, but it is not easy to tell how many
+		* sheet sub-streams should be present.  Hence we are looking for an EOF record that is not
+		* immediately followed by a BOF record.  One extra complication is that bof+eof sub-
+		* streams can be nested within worksheet streams and it's not clear in these cases what
+		* record might follow any EOF record.  So we also need to keep track of the bof/eof
+		* nesting level.
+		*/
+		_bofDepth=0;
+		_lastRecordWasEOFLevelZero = false;
+	}

-            // If not, ask for the next real record
-            if (r == null) {
-                r = getNextRecord();
-            }
-        }
+	/**
+	 * Returns the next (complete) record from the
+	 * stream, or null if there are no more.
+	 */
+	public Record nextRecord() {
+		Record r;
+		r = getNextMultipleNumberRecord();
+		if (r != null) {
+			// found a NumberRecord (expanded from a recent MULRK record)
+			return r;
+		}
+		while (true) {
+			if (!_recStream.hasNextRecord()) {
+				// recStream is exhausted;
+	    		return null;
+			}

-        // All done
-        return r;
-    }
+			// step underlying RecordInputStream to the next record
+			_recStream.nextRecord();

-    /**
-     * If there are any "bonus" records, that should
-     * be returned before processing new ones,
-     * grabs the next and returns it.
-     * If not, returns null;
-     */
-    private Record getBonusRecord() {
-        if (!bonusRecords.isEmpty()) {
-            return (Record) bonusRecords.removeFirst();
-        }
-        return null;
-    }
+			if (_lastRecordWasEOFLevelZero) {
+				// Potential place for ending the workbook stream
+				// Check that the next record is not BOFRecord(0x0809)
+				// Normally the input stream contains only zero padding after the last EOFRecord,
+				// but bug 46987 suggests that the padding may be garbage.
+				// This code relies on the padding bytes not starting with BOFRecord.sid
+				if (_recStream.getSid() != BOFRecord.sid) {
+					return null;
+				}
+				// else - another sheet substream starting here
+			}

-    /**
-     * Returns the next available record, or null if
-     * this pass didn't return a record that's
-     * suitable for returning (eg was a continue record).
-     */
-    private Record getNextRecord() {
-        /*
-        * How to recognise end of stream?
-        * In the best case, the underlying input stream (in) ends just after the last EOF record
-        * Usually however, the stream is padded with an arbitrary byte count.  Excel and most apps
-        * reliably use zeros for padding and if this were always the case, this code could just
-        * skip all the (zero sized) records with sid==0.  However, bug 46987 shows a file with
-        * non-zero padding that is read OK by Excel (Excel also fixes the padding).
-        *
-        * So to properly detect the workbook end of stream, this code has to identify the last
-        * EOF record.  This is not so easy because the worbook bof+eof pair do not bracket the
-        * whole stream.  The worksheets follow the workbook, but it is not easy to tell how many
-        * sheet sub-streams should be present.  Hence we are looking for an EOF record that is not
-        * immediately followed by a BOF record.  One extra complication is that bof+eof sub-
-        * streams can be nested within worksheet streams and it's not clear in these cases what
-        * record might follow any EOF record.  So we also need to keep track of the bof/eof
-        * nesting level.
-        */
+			r = readNextRecord();
+			if (r == null) {
+				// some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
+				continue;
+			}
+			return r;
+		}
+	}

-        if (recStream.hasNextRecord()) {
-            // Grab our next record
-            recStream.nextRecord();
+	/**
+	 * @return the next {@link NumberRecord} from the multiple record group as expanded from
+	 * a recently read {@link MulRKRecord}. <code>null</code> if not present.
+	 */
+	private NumberRecord getNextMultipleNumberRecord() {
+		if (_multipleNumberRecords != null) {
+			int ix = _multipleNumberRecordIndex;
+			if (ix < _multipleNumberRecords.length) {
+				NumberRecord result = _multipleNumberRecords[ix];
+				_multipleNumberRecordIndex = ix + 1;
+				return result;
+			}
+			_multipleNumberRecordIndex = -1;
+			_multipleNumberRecords = null;
+		}
+		return null;
+	}

-            if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
-                // Normally InputStream (in) contains only zero padding after this point
-                complete = true;
-                return null;
-            }
+	/**
+	 * @return the next available record, or <code>null</code> if
+	 * this pass didn't return a record that's
+	 * suitable for returning (eg was a continue record).
+	 */
+	private Record readNextRecord() {

-            Record record = RecordFactory.createSingleRecord(recStream);
-            lastRecordWasEOFLevelZero = false;
+		Record record = RecordFactory.createSingleRecord(_recStream);
+		_lastRecordWasEOFLevelZero = false;

-            if (record instanceof BOFRecord) {
-                bofDepth++;
-                return record;
-            }
+		if (record instanceof BOFRecord) {
+			_bofDepth++;
+			return record;
+		}

-            if (record instanceof EOFRecord) {
-                bofDepth--;
-                if (bofDepth < 1) {
-                    lastRecordWasEOFLevelZero = true;
-                }
+		if (record instanceof EOFRecord) {
+			_bofDepth--;
+			if (_bofDepth < 1) {
+				_lastRecordWasEOFLevelZero = true;
+			}

-                return record;
-            }
+			return record;
+		}

-            if (record instanceof DBCellRecord) {
-                // Not needed by POI.  Regenerated from scratch by POI when spreadsheet is written
-                return null;
-            }
+		if (record instanceof DBCellRecord) {
+			// Not needed by POI.  Regenerated from scratch by POI when spreadsheet is written
+			return null;
+		}

-            if (record instanceof RKRecord) {
-                return RecordFactory.convertToNumberRecord((RKRecord) record);
-            }
+		if (record instanceof RKRecord) {
+			return RecordFactory.convertToNumberRecord((RKRecord) record);
+		}

-            if (record instanceof MulRKRecord) {
-                NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
+		if (record instanceof MulRKRecord) {
+			NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record);

-                List<NumberRecord> list = Arrays.asList(records);
-                bonusRecords.addAll(list.subList(1, list.size()));
+			_multipleNumberRecords = records;
+			_multipleNumberRecordIndex = 1;
+			return records[0];
+		}

-                return records[0];
-            }
+		if (record.getSid() == DrawingGroupRecord.sid
+				&& _lastRecord instanceof DrawingGroupRecord) {
+			DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
+			lastDGRecord.join((AbstractEscherHolderRecord) record);
+			return null;
+		}
+		if (record.getSid() == ContinueRecord.sid) {
+			ContinueRecord contRec = (ContinueRecord) record;

-            if (record.getSid() == DrawingGroupRecord.sid
-                    && lastRecord instanceof DrawingGroupRecord) {
-                DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord;
-                lastDGRecord.join((AbstractEscherHolderRecord) record);
-                return null;
-            } else if (record.getSid() == ContinueRecord.sid) {
-                ContinueRecord contRec = (ContinueRecord) record;
-
-                if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
-                    // Drawing records have a very strange continue behaviour.
-                    //There can actually be OBJ records mixed between the continues.
-                    lastDrawingRecord.processContinueRecord(contRec.getData());
-                    //we must remember the position of the continue record.
-                    //in the serialization procedure the original structure of records must be preserved
-                    if (includeContinueRecords) {
-                        return record;
-                    } else {
-                        return null;
-                    }
-                } else if (lastRecord instanceof DrawingGroupRecord) {
-                    ((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData());
-                    return null;
-                } else if (lastRecord instanceof DrawingRecord) {
-                    ((DrawingRecord) lastRecord).processContinueRecord(contRec.getData());
-                    return null;
-                } else if (lastRecord instanceof UnknownRecord) {
-                    //Gracefully handle records that we don't know about,
-                    //that happen to be continued
-                    return record;
-                } else if (lastRecord instanceof EOFRecord) {
-                    // This is really odd, but excel still sometimes
-                    //  outputs a file like this all the same
-                    return record;
-                } else {
-                    throw new RecordFormatException("Unhandled Continue Record");
-                }
-            } else {
-                lastRecord = record;
-                if (record instanceof DrawingRecord) {
-                    lastDrawingRecord = (DrawingRecord) record;
-                }
-
-                return record;
-            }
-
-        } else {
-            // No more records
-            complete = true;
-            return null;
-        }
-    }
-
-    /**
-     * Return or not ContinueRecord in nextRecord
-     */
-    public void setIncludeContinueRecords(boolean includeContinueRecords) {
-        this.includeContinueRecords = includeContinueRecords;
-    }
-}
+			if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
+				// Drawing records have a very strange continue behaviour.
+				//There can actually be OBJ records mixed between the continues.
+				_lastDrawingRecord.processContinueRecord(contRec.getData());
+				//we must remember the position of the continue record.
+				//in the serialization procedure the original structure of records must be preserved
+				if (_shouldIncludeContinueRecords) {
+					return record;
+				}
+				return null;
+			}
+			if (_lastRecord instanceof DrawingGroupRecord) {
+				((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
+				return null;
+			}
+			if (_lastRecord instanceof DrawingRecord) {
+				((DrawingRecord) _lastRecord).processContinueRecord(contRec.getData());
+				return null;
+			}
+			if (_lastRecord instanceof UnknownRecord) {
+				//Gracefully handle records that we don't know about,
+				//that happen to be continued
+				return record;
+			}
+			if (_lastRecord instanceof EOFRecord) {
+				// This is really odd, but excel still sometimes
+				//  outputs a file like this all the same
+				return record;
+			}
+			throw new RecordFormatException("Unhandled Continue Record");
+		}
+		_lastRecord = record;
+		if (record instanceof DrawingRecord) {
+			_lastDrawingRecord = (DrawingRecord) record;
+		}
+		return record;
+	}
+}