From de1d2a6247fa6d950e5e53021a0896c7820f6623 Mon Sep 17 00:00:00 2001
From: Erik-Berndt Scheper <erik.berndt.scheper@gmail.com>
Date: Wed, 24 Nov 2010 13:58:54 +0100
Subject: [PATCH] HHH-5371 - Document sql table partitioning for audit tables

---
 .../main/docbook/en-US/Envers_Reference.xml   |   1 +
 ...ibernate_Envers_-_Easy_Entity_Auditing.xml |   1 +
 .../docbook/en-US/content/partitioning.xml    | 358 ++++++++++++++++++
 3 files changed, 360 insertions(+)
 create mode 100644 documentation/envers/src/main/docbook/en-US/content/partitioning.xml
diff --git a/documentation/envers/src/main/docbook/en-US/Envers_Reference.xml b/documentation/envers/src/main/docbook/en-US/Envers_Reference.xml
index 01579e0356..ab6f22776a 100644
--- a/documentation/envers/src/main/docbook/en-US/Envers_Reference.xml
+++ b/documentation/envers/src/main/docbook/en-US/Envers_Reference.xml
@@ -54,6 +54,7 @@
     <xi:include href="content/queries.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/schema.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/tables.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+    <xi:include href="content/partitioning.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/source.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/exceptions.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/migration.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
diff --git a/documentation/envers/src/main/docbook/en-US/Hibernate_Envers_-_Easy_Entity_Auditing.xml b/documentation/envers/src/main/docbook/en-US/Hibernate_Envers_-_Easy_Entity_Auditing.xml
index 7c312de837..c4149d724d 100644
--- a/documentation/envers/src/main/docbook/en-US/Hibernate_Envers_-_Easy_Entity_Auditing.xml
+++ b/documentation/envers/src/main/docbook/en-US/Hibernate_Envers_-_Easy_Entity_Auditing.xml
@@ -54,6 +54,7 @@
     <xi:include href="content/queries.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/schema.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/tables.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
+    <xi:include href="content/partitioning.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/source.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/exceptions.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
     <xi:include href="content/migration.xml" xmlns:xi="http://www.w3.org/2001/XInclude" />
diff --git a/documentation/envers/src/main/docbook/en-US/content/partitioning.xml b/documentation/envers/src/main/docbook/en-US/content/partitioning.xml
new file mode 100644
index 0000000000..86ff8a06d1
--- /dev/null
+++ b/documentation/envers/src/main/docbook/en-US/content/partitioning.xml
@@ -0,0 +1,358 @@
+<?xml version='1.0' encoding="UTF-8"?>
+<!--
+  ~ Hibernate, Relational Persistence for Idiomatic Java
+  ~
+  ~ Copyright (c) 2008, Red Hat Middleware LLC or third-party contributors as
+  ~ indicated by the @author tags or express copyright attribution
+  ~ statements applied by the authors.  All third-party contributions are
+  ~ distributed under license by Red Hat Middleware LLC.
+  ~
+  ~ This copyrighted material is made available to anyone wishing to use, modify,
+  ~ copy, or redistribute it subject to the terms and conditions of the GNU
+  ~ Lesser General Public License, as published by the Free Software Foundation.
+  ~
+  ~ This program is distributed in the hope that it will be useful,
+  ~ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  ~ or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+  ~ for more details.
+  ~
+  ~ You should have received a copy of the GNU Lesser General Public License
+  ~ along with this distribution; if not, write to:
+  ~ Free Software Foundation, Inc.
+  ~ 51 Franklin Street, Fifth Floor
+  ~ Boston, MA  02110-1301  USA
+  -->
+
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
+<!ENTITY % BOOK_ENTITIES SYSTEM "../Hibernate_Envers_-_Easy_Entity_Auditing.ent">
+%BOOK_ENTITIES;
+]>
+
+<chapter id="partitioning">
+    <title>Audit table partitioning</title>
+
+    <section id="partitioning-benefits">
+
+        <title>Benefits of audit table partitioning</title>
+
+        <para>
+            Because audit tables tend to grow indefinitely they can quickly become really large. When the audit tables have grown
+            to a certain limit (varying per RDBMS and/or operating system) it makes sense to start using table partitioning.
+            SQL table partitioning offers a lot of advantages including, but certainly not limited to:
+            <orderedlist>
+                <listitem>
+                    <para>
+                        Improved query performance by selectively moving rows to various partitions (or even purging old rows)
+                    </para>
+                </listitem>
+                <listitem>
+                    <para>
+                        Faster data loads, index creation, etc.
+                    </para>
+                </listitem>
+            </orderedlist>
+        </para>
+        
+    </section>
+
+    <section id="partitioning-columns">
+
+        <title>Suitable columns for audit table partitioning</title>
+        <para>         
+            Generally SQL tables must be partitioned on a column that exists within the table. As a rule it makes sense to use
+            either the <emphasis>end revision</emphasis> or the <emphasis>end revision timestamp</emphasis> column for 
+            partioning of audit tables.
+            <note>
+                <para>
+                    End revision information is not available for the default AuditStrategy. 
+                </para>
+
+                <para>
+                    Therefore the following Envers configuration options are required: 
+                </para>
+                <para>
+                    <literal>org.hibernate.envers.audit_strategy</literal> = 
+                    <literal>org.hibernate.envers.strategy.ValidityAuditStrategy</literal>
+                </para>
+                <para>
+                    <literal>org.hibernate.envers.audit_strategy_validity_store_revend_timestamp</literal> = 
+                    <literal>true</literal>
+                </para>
+
+                <para>
+                    Optionally, you can also override the default values following properties: 
+                </para>
+                <para>
+                    <literal>org.hibernate.envers.audit_strategy_validity_end_rev_field_name</literal>
+                </para>
+                <para>
+                    <literal>org.hibernate.envers.audit_strategy_validity_revend_timestamp_field_name</literal>
+                </para>
+
+                <para>
+                    For more information, see <xref linkend="configuration"/>.
+                </para>
+            </note>
+        </para>
+    
+        <para>
+           The reason why the end revision information should be used for audit table partioning is based on the assumption that
+           audit tables should be partionioned on an &apos;increasing level of interestingness&apos;, like so: 
+        </para>
+        
+        <para>
+            <orderedlist>
+                <listitem>
+                    <para>
+                        A couple of partitions with audit data that is not very (or no longer) interesting. 
+                        This can be stored on slow media, and perhaps even be purged eventually.  
+                    </para>
+                </listitem>
+                <listitem>
+                    <para>
+                        Some partitions for audit data that is potentially interesting.
+                    </para>
+                </listitem>
+                <listitem>
+                    <para>
+                        One partition for audit data that is most likely to be interesting. 
+                        This should be stored on the fastest media, both for reading and writing.  
+                    </para>
+                </listitem>
+            </orderedlist>
+        </para>
+
+        
+    </section>
+
+    <section id="partitioning-example">
+
+        <title>Audit table partitioning example</title>
+        <para>
+            In order to determine a suitable column for the &apos;increasing level of interestingness&apos;,
+            consider a simplified example of a salary registration for an unnamed agency. 
+        </para>
+
+        <para>
+            Currently, the salary table contains the following rows for a certain person X:
+                      
+            <table frame="topbot">
+                <title>Salaries table</title>
+                <tgroup cols="2">
+                    <colspec colname="c1" colwidth="1*"/>
+                    <colspec colname="c2" colwidth="1*"/>
+                    <thead>
+                        <row>
+                            <entry>Year</entry>
+                            <entry>Salary (USD)</entry>
+                        </row>
+                    </thead>
+                    <tbody>
+                        <row>
+                            <entry>2006</entry>
+                            <entry>3300</entry>
+                        </row>
+                        <row>
+                            <entry>2007</entry>
+                            <entry>3500</entry>
+                        </row>
+                        <row>
+                            <entry>2008</entry>
+                            <entry>4000</entry>
+                        </row>
+                        <row>
+                            <entry>2009</entry>
+                            <entry>4500</entry>
+                        </row>
+                    </tbody>
+                </tgroup>
+            </table>
+        </para>
+
+        <para>
+            The salary for the current fiscal year (2010) is unknown. The agency requires that all changes in registered 
+            salaries for a fiscal year are recorded (i.e. an audit trail). The rationale behind this is that decisions 
+            made at a certain date are based on the registered salary at that time. And at any time it must be possible 
+            reproduce the reason why a certain decision was made at a certain date.
+        </para>
+
+        <para>
+            The following audit information is available, sorted on in order of occurrence:
+
+            <table frame="topbot">
+                <title>Salaries - audit table</title>
+                <tgroup cols="5">
+                    <colspec colname="c1" colwidth="1*"/>
+                    <colspec colname="c2" colwidth="1*"/>
+                    <colspec colname="c3" colwidth="1*"/>
+                    <colspec colname="c4" colwidth="1*"/>
+                    <colspec colname="c5" colwidth="1*"/>
+                    <thead>
+                        <row>
+                            <entry>Year</entry>
+                            <entry>Revision type</entry>
+                            <entry>Revision timestamp</entry>
+                            <entry>Salary (USD)</entry>
+                            <entry>End revision timestamp</entry>
+                        </row>
+                    </thead>
+                    <tbody>
+                        <row>
+                            <entry>2006</entry>
+                            <entry>ADD</entry>
+                            <entry>2007-04-01</entry>
+                            <entry>3300</entry>
+                            <entry>null</entry>
+                        </row>
+                        <row>
+                            <entry>2007</entry>
+                            <entry>ADD</entry>
+                            <entry>2008-04-01</entry>
+                            <entry>35</entry>
+                            <entry>2008-04-02</entry>
+                        </row>
+                        <row>
+                            <entry>2007</entry>
+                            <entry>MOD</entry>
+                            <entry>2008-04-02</entry>
+                            <entry>3500</entry>
+                            <entry>null</entry>
+                        </row>
+                        <row>
+                            <entry>2008</entry>
+                            <entry>ADD</entry>
+                            <entry>2009-04-01</entry>
+                            <entry>3700</entry>
+                            <entry>2009-07-01</entry>
+                        </row>
+                        <row>
+                            <entry>2008</entry>
+                            <entry>MOD</entry>
+                            <entry>2009-07-01</entry>
+                            <entry>4100</entry>
+                            <entry>2010-02-01</entry>
+                        </row>
+                        <row>
+                            <entry>2008</entry>
+                            <entry>MOD</entry>
+                            <entry>2010-02-01</entry>
+                            <entry>4000</entry>
+                            <entry>null</entry>
+                        </row>
+                        <row >
+                            <entry>2009</entry>
+                            <entry>ADD</entry>
+                            <entry>2010-04-01</entry>
+                            <entry>4500</entry>
+                            <entry>null</entry>
+                        </row>
+                    </tbody>
+                </tgroup>
+            </table>
+        </para>
+
+        <section id="partitioning-example-column">
+
+            <title>Determining a suitable partitioning column</title>
+            <para>
+                To partition this data, the &apos;level of interestingness&apos; must be defined. 
+                Consider the following:
+                <orderedlist>
+                    <listitem>
+                        <para>
+                            For fiscal year 2006 there is only one revision. It has the oldest <emphasis>revision timestamp</emphasis>
+                            of all audit rows, but should still be regarded as interesting because it is the latest modification 
+                            for this fiscal year in the salary table; its <emphasis>end revision timestamp</emphasis> is null.
+                        </para>
+                        <para>
+                            Also note that it would be very unfortunate if in 2011 there would be an update of the salary for fiscal 
+                            year 2006 (which is possible in until at least 10 years after the fiscal year) and the audit 
+                            information would have been moved to a slow disk (based on the age of the 
+                            <emphasis>revision timestamp</emphasis>). Remember that in this case Envers will have to update 
+                            the <emphasis>end revision timestamp</emphasis> of the most recent audit row.
+                        </para>
+                    </listitem>
+                    <listitem>
+                        <para>
+                             There are two revisions in the salary of fiscal year 2007 which both have nearly the same 
+                             <emphasis>revision timestamp</emphasis> and a different <emphasis>end revision timestamp</emphasis>. 
+                             On first sight it is evident that the first revision was a mistake and probably uninteresting. 
+                             The only interesting revision for 2007 is the one with <emphasis>end revision timestamp</emphasis> null.
+                        </para>
+                    </listitem>
+                </orderedlist>
+                        
+                Based on the above, it is evident that only the <emphasis>end revision timestamp</emphasis> is suitable for 
+                audit table partitioning. The <emphasis>revision timestamp</emphasis> is not suitable. 
+            </para>
+            
+        </section>
+
+        <section id="partitioning-example-scheme">
+
+            <title>Determining a suitable partitioning scheme</title>
+            <para>
+                A possible partitioning scheme for the salary table would be as follows:
+                <orderedlist>
+                    <listitem>
+                        <para>
+                            <emphasis>end revision timestamp</emphasis> year = 2008
+                        </para>
+                        <para>
+                            This partition contains audit data that is not very (or no longer) interesting. 
+                        </para>
+                    </listitem>
+                    <listitem>
+                        <para>
+                            <emphasis>end revision timestamp</emphasis> year = 2009
+                        </para>
+                        <para>
+                            This partition contains audit data that is potentially interesting.
+                        </para>
+                    </listitem>
+                    <listitem>
+                        <para>
+                            <emphasis>end revision timestamp</emphasis> year >= 2010 or null  
+                        </para>
+                        <para>
+                            This partition contains the most interesting audit data. 
+                        </para>
+                    </listitem>
+                </orderedlist>
+            </para>
+
+            <para>
+                This partitioning scheme also covers the potential problem of the update of the 
+                <emphasis>end revision timestamp</emphasis>, which occurs if a row in the audited table is modified. 
+                Even though Envers will update the <emphasis>end revision timestamp</emphasis> of the audit row to 
+                the system date at the instant of modification, the audit row will remain in the same partition
+                (the &apos;extension bucket&apos;).
+            </para>
+            
+            <para>
+                And sometime in 2011, the last partition (or &apos;extension bucket&apos;) is split into two new partitions:
+                <orderedlist>
+                    <listitem>
+                        <para>
+                            <emphasis>end revision timestamp</emphasis> year = 2010
+                        </para>
+                        <para>
+                            This partition contains audit data that is potentially interesting (in 2011).
+                        </para>
+                    </listitem>
+                    <listitem>
+                        <para>
+                            <emphasis>end revision timestamp</emphasis> year >= 2011 or null  
+                        </para>
+                        <para>
+                            This partition contains the most interesting audit data and is the new &apos;extension bucket&apos;. 
+                        </para>
+                    </listitem>
+                </orderedlist>
+            </para>
+            
+        </section>
+        
+    </section>
+</chapter>
+