Added an implementation of Eugene Myers difference algorithm.

JIRA: COLLECTIONS-404

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/collections/trunk@1311964 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Luc Maisonobe 2012-04-10 20:09:44 +00:00
parent 2fa3c8c5f0
commit eb88ae9669
12 changed files with 1355 additions and 0 deletions

View File

@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/** This interface should be implemented by user object to walk
* through {@link EditScript EditScript} objects.
* <p>Users should implement this interface in order to walk through
* the {@link EditScript EditScript} object created by the comparison
* of two sequences. This is a direct application of the visitor
* design pattern. The {@link EditScript#visit EditScript.visit}
* method takes an object implementing this interface as an argument,
* it will perform the loop over all commands in the script and the
* proper methods of the user class will be called as the commands are
* encountered.</p>
* <p>The implementation of the user visitor class will depend on the
* need. Here are two examples.
* </p>
*
* <p>
* The first example is a visitor that build the longest common
* subsequence:
* <pre>
* import org.apache.commons.collections.list.difference.CommandVisitor;
*
* import java.util.ArrayList;
*
* public class LongestCommonSubSequence implements CommandVisitor {
*
* public LongestCommonSubSequence() {
* a = new ArrayList();
* }
*
* public void visitInsertCommand(Object object) {
* }
*
* public void visitKeepCommand(Object object) {
* a.add(object);
* }
*
* public void visitDeleteCommand(Object object) {
* }
*
* public Object[] getSubSequence() {
* return a.toArray();
* }
*
* private arrayList a;
*
* }
* </pre>
* </p>
*
* <p>
* The second example is a visitor that shows the commands and the way
* they transform the first sequence into the second one:
* <pre>
* import org.apache.commons.collections.list.difference.CommandVisitor;
*
* import java.util.Arrays;
* import java.util.ArrayList;
* import java.util.Iterator;
*
* public class ShowVisitor implements CommandVisitor {
*
* public ShowVisitor(Object[] sequence1) {
* v = new ArrayList();
* v.addAll(Arrays.asList(sequence1));
* index = 0;
* }
*
* public void visitInsertCommand(Object object) {
* v.insertElementAt(object, index++);
* display("insert", object);
* }
*
* public void visitKeepCommand(Object object) {
* ++index;
* display("keep ", object);
* }
*
* public void visitDeleteCommand(Object object) {
* v.remove(index);
* display("delete", object);
* }
*
* private void display(String commandName, Object object) {
* System.out.println(commandName + " " + object + " ->" + this);
* }
*
* public String toString() {
* StringBuffer buffer = new StringBuffer();
* for (Iterator iter = v.iterator(); iter.hasNext();) {
* buffer.append(' ').append(iter.next());
* }
* return buffer.toString();
* }
*
* private ArrayList v;
* private int index;
*
* }
* </pre>
* </p>
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public interface CommandVisitor<T> {
/** Method called when an insert command is encountered.
* @param object object to insert (this object comes from the
* second sequence)
*/
void visitInsertCommand(T object);
/** Method called when a keep command is encountered.
* @param object object to keep (this object comes from the
* first sequence)
*/
void visitKeepCommand(T object);
/** Method called when a delete command is encountered.
* @param object object to delete (this object comes from the
* first sequence)
*/
void visitDeleteCommand(T object);
}

View File

@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/** Command representing the deletion of one object of the first sequence.
* When one object of the first sequence has no corresponding object
* in the second sequence at the right place, the {@link EditScript
* edit script} transforming the first sequence into the second
* sequence uses an instance of this class to represent the deletion
* of this object. The objects embedded in these type of commands
* always come from the first sequence.
* @see SequencesComparator
* @see EditScript
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public class DeleteCommand<T> extends EditCommand<T> {
/** Simple constructor.
* Creates a new instance of DeleteCommand
* @param object the object of the first sequence that should be deleted
*/
public DeleteCommand(T object) {
super(object);
}
/** Accept a visitor.
* When a <code>DeleteCommand</code> accepts a visitor, it calls
* its {@link CommandVisitor#visitDeleteCommand
* visitDeleteCommand} method.
* @param visitor the visitor to be accepted
*/
public void accept(CommandVisitor<T> visitor) {
visitor.visitDeleteCommand(object);
}
}

View File

@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/** Abstract base class for all commands used to transform an objects
* sequence into another one.
* <p>When two objects sequences are compared through the {@link
* SequencesComparator#getScript SequencesComparator.getScript}
* method, the result is provided has a {@link EditScript script}
* containing the commands that progressively transform the first
* sequence into the second one.</p>
* <p>There are only three types of commands, all of which are
* subclasses of this abstract class. Each command is associated with
* one object belonging to at least one of the sequences. These
* commands are {@link InsertCommand InsertCommand} which correspond
* to an object of the second sequence beeing inserted into the first
* sequence, {@link DeleteCommand DeleteCommand} which correspond to
* an object of the first sequence beeing removed and {@link
* KeepCommand KeepCommand} which correspond to an object of the first
* sequence which <code>equals</code> an object in the second
* sequence. It is guaranteed that comparison is always performed this
* way (i.e. the <code>equals</code> method of the object from the
* first sequence is used and the object passed as an argument comes
* from the second sequence) ; this can be important if subclassing is
* used for some elements in the first sequence and the
* <code>equals</code> method is specialized.</p>
* @see SequencesComparator
* @see EditScript
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public abstract class EditCommand<T> {
/** Simple constructor.
* Creates a new instance of EditCommand
* @param object reference to the object associated with this
* command, this refers to an element of one of the sequences
* beeing compared
*/
protected EditCommand(T object) {
this.object = object;
}
/** Accept a visitor.
* This method is invoked for each commands belonging to an {@link
* EditScript EditScript}, in order to implement the visitor
* design pattern
* @param visitor the visitor to be accepted
*/
public abstract void accept(CommandVisitor<T> visitor);
/** Object on which the command should be applied. */
protected T object;
}

View File

@ -0,0 +1,126 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
import java.util.ArrayList;
import java.util.List;
/**
* This class gathers all the {@link EditCommand commands} needed to
* transform one objects sequence into another objects sequence.
* <p>An edit script is the most general view of the differences
* between two sequences. It is built as the result of the comparison
* between two sequences by the {@link SequencesComparator
* SequencesComparator} class. The user can walk through it using
* the <em>visitor</em> design pattern.</p>
* <p>It is guaranteed that the objects embedded in the {@link
* InsertCommand insert commands} come from the second sequence and
* that the objects embedded in either the {@link DeleteCommand delete
* commands} or {@link KeepCommand keep commands} come from the first
* sequence. This can be important if subclassing is used for some
* elements in the first sequence and the <code>equals</code> method
* is specialized.</p>
* @see SequencesComparator
* @see EditCommand
* @see CommandVisitor
* @see ReplacementsHandler
*
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public class EditScript<T> {
/** Container for the commands. */
private List<EditCommand<T>> commands;
/** Length of the longest common subsequence. */
private int lcsLength;
/** Number of modifications. */
private int modifications;
/** Simple constructor.
* Creates a new empty script.
*/
public EditScript() {
commands = new ArrayList<EditCommand<T>>();
lcsLength = 0;
modifications = 0;
}
/** Add a keep command to the script.
* @param command command to add
*/
public void append(KeepCommand<T> command) {
commands.add(command);
++lcsLength;
}
/** Add an insert command to the script.
* @param command command to add
*/
public void append(InsertCommand<T> command) {
commands.add(command);
++modifications;
}
/** Add a delete command to the script.
* @param command command to add
*/
public void append(DeleteCommand<T> command) {
commands.add(command);
++modifications;
}
/** Visit the script.
* The script implements the <em>visitor</em> design pattern, this
* method is the entry point to which the user supplies its own
* visitor, the script will be responsible to drive it through the
* commands in order and call the appropriate method as each
* command is encountered.
* @param visitor the visitor that will visit all commands in turn
*/
public void visit(CommandVisitor<T> visitor) {
for (EditCommand<T> command : commands) {
command.accept(visitor);
}
}
/** Get the length of the Longest Common Subsequence (LCS).
* The length of the longest common subsequence is the number of
* {@link KeepCommand keep commands} in the script.
* @return length of the Longest Common Subsequence
*/
public int getLCSLength() {
return lcsLength;
}
/** Get the number of effective modifications.
* The number of effective modification is the number of {@link
* DeleteCommand delete} and {@link InsertCommand insert} commands
* in the script.
* @return number of effective modifications
*/
public int getModifications() {
return modifications;
}
}

View File

@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/** Command representing the insertion of one object of the second sequence.
* When one object of the second sequence has no corresponding object
* in the first sequence at the right place, the {@link EditScript
* edit script} transforming the first sequence into the second
* sequence uses an instance of this class to represent the insertion
* of this object. The objects embedded in these type of commands
* always come from the second sequence.
* @see SequencesComparator
* @see EditScript
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public class InsertCommand<T> extends EditCommand<T> {
/** Simple constructor.
* Creates a new instance of InsertCommand
* @param object the object of the second sequence that should be inserted
*/
public InsertCommand(T object) {
super(object);
}
/** Accept a visitor.
* When an <code>InsertCommand</code> accepts a visitor, it calls
* its {@link CommandVisitor#visitInsertCommand
* visitInsertCommand} method.
* @param visitor the visitor to be accepted
*/
public void accept(CommandVisitor<T> visitor) {
visitor.visitInsertCommand(object);
}
}

View File

@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/** Command representing the keeping of one object present in both sequences.
* When one object of the first sequence <code>equals</code> another
* objects in the second sequence at the right place, the {@link
* EditScript edit script} transforming the first sequence into the
* second sequence uses an instance of this class to represent the
* keeping of this object. The objects embedded in these type of
* commands always come from the first sequence.
* @see SequencesComparator
* @see EditScript
* @since 4.0
* @author Jordane Sarda
* @author Luc Maisonobe
* @version $Id$
*/
public class KeepCommand<T> extends EditCommand<T> {
/** Simple constructor.
* Creates a new instance of KeepCommand
* @param object the object belonging to both sequences (the
* object is a reference to the instance in the first sequence
* which is known to be equal to an instance in the second
* sequence)
*/
public KeepCommand(T object) {
super(object);
}
/** Accept a visitor.
* When a <code>KeepCommand</code> accepts a visitor, it calls
* its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method.
* @param visitor the visitor to be accepted
*/
public void accept(CommandVisitor<T> visitor) {
visitor.visitKeepCommand(object);
}
}

View File

@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
import java.util.ArrayList;
import java.util.List;
/**
* This class handles sequences of replacements resulting from a
* comparison.
* <p>The comparison of two objects sequences leads to the
* identification of common parts and parts which only belong to the
* first or to the second sequence. The common parts appear in the
* edit script in the form of <em>keep</em> commands, they can be considered
* as synchronization objects between the two sequences. These
* synchronization objects split the two sequences in synchronized
* sub-sequences. The first sequence can be transformed into the second
* one by replacing each synchronized sub-sequence of the first
* sequence by the corresponding sub-sequence of the second
* sequence. This is a synthetic way to see an {@link EditScript edit
* script}, replacing individual {@link DeleteCommand delete}, {@link
* KeepCommand keep} and {@link InsertCommand insert} commands by
* fewer replacements acting on complete sub-sequences.</p>
* <p>This class is devoted to perform this interpretation. It visits
* an {@link EditScript edit script} (because it implements the {@link
* CommandVisitor CommandVisitor} interface) and calls a user-supplied
* handler implementing the {@link ReplacementsHandler
* ReplacementsHandler} interface to process the sub-sequences.</p>
* @see ReplacementsHandler
* @see EditScript
* @see SequencesComparator
* @since 4.0
* @author Luc Maisonobe
* @author Jordane Sarda
* @version $Id$
*/
public class ReplacementsFinder<T> implements CommandVisitor<T> {
private List<T> pendingInsertions;
private List<T> pendingDeletions;
private int skipped;
/** Handler to call when synchronized sequences are found. */
private ReplacementsHandler<T> handler;
/** Simple constructor.
* Creates a new instance of ReplacementsFinder
* @param handler handler to call when synchronized sequences are
* found
*/
public ReplacementsFinder(ReplacementsHandler<T> handler) {
pendingInsertions = new ArrayList<T>();
pendingDeletions = new ArrayList<T>();
skipped = 0;
this.handler = handler;
}
/** Add an object to the pending insertions set.
* @param object object to insert
*/
public void visitInsertCommand(T object) {
pendingInsertions.add(object);
}
/** Handle a synchronization object.
* <p>When a synchronization object is identified, the pending
* insertions and pending deletions sets are provided to the user
* handler as subsequences.</p>
* @param object synchronization object detected
*/
public void visitKeepCommand(T object) {
if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) {
++skipped;
} else {
handler.handleReplacement(skipped, pendingDeletions, pendingInsertions);
pendingDeletions.clear();
pendingInsertions.clear();
skipped = 1;
}
}
/** Add an object to the pending deletions set.
* @param object object to delete
*/
public void visitDeleteCommand(T object) {
pendingDeletions.add(object);
}
}

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
import java.util.List;
/** This interface is devoted to handle synchronized replacement sequences.
* @see ReplacementsFinder
* @since 4.0
* @author Luc Maisonobe
* @version $Id$
*/
public interface ReplacementsHandler<T> {
/** Handle two synchronized sequences.
* <p>This method is called by a {@link ReplacementsFinder
* ReplacementsFinder} instance when it has synchronized two
* sub-sequences of object arrays being compared, and at least one
* of the sequences is non-empty. Since the sequences are
* synchronized, the objects before the two sub-sequences are equals
* (if they exist). This property also holds for the objects after the
* two sub-sequences.</p>
* <p>The replacement is defined as replacing the <code>from</code>
* sub-sequence into the <code>to</code> sub-sequence.</p>
* @param skipped number of tokens skipped since the last call (i.e.
* number of tokens that were in both sequences), this number should
* be strictly positive except on the very first call where it can be
* zero (if the first object of the two sequences are different)
* @param from sub-sequence of objects coming from the first sequence
* @param to sub-sequence of objects coming from the second sequence
*/
public void handleReplacement(int skipped, List<T> from, List<T> to);
}

View File

@ -0,0 +1,265 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
import java.util.List;
/**
* This class allows to compare two objects sequences.
* <p>The two sequences can hold any object type, as only the
* <code>equals</code> method is used to compare the elements of the
* sequences. It is guaranteed that the comparisons will always be
* done as <code>o1.equals(o2)</code> where <code>o1</code> belongs to
* the first sequence and <code>o2</code> belongs to the second
* sequence. This can be important if subclassing is used for some
* elements in the first sequence and the <code>equals</code> method
* is specialized.</p>
* <p>Comparison can be seen from two points of view: either as
* giving the smallest modification allowing to transform the first
* sequence into the second one, or as giving the longest sequence
* which is a subsequence of both initial sequences. The
* <code>equals</code> method is used to compare objects, so any
* object can be put into sequences. Modifications include deleting,
* inserting or keeping one object, starting from the beginning of the
* first sequence.</p>
* <p>This class implements the comparison algorithm, which is the
* very efficient algorithm from Eugene W. Myers <a
* href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">An
* O(ND) Difference Algorithm and Its Variations</a>. This algorithm
* produces the shortest possible {@link
* org.apache.commons.collections.list.difference.EditScript edit script}
* containing all the {@link
* org.apache.commons.collections.list.difference.EditCommand commands} needed to
* transform the first sequence into the second one.</p>
* @see EditScript
* @see EditCommand
* @see CommandVisitor
* @since 4.0
* @author Jordane Sarda
* @version $Id$
*/
public class SequencesComparator<T> {
/** First sequence. */
private List<T> sequence1;
/** Second sequence. */
private List<T> sequence2;
/** Temporary variables. */
private int[] vDown;
private int[] vUp;
/** Simple constructor.
* <p>Creates a new instance of SequencesComparator</p>
* <p>It is <em>guaranteed</em> that the comparisons will always be
* done as <code>o1.equals(o2)</code> where <code>o1</code> belongs
* to the first sequence and <code>o2</code> belongs to the second
* sequence. This can be important if subclassing is used for some
* elements in the first sequence and the <code>equals</code> method
* is specialized.</p>
* @param sequence1 first sequence to be compared
* @param sequence2 second sequence to be compared
*/
public SequencesComparator(List<T> sequence1, List<T> sequence2) {
this.sequence1 = sequence1;
this.sequence2 = sequence2;
int size = sequence1.size() + sequence2.size() + 2;
vDown = new int[size];
vUp = new int[size];
}
/** Build a snake.
* @param start the value of the start of the snake
* @param diag the value of the diagonal of the snake
* @param end1 the value of the end of the first sequence to be compared
* @param end2 the value of the end of the second sequence to be compared
* @return the snake built
*/
private Snake buildSnake(int start, int diag, int end1, int end2) {
int end = start;
while (((end - diag) < end2)
&& (end < end1)
&& sequence1.get(end).equals(sequence2.get(end - diag))) {
++end;
}
return new Snake(start, end, diag);
}
/** Get the middle snake corresponding to two subsequences of the
* main sequences.
* The snake is found using the MYERS Algorithm (this algorithms has
* also been implemented in the GNU diff program). This algorithm is
* explained in Eugene Myers article: <a
* href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">An
* O(ND) Difference Algorithm and Its Variations</a>.
* @param start1 the begin of the first sequence to be compared
* @param end1 the end of the first sequence to be compared
* @param start2 the begin of the second sequence to be compared
* @param end2 the end of the second sequence to be compared
* @return the middle snake
*/
private Snake getMiddleSnake(int start1, int end1, int start2, int end2) {
// Myers Algorithm
//Initialisations
int m = end1 - start1;
int n = end2 - start2;
if ((m == 0) || (n == 0)) {
return null;
}
int delta = m - n;
int sum = n + m;
int offset = ((sum % 2 == 0) ? sum : (sum + 1)) / 2;
vDown[1+offset] = start1;
vUp[1+offset] = end1 + 1;
for (int d = 0; d <= offset ; ++d) {
// Down
for (int k = -d; k <= d; k += 2) {
// First step
int i = k + offset;
if ((k == -d) || ((k != d) && (vDown[i-1] < vDown[i+1]))) {
vDown[i] = vDown[i+1];
} else {
vDown[i] = vDown[i-1] + 1;
}
int x = vDown[i];
int y = x - start1 + start2 - k;
while ((x < end1) && (y < end2) && (sequence1.get(x).equals(sequence2.get(y)))) {
vDown[i] = ++x;
++y;
}
// Second step
if (((delta % 2) != 0 ) && ((delta - d) <= k) && (k <= (delta + d))) {
if (vUp[i-delta] <= vDown[i]) {
return buildSnake(vUp[i-delta], k + start1 - start2, end1, end2);
}
}
}
// Up
for (int k = (delta - d); k <= (delta + d); k += 2) {
// First step
int i = k + offset - delta;
if ((k == (delta - d))
|| ((k != (delta + d)) && (vUp[i+1] <= vUp[i-1]))) {
vUp[i] = vUp[i+1] - 1;
} else {
vUp[i] = vUp[i-1];
}
int x = vUp[i] - 1;
int y = x - start1 + start2 - k;
while ((x >= start1) && (y >= start2)
&& sequence1.get(x).equals(sequence2.get(y))) {
vUp[i] = x--;
y--;
}
// Second step
if (((delta % 2) == 0) && (-d <= k) && (k <= d) ) {
if (vUp[i] <= vDown[i + delta]) {
return buildSnake(vUp[i], k + start1 - start2, end1, end2);
}
}
}
}
// this should not happen
throw new RuntimeException("Internal Error");
}
/** Build an edit script.
* @param start1 the begin of the first sequence to be compared
* @param end1 the end of the first sequence to be compared
* @param start2 the begin of the second sequence to be compared
* @param end2 the end of the second sequence to be compared
* @param script the edited script
*/
private void buildScript(int start1, int end1, int start2, int end2,
EditScript<T> script) {
Snake middle = getMiddleSnake(start1, end1, start2, end2);
if ((middle == null)
|| ((middle.getStart() == end1) && (middle.getDiag() == (end1 - end2)))
|| ((middle.getEnd() == start1) && (middle.getDiag() == (start1 - start2)))) {
int i = start1;
int j = start2;
while ((i < end1) || (j < end2)) {
if ((i < end1) && (j < end2) && sequence1.get(i).equals(sequence2.get(j))) {
script.append(new KeepCommand<T>(sequence1.get(i)));
++i;
++j;
} else {
if ((end1 - start1) > (end2 - start2)) {
script.append(new DeleteCommand<T>(sequence1.get(i)));
++i;
} else {
script.append(new InsertCommand<T>(sequence2.get(j)));
++j;
}
}
}
} else {
buildScript(start1, middle.getStart(),
start2, middle.getStart() - middle.getDiag(),
script);
for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
script.append(new KeepCommand<T>(sequence1.get(i)));
}
buildScript(middle.getEnd(), end1,
middle.getEnd() - middle.getDiag(), end2,
script);
}
}
/** Get the edit script script.
* <p>It is guaranteed that the objects embedded in the {@link
* InsertCommand insert commands} come from the second sequence and
* that the objects embedded in either the {@link DeleteCommand
* delete commands} or {@link KeepCommand keep commands} come from
* the first sequence. This can be important if subclassing is used
* for some elements in the first sequence and the
* <code>equals</code> method is specialized.</p>
* @return the edit script resulting from the comparison of the two
* sequences
*/
public EditScript<T> getScript() {
EditScript<T> script = new EditScript<T>();
buildScript(0, sequence1.size(), 0, sequence2.size(), script);
return script;
}
}

View File

@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
/**
* This class is a simple placeholder to hold the end part of a path
* under construction in a {@link SequencesComparator
* SequencesComparator}.
* <p>A snake is an internal structure used in Eugene W. Myers
* algorithm (<a
* href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">An
* O(ND) Difference Algorithm and Its Variations</a>).</p>
* @since 4.0
* @author Jordane Sarda
* @version $Id$
*/
public class Snake {
/** Start index. */
private int start;
/** End index. */
private int end;
/** Diagonal number. */
private int diag;
/** Simple constructor.
* Creates a new instance of Snake with default indices
*/
public Snake() {
start = -1;
end = -1;
diag = 0;
}
/** Simple constructor.
* Creates a new instance of Snake with specified indices
* @param start start index of the snake
* @param end end index of the snake
* @param diag diagonal number
*/
public Snake(int start, int end, int diag) {
this.start = start;
this.end = end;
this.diag = diag;
}
/** Get the start index of the snake.
* @return start index of the snake
*/
public int getStart() {
return start;
}
/** Get the end index of the snake.
* @return end index of the snake
*/
public int getEnd() {
return end;
}
/** Get the diagonal number of the snake.
* @return diagonal number of the snake
*/
public int getDiag() {
return diag;
}
}

View File

@ -0,0 +1,93 @@
<!-- $Id$ -->
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<body>
This package provides classes to compare two objects sequences.
<p>
The two sequences can hold any object type, as only the
<code>equals</code> method is used to compare the elements of the
sequences. It is guaranteed that the comparisons will always be done
as <code>o1.equals(o2)</code> where <code>o1</code> belongs to the
first sequence and <code>o2</code> belongs to the second
sequence. This can be important if subclassing is used for some
elements in the first sequence and the <code>equals</code> method is
specialized.
</p>
<p>
Comparison can be seen from two points of view: either as giving the
smallest modification allowing to transform the first sequence into
the second one, or as giving the longest sequence which is a
subsequence of both initial sequences. The <code>equals</code> method
is used to compare objects, so any object can be put into
sequences. Modifications include deleting, inserting or keeping one
object, starting from the beginning of the first sequence. Like most
algorithms of the same type, objects transpositions are not
supported. This means that if a sequence <code>(A, B)</code> is
compared to <code>(B, A)</code>, the result will be either the
sequence of three commands <code>delete A</code>, <code>keep B</code>,
<code>insert A</code> or the sequence <code>insert B</code>,
<code>keep A</code>, <code>delete B</code>.
</p>
<p>
The package uses a very efficient comparison algorithm designed by
Eugene W. Myers and described in his paper: <a
href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">An O(ND)
Difference Algorithm and Its Variations</a>. This algorithm produces
the shortest possible {@link
org.apache.commons.collections.list.difference.EditScript edit script} containing
all the {@link org.apache.commons.collections.list.difference.EditCommand
commands} needed to transform the first sequence into the second
one. The entry point for the user to this algorithm is the {@link
org.apache.commons.collections.list.difference.SequencesComparator
SequencesComparator} class.
</p>
<p>
As explained in Gene Myers paper, the edit script is equivalent to all
other representations and contains all the needed information either
to perform the transformation, of course, or to retrieve the longest
common subsequence for example.
</p>
<p>
If the user needs a very fine grained access to the comparison result,
he needs to go through this script by providing a visitor implementing
the {@link org.apache.commons.collections.list.difference.CommandVisitor
CommandVisitor} interface.
</p>
<p>
Sometimes however, a more synthetic approach is needed. If the user
prefers to see the differences between the two sequences as global
<code>replacement</code> operations acting on complete subsequences of
the original sequences, he will provide an object implementing the
simple {@link org.apache.commons.collections.list.difference.ReplacementsHandler
ReplacementsHandler} interface, using an instance of the {@link
org.apache.commons.collections.list.difference.ReplacementsFinder
ReplacementsFinder} class as a command converting layer between his
object and the edit script. The number of objects which are common to
both initial arrays and hence are skipped between each call to the user
{@link
org.apache.commons.collections.list.difference.ReplacementsHandler#handleReplacement
handleReplacement} method is also provided. This allows the user to keep
track of the current index in both arrays if he needs so.
</p>
</body>

View File

@ -0,0 +1,235 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections.list.difference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class SequencesComparatorTest {
private List<String> before;
private List<String> after;
private int[] length;
@Test
public void testLength() {
for (int i = 0; i < before.size(); ++i) {
SequencesComparator<Character> comparator =
new SequencesComparator<Character>(sequence(before.get(i)),
sequence(after.get(i)));
Assert.assertEquals(length[i], comparator.getScript().getModifications());
}
}
@Test
public void testExecution() {
ExecutionVisitor<Character> ev = new ExecutionVisitor<Character>();
for (int i = 0; i < before.size(); ++i) {
ev.setList(sequence(before.get(i)));
new SequencesComparator<Character>(sequence(before.get(i)),
sequence(after.get(i))).getScript().visit(ev);
Assert.assertEquals(after.get(i), ev.getString());
}
}
@Test
public void testMinimal() {
String[] shadokAlph = new String[] {
new String("GA"),
new String("BU"),
new String("ZO"),
new String("MEU")
};
List<String> sentenceBefore = new ArrayList<String>();
List<String> sentenceAfter = new ArrayList<String>();
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[2]);
sentenceBefore.add(shadokAlph[3]);
sentenceBefore.add(shadokAlph[1]);
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[2]);
sentenceBefore.add(shadokAlph[1]);
sentenceBefore.add(shadokAlph[3]);
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[2]);
sentenceBefore.add(shadokAlph[1]);
sentenceBefore.add(shadokAlph[3]);
sentenceBefore.add(shadokAlph[2]);
sentenceBefore.add(shadokAlph[2]);
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[1]);
sentenceBefore.add(shadokAlph[3]);
sentenceBefore.add(shadokAlph[0]);
sentenceBefore.add(shadokAlph[3]);
Random random = new Random(4564634237452342L);
for (int nbCom = 0; nbCom <= 40; nbCom+=5) {
sentenceAfter.clear();
sentenceAfter.addAll(sentenceBefore);
for (int i = 0; i<nbCom; i++) {
if (random.nextInt(2) == 0) {
sentenceAfter.add(random.nextInt(sentenceAfter.size() + 1),
shadokAlph[random.nextInt(4)]);
} else {
sentenceAfter.remove(random.nextInt(sentenceAfter.size()));
}
}
SequencesComparator<String> comparator =
new SequencesComparator<String>(sentenceBefore, sentenceAfter);
Assert.assertTrue(comparator.getScript().getModifications() <= nbCom);
}
}
@Test
public void testShadok() {
int lgMax = 5;
String[] shadokAlph = new String[] {
new String("GA"),
new String("BU"),
new String("ZO"),
new String("MEU")
};
List<List<String>> shadokSentences = new ArrayList<List<String>>();
for (int lg=0; lg<lgMax; ++lg) {
List<List<String>> newTab = new ArrayList<List<String>>();
newTab.add(new ArrayList<String>());
for (int k = 0; k < shadokAlph.length; k++) {
for (List<String> sentence : shadokSentences) {
List<String> newSentence = new ArrayList<String>(sentence);
newSentence.add(shadokAlph[k]);
newTab.add(newSentence);
}
}
shadokSentences = newTab;
}
ExecutionVisitor<String> ev = new ExecutionVisitor<String>();
for (int i = 0; i < shadokSentences.size(); ++i) {
for (int j = 0; j < shadokSentences.size(); ++j) {
ev.setList(shadokSentences.get(i));
new SequencesComparator<String>(shadokSentences.get(i),
shadokSentences.get(j)).getScript().visit(ev);
StringBuilder concat = new StringBuilder();
for (final String s : shadokSentences.get(j)) {
concat.append(s);
}
Assert.assertEquals(concat.toString(), ev.getString());
}
}
}
private List<Character> sequence(String string) {
List<Character> list = new ArrayList<Character>();
for (int i = 0; i < string.length(); ++i) {
list.add(new Character(string.charAt(i)));
}
return list;
}
private class ExecutionVisitor<T> implements CommandVisitor<T> {
private List<T> v;
private int index;
public void setList(List<T> array) {
v = new ArrayList<T>(array);
index = 0;
}
public void visitInsertCommand(T object) {
v.add(index++, object);
}
public void visitKeepCommand(T object) {
++index;
}
public void visitDeleteCommand(T object) {
v.remove(index);
}
public String getString() {
StringBuffer buffer = new StringBuffer();
for (T c : v) {
buffer.append(c);
}
return buffer.toString();
}
}
@Before
public void setUp() {
before = Arrays.asList(new String[] {
"bottle",
"nematode knowledge",
"",
"aa",
"prefixed string",
"ABCABBA",
"glop glop",
"coq",
"spider-man"
});
after = Arrays.asList(new String[] {
"noodle",
"empty bottle",
"",
"C",
"prefix",
"CBABAC",
"pas glop pas glop",
"ane",
"klingon"
});
length = new int[] {
6,
16,
0,
3,
9,
5,
8,
6,
13
};
}
@After
public void tearDown() {
before = null;
after = null;
length = null;
}
}