MATH-1548: Move standard quality measures of a SOM into class "NeuronSquareMesh2D".

All these indicators are usually computed together in order to evaluate the quality of a SOM;
computing them separately is inefficient when the number of samples becomes large.
This commit is contained in:
Gilles Sadowski 2020-06-26 18:05:12 +02:00
parent 9cbf1d1844
commit 28e5b802fe
6 changed files with 292 additions and 324 deletions

View File

@ -20,6 +20,7 @@ package org.apache.commons.math4.ml.neuralnet.twod;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.io.Serializable;
import java.io.ObjectInputStream;
@ -30,6 +31,10 @@ import org.apache.commons.math4.ml.neuralnet.FeatureInitializer;
import org.apache.commons.math4.ml.neuralnet.Network;
import org.apache.commons.math4.ml.neuralnet.Neuron;
import org.apache.commons.math4.ml.neuralnet.SquareNeighbourhood;
import org.apache.commons.math4.ml.neuralnet.MapRanking;
import org.apache.commons.math4.ml.neuralnet.twod.util.LocationFinder;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
import org.apache.commons.math4.ml.distance.EuclideanDistance;
/**
* Neural network with the topology of a two-dimensional surface.
@ -339,6 +344,17 @@ public class NeuronSquareMesh2D
return location == null ? null : getNeuron(location[0], location[1]);
}
/**
 * Evaluates how well this map represents the given {@code data} by
 * computing a set of {@link DataVisualization quality indicators}.
 *
 * @param data Features.
 * @return a new instance holding quality indicators.
 */
public DataVisualization computeQualityIndicators(Iterable<double[]> data) {
    // The indicators are computed on a copy of this map.
    final NeuronSquareMesh2D snapshot = copy();
    return DataVisualization.from(snapshot, data);
}
/**
* Computes the location of a neighbouring neuron.
* Returns {@code null} if the resulting location is not part
@ -625,4 +641,227 @@ public class NeuronSquareMesh2D
featuresList);
}
}
/**
 * Miscellaneous indicators of the map quality:
 * <ul>
 *  <li>Hit histogram</li>
 *  <li>Quantization error</li>
 *  <li>Topographic error</li>
 *  <li>Unified distance matrix</li>
 * </ul>
 * All per-unit indicators are stored as {@code [row][column]} arrays
 * whose dimensions are those of the map.
 */
public static class DataVisualization {
    /** Distance function. */
    private static final DistanceMeasure DISTANCE = new EuclideanDistance();
    /** Total number of samples. */
    private final int numberOfSamples;
    /** Hit histogram (normalized). */
    private final double[][] hitHistogram;
    /** Quantization error. */
    private final double[][] quantizationError;
    /** Mean quantization error. */
    private final double meanQuantizationError;
    /** Topographic error. */
    private final double[][] topographicError;
    /** Mean topographic error. */
    private final double meanTopographicError;
    /** U-matrix. */
    private final double[][] uMatrix;

    /**
     * Stores the given arrays (without copying them) and computes the
     * hit-weighted means of the quantization and topographic errors.
     *
     * @param numberOfSamples Number of samples.
     * @param hitHistogram Hit histogram (normalized).
     * @param quantizationError Quantization error.
     * @param topographicError Topographic error.
     * @param uMatrix U-matrix.
     */
    private DataVisualization(int numberOfSamples,
                              double[][] hitHistogram,
                              double[][] quantizationError,
                              double[][] topographicError,
                              double[][] uMatrix) {
        this.numberOfSamples = numberOfSamples;
        this.hitHistogram = hitHistogram;
        this.quantizationError = quantizationError;
        meanQuantizationError = hitWeightedMean(quantizationError, hitHistogram);
        this.topographicError = topographicError;
        meanTopographicError = hitWeightedMean(topographicError, hitHistogram);
        this.uMatrix = uMatrix;
    }

    /**
     * Computes all the indicators in a single pass over the {@code data}.
     *
     * @param map Map
     * @param data Data.
     * @return the metrics.
     */
    static DataVisualization from(NeuronSquareMesh2D map,
                                  Iterable<double[]> data) {
        final LocationFinder finder = new LocationFinder(map);
        final MapRanking rank = new MapRanking(map, DISTANCE);
        final Network net = map.getNetwork();
        final int nR = map.getNumberOfRows();
        final int nC = map.getNumberOfColumns();

        // Hit counters (raw counts).
        final int[][] hitCounter = new int[nR][nC];
        // Hit bins (normalized).
        final double[][] hitHistogram = new double[nR][nC];
        // Quantization error bins.
        final double[][] quantizationError = new double[nR][nC];
        // Topographic error bins.
        final double[][] topographicError = new double[nR][nC];
        // U-matrix.
        final double[][] uMatrix = new double[nR][nC];

        int numSamples = 0;
        for (double[] sample : data) {
            ++numSamples;

            final List<Neuron> winners = rank.rank(sample, 2);
            final Neuron best = winners.get(0);
            final Neuron secondBest = winners.get(1);

            final LocationFinder.Location locBest = finder.getLocation(best);
            final int rowBest = locBest.getRow();
            final int colBest = locBest.getColumn();
            // Increment hit counter.
            hitCounter[rowBest][colBest] += 1;

            // Aggregate quantization error.
            quantizationError[rowBest][colBest] += DISTANCE.compute(sample, best.getFeatures());

            // Aggregate topographic error.
            if (!net.getNeighbours(best).contains(secondBest)) {
                // Increment count if first and second best matching units
                // are not neighbours.
                topographicError[rowBest][colBest] += 1;
            }
        }

        for (int r = 0; r < nR; r++) {
            for (int c = 0; c < nC; c++) {
                final Neuron neuron = map.getNeuron(r, c);
                final Collection<Neuron> neighbours = net.getNeighbours(neuron);
                final double[] features = neuron.getFeatures();
                double uDistance = 0;
                int neighbourCount = 0;
                for (Neuron n : neighbours) {
                    ++neighbourCount;
                    uDistance += DISTANCE.compute(features, n.getFeatures());
                }

                // Data-dependent indicators: bins without any hit are left at zero.
                final int hitCount = hitCounter[r][c];
                if (hitCount != 0) {
                    hitHistogram[r][c] = hitCount / (double) numSamples;
                    quantizationError[r][c] /= hitCount;
                    topographicError[r][c] /= hitCount;
                }

                // The U-matrix only depends on the units' features: it must be
                // set for every unit, whether or not samples were mapped to it.
                uMatrix[r][c] = uDistance / neighbourCount;
            }
        }

        return new DataVisualization(numSamples,
                                     hitHistogram,
                                     quantizationError,
                                     topographicError,
                                     uMatrix);
    }

    /**
     * @return the total number of samples.
     */
    public final int getNumberOfSamples() {
        return numberOfSamples;
    }

    /**
     * @return the quantization error.
     * Each bin will contain the average of the distances between samples
     * mapped to the corresponding unit and the weight vector of that unit
     * (zero if no sample is mapped to that unit).
     * @see #getMeanQuantizationError()
     */
    public double[][] getQuantizationError() {
        return copy(quantizationError);
    }

    /**
     * @return the topographic error.
     * Each bin will contain the fraction of the samples mapped to the
     * corresponding unit for which the first and second best matching
     * units are not adjacent in the map (zero if no sample is mapped to
     * that unit).
     * @see #getMeanTopographicError()
     */
    public double[][] getTopographicError() {
        return copy(topographicError);
    }

    /**
     * @return the hits histogram (normalized).
     * Each bin will contain the number of data for which the corresponding
     * neuron is the best matching unit, divided by the total number of
     * samples.
     */
    public double[][] getNormalizedHits() {
        return copy(hitHistogram);
    }

    /**
     * @return the U-matrix.
     * Each bin will contain the average distance between a unit and all
     * its neighbours (stored in the pixel corresponding to that unit of
     * the 2D-map). The number of neighbours taken into account depends
     * on the network {@link org.apache.commons.math4.ml.neuralnet.SquareNeighbourhood
     * neighbourhood type}.
     */
    public double[][] getUMatrix() {
        return copy(uMatrix);
    }

    /**
     * @return the mean (hit-weighted) quantization error.
     * @see #getQuantizationError()
     */
    public double getMeanQuantizationError() {
        return meanQuantizationError;
    }

    /**
     * @return the mean (hit-weighted) topographic error.
     * @see #getTopographicError()
     */
    public double getMeanTopographicError() {
        return meanTopographicError;
    }

    /**
     * @param orig Source.
     * @return a deep copy of the original array.
     */
    private static double[][] copy(double[][] orig) {
        final double[][] copy = new double[orig.length][];
        for (int i = 0; i < orig.length; i++) {
            copy[i] = orig[i].clone();
        }

        return copy;
    }

    /**
     * Sums, over all bins, the product of the metric and of the
     * normalized hit count of that bin.
     *
     * @param metrics Metrics.
     * @param normalizedHits Hits histogram (normalized).
     * @return the hit-weighted mean of the given {@code metrics}.
     */
    private static double hitWeightedMean(double[][] metrics,
                                          double[][] normalizedHits) {
        double mean = 0;
        for (int i = 0; i < metrics.length; i++) {
            final double[] metricsRow = metrics[i];
            final double[] hitsRow = normalizedHits[i];
            for (int j = 0; j < metricsRow.length; j++) {
                mean += hitsRow[j] * metricsRow[j];
            }
        }

        return mean;
    }
}
}

View File

@ -1,85 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.ml.neuralnet.twod.util;
import org.apache.commons.math4.ml.neuralnet.MapRanking;
import org.apache.commons.math4.ml.neuralnet.Neuron;
import org.apache.commons.math4.ml.neuralnet.twod.NeuronSquareMesh2D;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
/**
 * Computes the hit histogram.
 * Each bin holds the number of samples whose best matching unit is the
 * neuron at the corresponding location of the map.
 * @since 3.6
 */
public class HitHistogram implements MapDataVisualization {
    /** Distance. */
    private final DistanceMeasure distance;
    /** Whether to compute relative bin counts. */
    private final boolean normalizeCount;

    /**
     * @param normalizeCount Whether to compute relative bin counts.
     * If {@code true}, the data count in each bin will be divided by the total
     * number of samples.
     * @param distance Distance.
     */
    public HitHistogram(boolean normalizeCount,
                        DistanceMeasure distance) {
        this.normalizeCount = normalizeCount;
        this.distance = distance;
    }

    /** {@inheritDoc} */
    @Override
    public double[][] computeImage(NeuronSquareMesh2D map,
                                   Iterable<double[]> data) {
        final int rows = map.getNumberOfRows();
        final int cols = map.getNumberOfColumns();
        final LocationFinder locator = new LocationFinder(map);
        final MapRanking ranking = new MapRanking(map.getNetwork(), distance);

        // Total number of samples.
        int total = 0;
        // Hit bins.
        final double[][] bins = new double[rows][cols];

        for (double[] features : data) {
            final Neuron winner = ranking.rank(features, 1).get(0);
            final LocationFinder.Location where = locator.getLocation(winner);
            bins[where.getRow()][where.getColumn()] += 1;
            ++total;
        }

        if (!normalizeCount) {
            // Raw counts requested.
            return bins;
        }

        // Turn the counts into fractions of the total number of samples.
        for (int i = 0; i < rows; i++) {
            final double[] row = bins[i];
            for (int j = 0; j < cols; j++) {
                row[j] /= total;
            }
        }

        return bins;
    }
}

View File

@ -1,78 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.ml.neuralnet.twod.util;
import org.apache.commons.math4.ml.neuralnet.MapRanking;
import org.apache.commons.math4.ml.neuralnet.Neuron;
import org.apache.commons.math4.ml.neuralnet.twod.NeuronSquareMesh2D;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
/**
 * Computes the quantization error histogram.
 * Each bin holds the mean distance between the samples mapped to the
 * corresponding unit and the weight vector of that unit.
 * @since 3.6
 */
public class QuantizationError implements MapDataVisualization {
    /** Distance. */
    private final DistanceMeasure distance;

    /**
     * @param distance Distance.
     */
    public QuantizationError(DistanceMeasure distance) {
        this.distance = distance;
    }

    /** {@inheritDoc} */
    @Override
    public double[][] computeImage(NeuronSquareMesh2D map,
                                   Iterable<double[]> data) {
        final int rows = map.getNumberOfRows();
        final int cols = map.getNumberOfColumns();
        final LocationFinder locator = new LocationFinder(map);
        final MapRanking ranking = new MapRanking(map.getNetwork(), distance);

        // Number of samples mapped to each unit.
        final int[][] counts = new int[rows][cols];
        // Accumulated (then averaged) distances.
        final double[][] errors = new double[rows][cols];

        for (double[] features : data) {
            final Neuron winner = ranking.rank(features, 1).get(0);
            final LocationFinder.Location where = locator.getLocation(winner);
            final int i = where.getRow();
            final int j = where.getColumn();

            counts[i][j] += 1;
            errors[i][j] += distance.compute(features, winner.getFeatures());
        }

        // Average the accumulated distances; empty bins are left at zero.
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                final int n = counts[i][j];
                if (n == 0) {
                    continue;
                }
                errors[i][j] /= n;
            }
        }

        return errors;
    }
}

View File

@ -1,93 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.ml.neuralnet.twod.util;
import java.util.List;
import org.apache.commons.math4.ml.neuralnet.MapRanking;
import org.apache.commons.math4.ml.neuralnet.Neuron;
import org.apache.commons.math4.ml.neuralnet.Network;
import org.apache.commons.math4.ml.neuralnet.twod.NeuronSquareMesh2D;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
/**
 * Computes the topographic error histogram.
 * Each bin will contain the number of data for which the first and
 * second best matching units are not adjacent in the map.
 * @since 3.6
 */
public class TopographicErrorHistogram implements MapDataVisualization {
    /** Distance. */
    private final DistanceMeasure distance;
    /** Whether to compute relative bin counts. */
    private final boolean relativeCount;

    /**
     * @param relativeCount Whether to compute relative bin counts.
     * If {@code true}, the data count in each bin will be divided by the total
     * number of samples mapped to the neuron represented by that bin (bins
     * to which no sample is mapped are left at zero).
     * @param distance Distance.
     */
    public TopographicErrorHistogram(boolean relativeCount,
                                     DistanceMeasure distance) {
        this.relativeCount = relativeCount;
        this.distance = distance;
    }

    /** {@inheritDoc} */
    @Override
    public double[][] computeImage(NeuronSquareMesh2D map,
                                   Iterable<double[]> data) {
        final int nR = map.getNumberOfRows();
        final int nC = map.getNumberOfColumns();

        final LocationFinder finder = new LocationFinder(map);
        final Network net = map.getNetwork();
        final MapRanking rank = new MapRanking(net, distance);

        // Hit bins.
        final int[][] hit = new int[nR][nC];
        // Error bins.
        final double[][] error = new double[nR][nC];

        for (double[] sample : data) {
            final List<Neuron> p = rank.rank(sample, 2);
            final Neuron best = p.get(0);

            final LocationFinder.Location loc = finder.getLocation(best);
            final int row = loc.getRow();
            final int col = loc.getColumn();
            hit[row][col] += 1;

            if (!net.getNeighbours(best).contains(p.get(1))) {
                // Increment count if first and second best matching units
                // are not neighbours.
                error[row][col] += 1;
            }
        }

        if (relativeCount) {
            for (int r = 0; r < nR; r++) {
                for (int c = 0; c < nC; c++) {
                    final int count = hit[r][c];
                    // Skip empty bins: dividing 0.0 by a zero count would
                    // store NaN instead of a zero error.
                    if (count != 0) {
                        error[r][c] /= count;
                    }
                }
            }
        }

        return error;
    }
}

View File

@ -17,59 +17,36 @@
package org.apache.commons.math4.ml.neuralnet.twod.util;
import java.util.Collection;
import org.apache.commons.math4.ml.neuralnet.Neuron;
import org.apache.commons.math4.ml.neuralnet.Network;
import org.apache.commons.math4.ml.neuralnet.twod.NeuronSquareMesh2D;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
/**
* <a href="http://en.wikipedia.org/wiki/U-Matrix">U-Matrix</a>
* visualization of high-dimensional data projection.
* The 8 individual inter-units distances will be
* {@link #computeImage(NeuronSquareMesh2D) computed}. They will be
* stored in additional pixels around each of the original units of the
* 2D-map. The additional pixels that lie along a "diagonal" are shared
* by <em>two</em> pairs of units: their value will be set to the average
* distance between the units belonging to each of the pairs. The value
* zero will be stored in the pixel corresponding to the location of a
* unit of the 2D-map.
*
* @since 3.6
* @see NeuronSquareMesh2D.DataVisualization#getUMatrix()
*/
public class UnifiedDistanceMatrix implements MapVisualization {
/** Whether to show distance between each pair of neighbouring units. */
private final boolean individualDistances;
/** Distance. */
private final DistanceMeasure distance;
/**
* Simple constructor.
*
* @param individualDistances If {@code true}, the 8 individual
* inter-units distances will be {@link #computeImage(NeuronSquareMesh2D)
* computed}. They will be stored in additional pixels around each of
* the original units of the 2D-map. The additional pixels that lie
* along a "diagonal" are shared by <em>two</em> pairs of units: their
* value will be set to the average distance between the units belonging
* to each of the pairs. The value zero will be stored in the pixel
* corresponding to the location of a unit of the 2D-map.
* <br>
* If {@code false}, only the average distance between a unit and all its
* neighbours will be computed (and stored in the pixel corresponding to
* that unit of the 2D-map). In that case, the number of neighbours taken
* into account depends on the network's
* {@link org.apache.commons.math4.ml.neuralnet.SquareNeighbourhood
* neighbourhood type}.
* @param distance Distance.
*/
public UnifiedDistanceMatrix(boolean individualDistances,
DistanceMeasure distance) {
this.individualDistances = individualDistances;
public UnifiedDistanceMatrix(DistanceMeasure distance) {
this.distance = distance;
}
/** {@inheritDoc} */
@Override
public double[][] computeImage(NeuronSquareMesh2D map) {
    // Dispatch on the flag given at construction.
    return individualDistances
        ? individualDistances(map)
        : averageDistances(map);
}
/**
* Computes the distances between a unit of the map and its
* neighbours.
@ -81,7 +58,8 @@ public class UnifiedDistanceMatrix implements MapVisualization {
* @param map Map.
* @return an image representing the individual distances.
*/
private double[][] individualDistances(NeuronSquareMesh2D map) {
@Override
public double[][] computeImage(NeuronSquareMesh2D map) {
final int numRows = map.getNumberOfRows();
final int numCols = map.getNumberOfColumns();
@ -174,37 +152,4 @@ public class UnifiedDistanceMatrix implements MapVisualization {
return uMatrix;
}
/**
 * Computes, for each unit of the map, the mean of the distances
 * between that unit and its neighbours.
 *
 * @param map Map.
 * @return an image representing the average distances.
 */
private double[][] averageDistances(NeuronSquareMesh2D map) {
    final int rows = map.getNumberOfRows();
    final int cols = map.getNumberOfColumns();
    final double[][] image = new double[rows][cols];
    final Network network = map.getNetwork();

    for (int row = 0; row < rows; row++) {
        for (int col = 0; col < cols; col++) {
            final Neuron unit = map.getNeuron(row, col);
            final Collection<Neuron> around = network.getNeighbours(unit);
            final double[] unitFeatures = unit.getFeatures();

            // Accumulate the distances to every neighbour of the unit.
            double sum = 0;
            int numNeighbours = 0;
            for (Neuron neighbour : around) {
                numNeighbours++;
                sum += distance.compute(unitFeatures, neighbour.getFeatures());
            }

            image[row][col] = sum / numNeighbours;
        }
    }

    return image;
}
}

View File

@ -25,6 +25,9 @@ import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.stream.StreamSupport;
import java.util.stream.Collectors;
import org.apache.commons.math4.exception.NumberIsTooSmallException;
import org.apache.commons.math4.exception.OutOfRangeException;
@ -872,4 +875,41 @@ public class NeuronSquareMesh2DTest {
Assert.assertTrue(fromMap.contains(n));
}
}
@Test
public void testDataVisualization() {
    final NeuronSquareMesh2D map = new NeuronSquareMesh2D(3, true,
                                                          3, true,
                                                          SquareNeighbourhood.VON_NEUMANN,
                                                          new FeatureInitializer[] { init });

    // Trivial test: the data set is made of the neurons' own features.
    final List<double[]> samples = StreamSupport.stream(map.spliterator(), false)
        .map(neuron -> neuron.getFeatures())
        .collect(Collectors.toList());
    final NeuronSquareMesh2D.DataVisualization indicators = map.computeQualityIndicators(samples);

    final int rows = map.getNumberOfRows();
    final int cols = map.getNumberOfColumns();

    // Hits: every bin is expected to hold the same fraction of the samples.
    final double hitsPerUnit = 1d / (rows * cols);
    final double[][] normalizedHits = indicators.getNormalizedHits();
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            Assert.assertEquals(hitsPerUnit, normalizedHits[r][c], 0d);
        }
    }

    // Quantization error: expected to be zero in every bin.
    final double noError = 0;
    final double[][] quantization = indicators.getQuantizationError();
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            Assert.assertEquals(noError, quantization[r][c], 0d);
        }
    }
}
}