DeterminePartitionsJob: Minimize distance from target instead of mean

This commit is contained in:
Gian Merlino 2013-05-18 16:58:30 -07:00
parent b0a58af05d
commit 7f7ea485b1
1 changed file with 14 additions and 16 deletions

View File

@ -90,8 +90,8 @@ import java.util.Set;
* put all those rows in the same partition, and that partition may be much larger than the target size.</li> * put all those rows in the same partition, and that partition may be much larger than the target size.</li>
* </ul> * </ul>
* *
* "Best" means a very high cardinality dimension, or, if none exist, the dimension that minimizes segment size * "Best" means a very high cardinality dimension, or, if none exist, the dimension that minimizes variation of
* variance. * segment size relative to the target.
*/ */
public class DeterminePartitionsJob implements Jobby public class DeterminePartitionsJob implements Jobby
{ {
@ -692,8 +692,8 @@ public class DeterminePartitionsJob implements Jobby
} }
int maxCardinality = Integer.MIN_VALUE; int maxCardinality = Integer.MIN_VALUE;
long minVariance = Long.MAX_VALUE; long minDistance = Long.MAX_VALUE;
DimPartitions minVariancePartitions = null; DimPartitions minDistancePartitions = null;
DimPartitions maxCardinalityPartitions = null; DimPartitions maxCardinalityPartitions = null;
for(final DimPartitions dimPartitions : dimPartitionss.values()) { for(final DimPartitions dimPartitions : dimPartitionss.values()) {
@ -722,16 +722,16 @@ public class DeterminePartitionsJob implements Jobby
} }
final int cardinality = dimPartitions.getCardinality(); final int cardinality = dimPartitions.getCardinality();
final long variance = dimPartitions.getVariance(); final long distance = dimPartitions.getDistanceSquaredFromTarget(config.getTargetPartitionSize());
if(cardinality > maxCardinality) { if(cardinality > maxCardinality) {
maxCardinality = cardinality; maxCardinality = cardinality;
maxCardinalityPartitions = dimPartitions; maxCardinalityPartitions = dimPartitions;
} }
if(variance < minVariance) { if(distance < minDistance) {
minVariance = variance; minDistance = distance;
minVariancePartitions = dimPartitions; minDistancePartitions = dimPartitions;
} }
} }
@ -745,7 +745,7 @@ public class DeterminePartitionsJob implements Jobby
final DimPartitions chosenPartitions = maxCardinality > HIGH_CARDINALITY_THRESHOLD final DimPartitions chosenPartitions = maxCardinality > HIGH_CARDINALITY_THRESHOLD
? maxCardinalityPartitions ? maxCardinalityPartitions
: minVariancePartitions; : minDistancePartitions;
final List<ShardSpec> chosenShardSpecs = Lists.transform( final List<ShardSpec> chosenShardSpecs = Lists.transform(
chosenPartitions.partitions, new Function<DimPartition, ShardSpec>() chosenPartitions.partitions, new Function<DimPartition, ShardSpec>()
@ -824,17 +824,15 @@ public class DeterminePartitionsJob implements Jobby
return sum; return sum;
} }
public long getVariance() public long getDistanceSquaredFromTarget(long target)
{ {
final long meanRows = getRows() / partitions.size(); long distance = 0;
long variance = 0;
for(final DimPartition dimPartition : partitions) { for(final DimPartition dimPartition : partitions) {
variance += (dimPartition.rows - meanRows) * (dimPartition.rows - meanRows); distance += (dimPartition.rows - target) * (dimPartition.rows - target);
} }
variance /= partitions.size(); distance /= partitions.size();
return variance; return distance;
} }
public int getRows() public int getRows()