mirror of https://github.com/apache/druid.git
DeterminePartitionsJob: Minimize distance from target instead of mean
This commit is contained in:
parent
b0a58af05d
commit
7f7ea485b1
|
@ -90,8 +90,8 @@ import java.util.Set;
|
||||||
* put all those rows in the same partition, and that partition may be much larger than the target size.</li>
|
* put all those rows in the same partition, and that partition may be much larger than the target size.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* "Best" means a very high cardinality dimension, or, if none exist, the dimension that minimizes segment size
|
* "Best" means a very high cardinality dimension, or, if none exist, the dimension that minimizes variation of
|
||||||
* variance.
|
* segment size relative to the target.
|
||||||
*/
|
*/
|
||||||
public class DeterminePartitionsJob implements Jobby
|
public class DeterminePartitionsJob implements Jobby
|
||||||
{
|
{
|
||||||
|
@ -692,8 +692,8 @@ public class DeterminePartitionsJob implements Jobby
|
||||||
}
|
}
|
||||||
|
|
||||||
int maxCardinality = Integer.MIN_VALUE;
|
int maxCardinality = Integer.MIN_VALUE;
|
||||||
long minVariance = Long.MAX_VALUE;
|
long minDistance = Long.MAX_VALUE;
|
||||||
DimPartitions minVariancePartitions = null;
|
DimPartitions minDistancePartitions = null;
|
||||||
DimPartitions maxCardinalityPartitions = null;
|
DimPartitions maxCardinalityPartitions = null;
|
||||||
|
|
||||||
for(final DimPartitions dimPartitions : dimPartitionss.values()) {
|
for(final DimPartitions dimPartitions : dimPartitionss.values()) {
|
||||||
|
@ -722,16 +722,16 @@ public class DeterminePartitionsJob implements Jobby
|
||||||
}
|
}
|
||||||
|
|
||||||
final int cardinality = dimPartitions.getCardinality();
|
final int cardinality = dimPartitions.getCardinality();
|
||||||
final long variance = dimPartitions.getVariance();
|
final long distance = dimPartitions.getDistanceSquaredFromTarget(config.getTargetPartitionSize());
|
||||||
|
|
||||||
if(cardinality > maxCardinality) {
|
if(cardinality > maxCardinality) {
|
||||||
maxCardinality = cardinality;
|
maxCardinality = cardinality;
|
||||||
maxCardinalityPartitions = dimPartitions;
|
maxCardinalityPartitions = dimPartitions;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(variance < minVariance) {
|
if(distance < minDistance) {
|
||||||
minVariance = variance;
|
minDistance = distance;
|
||||||
minVariancePartitions = dimPartitions;
|
minDistancePartitions = dimPartitions;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -745,7 +745,7 @@ public class DeterminePartitionsJob implements Jobby
|
||||||
|
|
||||||
final DimPartitions chosenPartitions = maxCardinality > HIGH_CARDINALITY_THRESHOLD
|
final DimPartitions chosenPartitions = maxCardinality > HIGH_CARDINALITY_THRESHOLD
|
||||||
? maxCardinalityPartitions
|
? maxCardinalityPartitions
|
||||||
: minVariancePartitions;
|
: minDistancePartitions;
|
||||||
|
|
||||||
final List<ShardSpec> chosenShardSpecs = Lists.transform(
|
final List<ShardSpec> chosenShardSpecs = Lists.transform(
|
||||||
chosenPartitions.partitions, new Function<DimPartition, ShardSpec>()
|
chosenPartitions.partitions, new Function<DimPartition, ShardSpec>()
|
||||||
|
@ -824,17 +824,15 @@ public class DeterminePartitionsJob implements Jobby
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long getVariance()
|
public long getDistanceSquaredFromTarget(long target)
|
||||||
{
|
{
|
||||||
final long meanRows = getRows() / partitions.size();
|
long distance = 0;
|
||||||
|
|
||||||
long variance = 0;
|
|
||||||
for(final DimPartition dimPartition : partitions) {
|
for(final DimPartition dimPartition : partitions) {
|
||||||
variance += (dimPartition.rows - meanRows) * (dimPartition.rows - meanRows);
|
distance += (dimPartition.rows - target) * (dimPartition.rows - target);
|
||||||
}
|
}
|
||||||
|
|
||||||
variance /= partitions.size();
|
distance /= partitions.size();
|
||||||
return variance;
|
return distance;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRows()
|
public int getRows()
|
||||||
|
|
Loading…
Reference in New Issue