diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8eb238ca8bc..6bf393ebe25 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -147,6 +147,9 @@ New Features filtering. Range trees can also handle values larger than 64 bits. (Adrien Grand, Mike McCandless) +* LUCENE-6647: Add GeoHash string utility APIs (Nick Knize, Mike + McCandless). + API Changes * LUCENE-6508: Simplify Lock api, there is now just diff --git a/lucene/core/src/java/org/apache/lucene/util/BitUtil.java b/lucene/core/src/java/org/apache/lucene/util/BitUtil.java index d412ac1c63a..8059865cd02 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitUtil.java @@ -26,7 +26,8 @@ public final class BitUtil { private static final long MAGIC[] = { 0x5555555555555555L, 0x3333333333333333L, 0x0F0F0F0F0F0F0F0FL, 0x00FF00FF00FF00FFL, - 0x0000FFFF0000FFFFL, 0x00000000FFFFFFFFL + 0x0000FFFF0000FFFFL, 0x00000000FFFFFFFFL, + 0xAAAAAAAAAAAAAAAAL }; // shift values for bit interleaving private static final short SHIFT[] = {1, 2, 4, 8, 16}; @@ -144,6 +145,13 @@ public final class BitUtil { return b; } + /** + * flip flops odd with even bits + */ + public static final long flipFlop(final long b) { + return ((b & MAGIC[6]) >>> 1) | ((b & MAGIC[0]) << 1 ); + } + /** Same as {@link #zigZagEncode(long)} but on integers. */ public static int zigZagEncode(int i) { return (i >> 31) ^ (i << 1); diff --git a/lucene/sandbox/src/java/org/apache/lucene/util/GeoHashUtils.java b/lucene/sandbox/src/java/org/apache/lucene/util/GeoHashUtils.java new file mode 100644 index 00000000000..242d3856818 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/util/GeoHashUtils.java @@ -0,0 +1,153 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Utilities for converting to/from the GeoHash standard + * + * The geohash long format is represented as lon/lat (x/y) interleaved with the 4 least significant bits + * representing the level (1-12) [xyxy...xyxyllll] + * + * This differs from a morton encoded value which interleaves lat/lon (y/x). + * + * @lucene.experimental + */ +public class GeoHashUtils { + public static final char[] BASE_32 = {'0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'm', 'n', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; + + public static final String BASE_32_STRING = new String(BASE_32); + + public static final int PRECISION = 12; + private static final short MORTON_OFFSET = (GeoUtils.BITS<<1) - (PRECISION*5); + + /** + * Encode lon/lat to the geohash based long format (lon/lat interleaved, 4 least significant bits = level) + */ + public static final long longEncode(final double lon, final double lat, final int level) { + // shift to appropriate level + final short msf = (short)(((12 - level) * 5) + MORTON_OFFSET); + return ((BitUtil.flipFlop(GeoUtils.mortonHash(lon, lat)) >>> msf) << 4) | level; + } + + /** + * Encode from geohash string to the geohash based long format (lon/lat interleaved, 4 least significant bits = level) + */ + public static final long longEncode(final String hash) { + int level = hash.length()-1; + long b; + long l = 0L; + for(char c : hash.toCharArray()) { + b = (long)(BASE_32_STRING.indexOf(c)); + l |= (b<<(level--*5)); + } + return (l<<4)|hash.length(); + } + + /** + * Encode to a geohash string from the geohash based long format + */ + public static final String stringEncode(long geoHashLong) { + int level = (int)geoHashLong&15; + geoHashLong >>>= 4; + char[] chars = new char[level]; + do { + chars[--level] = BASE_32[(int)(geoHashLong&31L)]; + geoHashLong>>>=5; + } while(level > 0); + + return new String(chars); + } + + /** + * Encode to a geohash string from full resolution longitude, latitude) + */ + public static final String stringEncode(final double lon, final double lat) { + return stringEncode(lon, lat, 12); + } + + /** + * Encode to a level specific geohash string from full resolution longitude, latitude + */ + public static final String stringEncode(final double lon, final double lat, final int level) { + // bit twiddle to geohash (since geohash is a swapped (lon/lat) encoding) + final long hashedVal = BitUtil.flipFlop(GeoUtils.mortonHash(lon, lat)); + + StringBuilder geoHash = new StringBuilder(); + short precision = 0; + final short msf = (GeoUtils.BITS<<1)-5; + long mask = 31L<>>(msf-(precision*5)))]); + // next 5 bits + mask >>>= 5; + } while (++precision < level); + return geoHash.toString(); + } + + /** + * Encode to a full precision geohash string from a given morton encoded long value + */ + public static final String stringEncodeFromMortonLong(final long hashedVal) throws Exception { + return stringEncode(hashedVal, PRECISION); + } + + /** + * Encode to a geohash string at a given level from a morton long + */ + public static final String stringEncodeFromMortonLong(long hashedVal, final int level) { + // bit twiddle to geohash (since geohash is a swapped (lon/lat) encoding) + hashedVal = BitUtil.flipFlop(hashedVal); + + StringBuilder geoHash = new StringBuilder(); + short precision = 0; + final short msf = (GeoUtils.BITS<<1)-5; + long mask = 31L<>>(msf-(precision*5)))]); + // next 5 bits + mask >>>= 5; + } while (++precision < level); + return geoHash.toString(); + } + + /** + * Encode to a morton long value from a given geohash string + */ + public static final long mortonEncode(final String hash) { + int level = 11; + long b; + long l = 0L; + for(char c : hash.toCharArray()) { + b = (long)(BASE_32_STRING.indexOf(c)); + l |= (b<<((level--*5) + MORTON_OFFSET)); + } + return BitUtil.flipFlop(l); + } + + /** + * Encode to a morton long value from a given geohash long value + */ + public static final long mortonEncode(final long geoHashLong) { + final int level = (int)(geoHashLong&15); + final short odd = (short)(level & 1); + + return BitUtil.flipFlop((geoHashLong >>> 4) << odd) << (((12 - level) * 5) + (MORTON_OFFSET - odd)); + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestGeoPointQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestGeoPointQuery.java index e3d089a56d3..ace8c7fb5a2 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/TestGeoPointQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestGeoPointQuery.java @@ -46,6 +46,7 @@ import org.apache.lucene.util.GeoUtils; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.SloppyMath; +import org.apache.lucene.util.TestGeoUtils; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -68,37 +69,10 @@ public class TestGeoPointQuery extends LuceneTestCase { // determining the possible haversine error private static final int DISTANCE_ERR = 1000; - // Global bounding box we will "cover" in the random test; we have to make this "smallish" else the queries take very long: - private static double originLat; - private static double originLon; -// private static double range; - private static double lonRange; - private static double latRange; - @BeforeClass public static void beforeClass() throws Exception { directory = newDirectory(); - // when we randomly test the full lat/lon space it can result in very very slow query times, this is due to the - // number of ranges that can be created in degenerate cases. - - // Between 1.0 and 3.0: -// range = 2*(random().nextDouble() + 0.5); - // Between 1.0 and 90.0 - //lonRange = 1.0 + (90.0 - 1.0) * random().nextDouble(); - //latRange = 1.0 + (45.0 - 1.0) * random().nextDouble(); - - // Between 1.0 and 3.0: - lonRange = 2*(random().nextDouble() + 0.5); - latRange = 2*(random().nextDouble() + 0.5); - - originLon = GeoUtils.MIN_LON_INCL + lonRange + (GeoUtils.MAX_LON_INCL - GeoUtils.MIN_LON_INCL - 2*lonRange) * random().nextDouble(); - originLon = GeoUtils.normalizeLon(originLon); - originLat = GeoUtils.MIN_LAT_INCL + latRange + (GeoUtils.MAX_LAT_INCL - GeoUtils.MIN_LAT_INCL - 2*latRange) * random().nextDouble(); - originLat = GeoUtils.normalizeLat(originLat); - if (VERBOSE) { - System.out.println("TEST: originLon=" + originLon + " lonRange= " + lonRange + " originLat=" + originLat + " latRange=" + latRange); - } RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) @@ -306,13 +280,13 @@ public class TestGeoPointQuery extends LuceneTestCase { if (x == 0) { // Identical lat to old point lats[docID] = lats[oldDocID]; - lons[docID] = randomLon(); + lons[docID] = TestGeoUtils.randomLon(); if (VERBOSE) { //System.out.println(" doc=" + docID + " lat=" + lats[docID] + " lon=" + lons[docID] + " (same lat as doc=" + oldDocID + ")"); } } else if (x == 1) { // Identical lon to old point - lats[docID] = randomLat(); + lats[docID] = TestGeoUtils.randomLat(); lons[docID] = lons[oldDocID]; if (VERBOSE) { //System.out.println(" doc=" + docID + " lat=" + lats[docID] + " lon=" + lons[docID] + " (same lon as doc=" + oldDocID + ")"); @@ -327,8 +301,8 @@ public class TestGeoPointQuery extends LuceneTestCase { } } } else { - lats[docID] = randomLat(); - lons[docID] = randomLon(); + lats[docID] = TestGeoUtils.randomLat(); + lons[docID] = TestGeoUtils.randomLon(); haveRealDoc = true; if (VERBOSE) { //System.out.println(" doc=" + docID + " lat=" + lats[docID] + " lon=" + lons[docID]); @@ -618,19 +592,11 @@ public class TestGeoPointQuery extends LuceneTestCase { || (tMaxLon - tLon) == 0 || (tMaxLat - tLat) == 0); } - private static double randomLat() { - return GeoUtils.normalizeLat(originLat + latRange * (random().nextDouble() - 0.5)); - } - - private static double randomLon() { - return GeoUtils.normalizeLon(originLon + lonRange * (random().nextDouble() - 0.5)); - } - private static GeoBoundingBox randomBBox() { - double lat0 = randomLat(); - double lat1 = randomLat(); - double lon0 = randomLon(); - double lon1 = randomLon(); + double lat0 = TestGeoUtils.randomLat(); + double lat1 = TestGeoUtils.randomLat(); + double lon0 = TestGeoUtils.randomLon(); + double lon1 = TestGeoUtils.randomLon(); if (lat1 < lat0) { double x = lat0; diff --git a/lucene/sandbox/src/test/org/apache/lucene/util/TestGeoUtils.java b/lucene/sandbox/src/test/org/apache/lucene/util/TestGeoUtils.java new file mode 100644 index 00000000000..a5456b9f32d --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/util/TestGeoUtils.java @@ -0,0 +1,100 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Tests class for methods in GeoUtils + * + * @lucene.experimental + */ +public class TestGeoUtils extends LuceneTestCase { + + // Global bounding box we will "cover" in the random test; we have to make this "smallish" else the queries take very long: + private static double originLat; + private static double originLon; + // private static double range; + private static double lonRange; + private static double latRange; + + @BeforeClass + public static void beforeClass() throws Exception { + // Between 1.0 and 3.0: + lonRange = 2 * (random().nextDouble() + 0.5); + latRange = 2 * (random().nextDouble() + 0.5); + + originLon = GeoUtils.MIN_LON_INCL + lonRange + (GeoUtils.MAX_LON_INCL - GeoUtils.MIN_LON_INCL - 2 * lonRange) * random().nextDouble(); + originLon = GeoUtils.normalizeLon(originLon); + originLat = GeoUtils.MIN_LAT_INCL + latRange + (GeoUtils.MAX_LAT_INCL - GeoUtils.MIN_LAT_INCL - 2 * latRange) * random().nextDouble(); + originLat = GeoUtils.normalizeLat(originLat); + + if (VERBOSE) { + System.out.println("TEST: originLon=" + originLon + " lonRange= " + lonRange + " originLat=" + originLat + " latRange=" + latRange); + } + } + + @Test + public void testGeoHash() { + int numPoints = atLeast(100); + String randomGeoHashString; + String mortonGeoHash; + long mortonLongFromGHLong, geoHashLong, mortonLongFromGHString; + int randomLevel; + for (int i = 0; i < numPoints; ++i) { + // random point + double lat = randomLatFullRange(); + double lon = randomLonFullRange(); + + // compute geohash straight from lat/lon and from morton encoded value to ensure they're the same + randomGeoHashString = GeoHashUtils.stringEncode(lon, lat, randomLevel = random().nextInt(12 - 1) + 1); + mortonGeoHash = GeoHashUtils.stringEncodeFromMortonLong(GeoUtils.mortonHash(lon, lat), randomLevel); + assertEquals(randomGeoHashString, mortonGeoHash); + + // v&v conversion from lat/lon or geohashstring to geohash long and back to geohash string + geoHashLong = (random().nextBoolean()) ? GeoHashUtils.longEncode(lon, lat, randomLevel) : GeoHashUtils.longEncode(randomGeoHashString); + assertEquals(randomGeoHashString, GeoHashUtils.stringEncode(geoHashLong)); + + // v&v conversion from geohash long to morton long + mortonLongFromGHString = GeoHashUtils.mortonEncode(randomGeoHashString); + mortonLongFromGHLong = GeoHashUtils.mortonEncode(geoHashLong); + assertEquals(mortonLongFromGHLong, mortonLongFromGHString); + + // v&v lat/lon from geohash string and geohash long + assertEquals(GeoUtils.mortonUnhashLat(mortonLongFromGHString), GeoUtils.mortonUnhashLat(mortonLongFromGHLong), 0); + assertEquals(GeoUtils.mortonUnhashLon(mortonLongFromGHString), GeoUtils.mortonUnhashLon(mortonLongFromGHLong), 0); + } + } + + public static double randomLatFullRange() { + return (180d * random().nextDouble()) - 90d; + } + + public static double randomLonFullRange() { + return (360d * random().nextDouble()) - 180d; + } + + public static double randomLat() { + return GeoUtils.normalizeLat(originLat + latRange * (random().nextDouble() - 0.5)); + } + + public static double randomLon() { + return GeoUtils.normalizeLon(originLon + lonRange * (random().nextDouble() - 0.5)); + } +}