From 0b8b147a74ccf6e077a024f01e53107895145ff4 Mon Sep 17 00:00:00 2001
From: Jason Tedor <jason@tedor.me>
Date: Tue, 21 Jul 2015 14:04:46 -0400
Subject: [PATCH] Correctly list blobs in Azure storage to prevent snapshot
 corruption and do not unnecessarily duplicate Lucene segments in Azure
 Storage

This commit addresses an issue that was leading to snapshot corruption for snapshots stored as blobs in Azure Storage.

The underlying issue is that in cases when multiple snapshots of an index were taken and persisted into Azure Storage, snapshots subsequent
to the first would repeatedly overwrite the snapshot files. This issue does render useless all snapshots except the final snapshot.

The root cause of this is due to String concatenation involving null. In particular, to list all of the blobs in a snapshot directory in
Azure the code would use the method listBlobsByPrefix where the prefix is null. In the listBlobsByPrefix method, the path keyPath + prefix
is constructed. However, per 5.1.11, 5.4 and 15.18.1 of the Java Language Specification, the reference null is first converted to the string
"null" before performing the concatenation. This leads to no blobs being returned and therefore the snapshot mechanism would operate as if
it were writing the first snapshot of the index. The fix is simply to check if prefix is null and handle the concatenation accordingly.

Upon fixing this issue so that subsequent snapshots would no longer overwrite earlier snapshots, it was discovered that the snapshot metadata
returned by the listBlobsByPrefix method was not sufficient for the snapshot layer to detect whether or not the Lucene segments had already
been copied to the Azure storage layer in an earlier snapshot. This led the snapshot layer to unnecessarily duplicate these Lucene segments
in Azure Storage.

The root cause of this is due to known behavior in the CloudBlobContainer.getBlockBlobReference method in the Azure API. Namely, this method
does not fetch blob attributes from Azure. As such, the lengths of all the blobs appeared to the snapshot layer to be of length zero and
therefore they would compare as not equal to any new blobs that the snapshot layer is going to persist. To remediate this, the method
CloudBlockBlob.downloadAttributes must be invoked. This will fetch the attributes from Azure Storage so that a proper comparison of the
blobs can be performed.

Closes elastic/elasticsearch-cloud-azure#51, closes elastic/elasticsearch-cloud-azure#99
---
 .../azure/blobstore/AzureBlobContainer.java   |  2 +-
 .../storage/AzureStorageServiceImpl.java      | 22 ++++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/blobstore/AzureBlobContainer.java b/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/blobstore/AzureBlobContainer.java
index bc15a64fc73..04231328311 100644
--- a/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/blobstore/AzureBlobContainer.java
+++ b/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/blobstore/AzureBlobContainer.java
@@ -143,6 +143,6 @@ public class AzureBlobContainer extends AbstractBlobContainer {
     }
 
     protected String buildKey(String blobName) {
-        return keyPath + blobName;
+        return keyPath + (blobName == null ? "" : blobName);
     }
 }
diff --git a/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/storage/AzureStorageServiceImpl.java b/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/storage/AzureStorageServiceImpl.java
index 58c73bc1e57..86e6ba93f11 100644
--- a/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/storage/AzureStorageServiceImpl.java
+++ b/plugins/cloud-azure/src/main/java/org/elasticsearch/cloud/azure/storage/AzureStorageServiceImpl.java
@@ -173,14 +173,24 @@ public class AzureStorageServiceImpl extends AbstractLifecycleComponent<AzureSto
         logger.debug("listing container [{}], keyPath [{}], prefix [{}]", container, keyPath, prefix);
         MapBuilder<String, BlobMetaData> blobsBuilder = MapBuilder.newMapBuilder();
 
-        CloudBlobContainer blob_container = client.getContainerReference(container);
-        if (blob_container.exists()) {
-            for (ListBlobItem blobItem : blob_container.listBlobs(keyPath + prefix)) {
+        CloudBlobContainer blobContainer = client.getContainerReference(container);
+        if (blobContainer.exists()) {
+            for (ListBlobItem blobItem : blobContainer.listBlobs(keyPath + (prefix == null ? "" : prefix))) {
                 URI uri = blobItem.getUri();
                 logger.trace("blob url [{}]", uri);
-                String blobpath = uri.getPath().substring(container.length() + 1);
-                BlobProperties properties = blob_container.getBlockBlobReference(blobpath).getProperties();
-                String name = blobpath.substring(keyPath.length() + 1);
+
+                // uri.getPath is of the form /container/keyPath.* and we want to strip off the /container/
+                // this requires 1 + container.length() + 1, with each 1 corresponding to one of the /
+                String blobPath = uri.getPath().substring(1 + container.length() + 1);
+
+                CloudBlockBlob blob = blobContainer.getBlockBlobReference(blobPath);
+
+                // fetch the blob attributes from Azure (getBlockBlobReference does not do this)
+                // this is needed to retrieve the blob length (among other metadata) from Azure Storage
+                blob.downloadAttributes();
+
+                BlobProperties properties = blob.getProperties();
+                String name = blobPath.substring(keyPath.length());
                 logger.trace("blob url [{}], name [{}], size [{}]", uri, name, properties.getLength());
                 blobsBuilder.put(name, new PlainBlobMetaData(name, properties.getLength()));
             }