Skip to content
5 changes: 5 additions & 0 deletions docs/changelog/134152.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 134152
summary: Correctly update SLM stats with master shutdown
area: ILM+SLM
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
* cluster state as custom metadata. When a snapshot is started by SLM, it is added to this set. Upon completion,
* is it removed. If a snapshot does not record its failure in SnapshotLifecycleStats, likely due to a master shutdown,
* it will not be removed from the registered set. A subsequent snapshot will then find that a registered snapshot
* is no longer running and will infer that it failed, updating SnapshotLifecycleStats accordingly.
* is no longer running and update SnapshotLifecycleStats based on the status of the snapshot.
*/
public class RegisteredPolicySnapshots implements Metadata.ProjectCustom {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.elasticsearch.action.admin.cluster.snapshots.restore.TransportRestoreSnapshotAction;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.cluster.routing.UnassignedInfo;
Expand Down Expand Up @@ -72,6 +73,7 @@ public class SLMSnapshotBlockingIntegTests extends AbstractSnapshotIntegTestCase
private static final String NEVER_EXECUTE_CRON_SCHEDULE = "* * * 31 FEB ? *";

static final String REPO = "my-repo";
List<String> masterNodeNames = null;
List<String> dataNodeNames = null;

@Override
Expand All @@ -85,7 +87,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
@Before
public void ensureClusterNodes() {
logger.info("--> starting enough nodes to ensure we have enough to safely stop for tests");
internalCluster().startMasterOnlyNodes(2);
masterNodeNames = internalCluster().startMasterOnlyNodes(2);
dataNodeNames = internalCluster().startDataOnlyNodes(2);
ensureGreen();
}
Expand Down Expand Up @@ -329,6 +331,185 @@ public void testRetentionWithMultipleRepositories() throws Exception {
testUnsuccessfulSnapshotRetention(randomBoolean());
}

// Test that SLM stats and lastSuccess/lastFailure are correctly updated with master shutdown
public void testSLMWithMasterShutdown() throws Exception {
final String indexName = "test";
final String policyName = "test-policy";
int clusterSize = masterNodeNames.size() + dataNodeNames.size();
indexRandomDocs(indexName, 20);
createRepository(REPO, "mock");

createSnapshotPolicy(
policyName,
"snap",
NEVER_EXECUTE_CRON_SCHEDULE,
REPO,
indexName,
true,
false,
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null)
);

// block snapshot from completing
blockMasterFromFinalizingSnapshotOnIndexFile(REPO);

// first SLM execution
final String snapshotName = executePolicy(policyName);
final String initialMaster = internalCluster().getMasterName();
waitForBlock(initialMaster, REPO);

// restart master
internalCluster().restartNode(initialMaster);
ensureStableCluster(clusterSize);
awaitNoMoreRunningOperations();

// ensure snapshot is completed successfully after master failover
assertBusy(() -> {
final SnapshotInfo snapshotInfo;
try {
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
.setSnapshots(snapshotName)
.get();
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
} catch (SnapshotMissingException sme) {
throw new AssertionError(sme);
}
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
}, 30L, TimeUnit.SECONDS);
assertSnapshotSuccessful(snapshotName);

// the SLM policy metadata has not been updated due to master shutdown
assertBusy(() -> {
SnapshotLifecyclePolicyItem policy = client().execute(
GetSnapshotLifecycleAction.INSTANCE,
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
).get().getPolicies().getFirst();
assertNull(policy.getLastSuccess());
assertNull(policy.getLastFailure());
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
assertEquals(0, policy.getPolicyStats().getSnapshotTakenCount());
});

// 2nd SLM execution, it should pick up the last missing stats
String snapshotSecond = executePolicy(policyName);

awaitNoMoreRunningOperations();
assertSnapshotSuccessful(snapshotSecond);

// stats should have 2 successful snapshots, 1 from the new snapshot and 1 from previous success
assertBusy(() -> {
SnapshotLifecyclePolicyItem policy = client().execute(
GetSnapshotLifecycleAction.INSTANCE,
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
).get().getPolicies().getFirst();
assertNull(policy.getLastFailure());
assertNotNull(policy.getLastSuccess());
assertEquals(snapshotSecond, policy.getLastSuccess().getSnapshotName());
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
assertEquals(2, policy.getPolicyStats().getSnapshotTakenCount());
});
}

public void testSLMWithMasterShutdownAndDeletedSnapshot() throws Exception {
final String indexName = "test";
final String policyName = "test-policy";
int clusterSize = masterNodeNames.size() + dataNodeNames.size();
indexRandomDocs(indexName, 20);
createRepository(REPO, "mock");

createSnapshotPolicy(
policyName,
"snap",
NEVER_EXECUTE_CRON_SCHEDULE,
REPO,
indexName,
true,
false,
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null)
);

// block snapshot from completing
blockMasterFromFinalizingSnapshotOnIndexFile(REPO);

// first SLM execution
final String snapshotName = executePolicy(policyName);
final String initialMaster = internalCluster().getMasterName();
waitForBlock(initialMaster, REPO);

// restart master
internalCluster().restartNode(initialMaster);
ensureStableCluster(clusterSize);
awaitNoMoreRunningOperations();

// ensure snapshot is completed successfully after master failover
assertBusy(() -> {
final SnapshotInfo snapshotInfo;
try {
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
.setSnapshots(snapshotName)
.get();
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
} catch (SnapshotMissingException sme) {
throw new AssertionError(sme);
}
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
}, 30L, TimeUnit.SECONDS);
assertSnapshotSuccessful(snapshotName);

// the SLM policy metadata has not been updated due to master shutdown
assertBusy(() -> {
SnapshotLifecyclePolicyItem policy = client().execute(
GetSnapshotLifecycleAction.INSTANCE,
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
).get().getPolicies().getFirst();
assertNull(policy.getLastSuccess());
assertNull(policy.getLastFailure());
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
assertEquals(0, policy.getPolicyStats().getSnapshotTakenCount());
});

// delete the snapshot, simulate missing snapshot from repo
assertBusy(() -> {
AcknowledgedResponse response = clusterAdmin().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, REPO, snapshotName).get();
assertTrue(response.isAcknowledged());
});

// 2nd SLM execution, it should pick up the last missing stats
String snapshotSecond = executePolicy(policyName);

awaitNoMoreRunningOperations();
assertSnapshotSuccessful(snapshotSecond);

// stats should have 1 successful and 1 failed snapshot, the deleted snapshot is inferred failure
assertBusy(() -> {
SnapshotLifecyclePolicyItem policy = client().execute(
GetSnapshotLifecycleAction.INSTANCE,
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
).get().getPolicies().getFirst();
assertNotNull(policy.getLastSuccess());
assertEquals(snapshotSecond, policy.getLastSuccess().getSnapshotName());
assertNotNull(policy.getLastFailure());
assertEquals(snapshotName, policy.getLastFailure().getSnapshotName());
assertEquals(1, policy.getPolicyStats().getSnapshotFailedCount());
assertEquals(1, policy.getPolicyStats().getSnapshotTakenCount());
});
}

private void assertSnapshotSuccessful(String snapshotName) throws Exception {
assertBusy(() -> {
final SnapshotInfo snapshotInfo;
try {
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
.setSnapshots(snapshotName)
.get();
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
} catch (SnapshotMissingException sme) {
throw new AssertionError(sme);
}
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
});
}

private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Exception {
final String indexName = "test-idx";
final String policyId = "test-policy";
Expand Down
Loading