Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,17 @@ public final class IndexNotAvailableException extends UncheckedInternalRequestEx
* @param index the index
*/
public IndexNotAvailableException(Index index)
{
this(index.getIndexMetadata().name);
}

/**
* Creates a new <code>IndexNotAvailableException</code> for the specified index.
* @param indexName the index name
*/
public IndexNotAvailableException(String indexName)
{
super(RequestFailureReason.INDEX_NOT_AVAILABLE,
String.format("The secondary index '%s' is not yet available", index.getIndexMetadata().name));
String.format("The secondary index '%s' is not yet available", indexName));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,9 @@ static SegmentMetadata.ComponentMetadataMap createMetadataMap(long termsOffset,
public static PqInfo getPqIfPresent(IndexContext indexContext, Function<VectorCompression, Boolean> matcher)
{
// Retrieve the first compressed vectors for a segment with at least MAX_PQ_TRAINING_SET_SIZE rows
// or the one with the most rows if none reach that size
// or the one with the most rows if none reach that size. Can safely ignore whether the view is queryable
// because we're just getting the PQ, and if one is loaded, we can assume it was validated (at least as much
// as we validate any other index component for reads).
var view = indexContext.getReferencedView(TimeUnit.SECONDS.toNanos(5));
if (view == null)
{
Expand Down
4 changes: 4 additions & 0 deletions src/java/org/apache/cassandra/index/sai/plan/QueryView.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.cassandra.db.memtable.Memtable;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.dht.Bounds;
import org.apache.cassandra.index.IndexNotAvailableException;
import org.apache.cassandra.index.sai.IndexContext;
import org.apache.cassandra.index.sai.QueryContext;
import org.apache.cassandra.index.sai.SSTableIndex;
Expand Down Expand Up @@ -143,6 +144,9 @@ protected QueryView build() throws MissingIndexException
if (!indexContext.isIndexed())
throw new MissingIndexException(indexContext);

if (!saiView.isQueryable())
throw new IndexNotAvailableException(indexContext.getIndexName());

var sstableReaders = new ArrayList<SSTableReader>(saiView.size());
// These are already referenced because they are referenced by the same view we just referenced.
// TODO review saiView.match() method for boolean predicates.
Expand Down
16 changes: 12 additions & 4 deletions src/java/org/apache/cassandra/index/sai/view/IndexViewManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ public IndexViewManager(IndexContext context)
IndexViewManager(IndexContext context, Collection<SSTableIndex> indices)
{
this.context = context;
this.viewRef.set(new View(context, indices));
// A view starts our as non-queryable because C* subsequently creates this IndexViewManager before completing
// the index build. Once the build is done, it replaces the view with a queryable one.
this.viewRef.set(new View(context, indices, false));
Copy link

@jasonstack jasonstack Oct 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it works on CNDB which skips index build on writer.

If we create index on an empty table, existing CNDB directly marks the index as queryable. But with this change, will read request fail because view.isQueryable = false?

Can you create a CNDB PR to verify if tests are working?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this comment. I was wondering if there was different logic in CNDB. I would tend to think this logic is captured in a test though, right? Here is the test pr https://github.com/riptano/cndb/pull/15531. I haven't reviewed the test failures yet, but I will now. None of them appear directly related to SAI at first glance, though.

}

public View getView()
Expand Down Expand Up @@ -128,7 +130,7 @@ public Set<SSTableContext> update(Collection<SSTableReader> oldSSTables,
referencedSSTableIndexes.add(sstableIndex);
}

newView = new View(context, referencedSSTableIndexes);
newView = new View(context, referencedSSTableIndexes, indexes.right.isEmpty());
}
while (newView == null || !viewRef.compareAndSet(currentView, newView));

Expand Down Expand Up @@ -177,6 +179,8 @@ public void prepareSSTablesForRebuild(Collection<SSTableReader> sstablesToRebuil
if (iterations++ > 1000)
throw new IllegalStateException("Failed to prepare index view after 1000 iterations");

// Iff we keep all the indexes, then we can stay queryable.
boolean retainedAllIndexes = true;
for (var index : oldView.getIndexes())
{
if (!toRemove.contains(index.getSSTable()))
Expand All @@ -185,9 +189,13 @@ public void prepareSSTablesForRebuild(Collection<SSTableReader> sstablesToRebuil
continue outer;
newIndexes.add(index);
}
else
{
retainedAllIndexes = false;
}
}

newView = new View(context, newIndexes);
newView = new View(context, newIndexes, retainedAllIndexes);
}
while (newView == null || !viewRef.compareAndSet(oldView, newView));
oldView.release();
Expand All @@ -203,7 +211,7 @@ public void prepareSSTablesForRebuild(Collection<SSTableReader> sstablesToRebuil
public void invalidate(boolean indexWasDropped)
{
// No need to loop here because we don't use the old view when building the new view.
var oldView = viewRef.getAndSet(new View(context, Collections.emptySet()));
var oldView = viewRef.getAndSet(new View(context, Collections.emptySet(), false));
if (indexWasDropped)
oldView.markIndexWasDropped();
else
Expand Down
10 changes: 9 additions & 1 deletion src/java/org/apache/cassandra/index/sai/view/View.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
public class View implements Iterable<SSTableIndex>
{
private final Map<Descriptor, SSTableIndex> view;
private final boolean isQueryable;
private final AtomicInteger references = new AtomicInteger(1);
private volatile boolean indexWasDropped;

Expand All @@ -60,9 +61,11 @@ public class View implements Iterable<SSTableIndex>
* @param indexes the indexes. Note that the referencing logic for these indexes is handled
* outside of this constructor and all indexes are assumed to have been referenced already.
* The view will release the indexes when it is finally released.
* @param isQueryable true if the view is queryable; false otherwise.
*/
public View(IndexContext context, Collection<SSTableIndex> indexes)
public View(IndexContext context, Collection<SSTableIndex> indexes, boolean isQueryable)
{
this.isQueryable = isQueryable;
this.view = new HashMap<>();
this.keyValidator = context.keyValidator();

Expand Down Expand Up @@ -151,6 +154,11 @@ public int size()
return view.size();
}

public boolean isQueryable()
{
return isQueryable;
}

/**
* Tells if an index for the given sstable exists.
* It's equivalent to {@code getSSTableIndex(descriptor) != null }.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,18 +189,26 @@ public void testIndexRebuildWhenAddingSStableViaRemoteReload()
// unlink sstable and index context: expect no rows to be read by base and index
cfs.clearUnsafe();
IndexMetadata indexMetadata = cfs.metadata().indexes.iterator().next();
((StorageAttachedIndex) cfs.getIndexManager().getIndex(indexMetadata)).getIndexContext().prepareSSTablesForRebuild(sstables);
// First confirm that if we pass an empty set, the index stays queryable. Not sure if this is a realistic
// code path, but it seems plausible that we could have an unrelated sstable passed to this method in the
// event of a data race, and in that case, we should stay queryable.
((StorageAttachedIndex) cfs.getIndexManager().getIndex(indexMetadata)).getIndexContext().prepareSSTablesForRebuild(Collections.emptySet());
assertEmpty(execute("SELECT * FROM %s WHERE a=1"));
assertEmpty(execute("SELECT * FROM %s WHERE c=1"));

// TODO why? This change reverts back to behavior from before https://github.com/datastax/cassandra/pull/1491,
// but it seems invalid.
// track sstable again: expect no rows to be read by index
// Now pass the actual sstables: expect no rows to be read by base and for the index to be non-queryable because
// preparing an sstable for rebuild removes the index from the view, which would otherwise result in
// partial results.
((StorageAttachedIndex) cfs.getIndexManager().getIndex(indexMetadata)).getIndexContext().prepareSSTablesForRebuild(sstables);
assertEmpty(execute("SELECT * FROM %s WHERE a=1"));
assertInvalid("SELECT * FROM %s WHERE c=1");

// track sstable again: expect index to remain non-queryable because it is not rebuilt (the view hasn't been updated)
Comment on lines -196 to +206
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jasonstack - I see that you wrote these test assertions in #309. We've flipped this logic a couple of times this year. First we went towards non-queryable with https://github.com/datastax/cassandra/pull/1491/files#diff-98107b1903c60731ef493d2e86f26460e40d2b0e60862db3a49ea27ba4e282fa. Then, we reverted it back to the #309 state when I merged #1700. You also commented that this test could be removed here: #1700 (comment).

So, my key question: do you have any concern with the design change to make the view object store the index's queryability state?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you have any concern with the design change to make the view object store the index's queryability state?

With this change, we effectively have two queryability states: one in SecondaryIndexManager (SIM) and another in View. This introduces a potential race between the two, so we need be careful the update order.

Marking queryable:

  • We should mark the View first, then update SIM.
  • Reason: the coordinator (which can also act as a replica) checks SIM for queryability when routing. If SIM is marked queryable before View, queries can be sent to a replica (itself) whose view isn’t actually ready, causing failures.

Marking non-queryable. There are two possible approaches

  • View first, then SIM (current PR behavior) – prevents the view from serving inconsistent results. However, in-flight queries might fail if SIM is still routing but View is already non-queryable.
  • SIM first, then View – avoids both inconsistencies and in-flight query failures, but adds lot more complexity to the code (It requires the in-flight query that passes SIM acquires a queryable view..).

I am +1 on keeping the code simple with current PR approach. WDYT?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You also commented that this test could be removed here: #1700 (comment).

I think we can remove it and rely on CNDB side tests

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this change, we effectively have two queryability states: one in SecondaryIndexManager (SIM) and another in View. This introduces a potential race between the two, so we need be careful the update order.

We already have a race. I am just proposing one way to solve it. We could alternatively make SIM get a reference to the view when checking queryability.

In general, I tend to agree with you that two ways to track the same concept is a design smell. As the new tests show, there is a race already where it is possible to get an incomplete set of sstable indexes, which is all I am trying to fix.

cfs.getTracker().addInitialSSTables(sstables);
assertRows(execute("SELECT * FROM %s WHERE a=1"), row(1, 1, 1));
assertEmpty(execute("SELECT * FROM %s WHERE c=1"));
assertInvalid("SELECT * FROM %s WHERE c=1");

// remote reload should trigger index rebuild
// remote reload should trigger index rebuild, making the index queryable again
cfs.getTracker().notifySSTablesChanged(Collections.emptySet(), sstables, OperationType.REMOTE_RELOAD, Optional.empty(), null);
waitForIndexBuilds(KEYSPACE, indexName); // this is needed because index build on remote reload is async
assertRows(execute("SELECT * FROM %s WHERE a=1"), row(1, 1, 1));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sai.plan;

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

import org.junit.Test;

import org.apache.cassandra.exceptions.ReadFailureException;
import org.apache.cassandra.index.sai.SAITester;
import org.apache.cassandra.index.sai.view.IndexViewManager;
import org.apache.cassandra.inject.ActionBuilder;
import org.apache.cassandra.inject.Expression;
import org.apache.cassandra.inject.Injection;
import org.apache.cassandra.inject.Injections;
import org.apache.cassandra.inject.InvokePointBuilder;
import org.awaitility.Awaitility;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

/**
* When a query is in progress and an index build fails, we need to mark the index as non-queryable instead of
* returning partial or incorrect results.
*/
public class FailedSSTableIndexLoadWhileQueryingTest extends SAITester
{

@Test
public void testSSTableIndexInitFailsAfterQueryViewBuildEqualityQuery() throws Throwable
{
createTable("CREATE TABLE %s (k text PRIMARY KEY, x int)");

var indexName = createIndex("CREATE CUSTOM INDEX ON %s(x) USING 'StorageAttachedIndex'");

execute("INSERT INTO %s (k, x) VALUES (?, ?)", "a", 0);
execute("INSERT INTO %s (k, x) VALUES (?, ?)", "b", 0);
execute("INSERT INTO %s (k, x) VALUES (?, ?)", "c", 1);

testSSTableIndexInitFailsAfterQueryViewBuiltBeforeQueryExecution(indexName, "SELECT k FROM %s WHERE x = 0");
}

@Test
public void testSSTableIndexInitFailsAfterQueryViewBuildNotContainsQuery() throws Throwable
{
createTable("CREATE TABLE %s (k text PRIMARY KEY, x set<int>)");

var indexName = createIndex("CREATE CUSTOM INDEX ON %s(x) USING 'StorageAttachedIndex'");

execute("INSERT INTO %s (k, x) VALUES ('a', {1, 2, 3})");
execute("INSERT INTO %s (k, x) VALUES ('b', {1, 2, 3})");
execute("INSERT INTO %s (k, x) VALUES ('c', {1, 2, 4})");

testSSTableIndexInitFailsAfterQueryViewBuiltBeforeQueryExecution(indexName, "SELECT k FROM %s WHERE x NOT CONTAINS 3");
}

private void testSSTableIndexInitFailsAfterQueryViewBuiltBeforeQueryExecution(String indexName, String query) throws Throwable
{
// This bug is only reachable when you fail to replace an sstable via compaction.
flush();

Injection failSSTableIndexLoadOnInit = Injections.newCustom("FailSSTableIndexLoadOnInit-" + indexName)
.add(InvokePointBuilder.newInvokePoint()
.onClass("org.apache.cassandra.index.sai.SSTableIndex")
.onMethod("<init>")
.atEntry()
)
.add(ActionBuilder.newActionBuilder().actions()
.doThrow(java.lang.RuntimeException.class, Expression.quote("Byteman-injected fault in MemtableIndexWriter.complete"))
)
.build();
Injections.inject(failSSTableIndexLoadOnInit);

// We use two barriers to ensure that we first flush and fail to load the index (thereby putting in place
// an invalid view) and then to make sure the query gets that view.
var badViewPoint = InvokePointBuilder.newInvokePoint().onClass(IndexViewManager.class).onMethod("update").atExit();
var badViewBarrier = Injections.newBarrier("pause_after_setting_bad_view", 2, false).add(badViewPoint).build();
Injections.inject(badViewBarrier);

// Flush in a separate thread since the badViewPointBarrier will block it, thereby preventing it from
// marking the index as non-queryable.
ForkJoinPool.commonPool().submit(() -> compact());

// Wait for compaction to reach the badViewBarrier (the point where we know the view is bad but the
// index is still considered queryable).
Awaitility.waitAtMost(5, TimeUnit.SECONDS).until(() -> badViewBarrier.getCount() == 1);

// The query hits the getViewBarrier.
assertThrows(ReadFailureException.class, () -> execute(query));
assertEquals("Confirm that flush hit the barrier, but did not pass it", 1, badViewBarrier.getCount());

// Confirm index is considered queryable. The primary point of the remaining assertions is to show to readers
// that we have a period of time after creating a broken view and before marking the index as non-queryable.
assertTrue(isIndexQueryable(KEYSPACE, indexName));
// Arrive and unblock the thread that will mark the index as non-queryable.
badViewBarrier.arrive();
// Expect index to go non-queryable soon.
Awaitility.waitAtMost(5, TimeUnit.SECONDS).until(() -> !isIndexQueryable(KEYSPACE, indexName));
}
}