Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 71 additions & 42 deletions src/discof/restore/fd_snaprd_tile.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ struct fd_snaprd_tile {
int malformed;
long deadline_nanos;
ulong ack_cnt;
int peer_selection;

fd_ip4_port_t addr;

Expand Down Expand Up @@ -152,6 +151,7 @@ struct fd_snaprd_tile {
uint max_incremental_snapshots_to_keep;
int entrypoints_enabled;
int gossip_peers_enabled;
int peer_selection;
} config;

struct {
Expand Down Expand Up @@ -684,7 +684,7 @@ after_credit( fd_snaprd_tile_t * ctx,
return;
}

if( FD_LIKELY( ctx->peer_selection ) ) {
if( FD_LIKELY( ctx->config.peer_selection ) ) {
fd_ssping_advance( ctx->ssping, now, ctx->selector );
fd_http_resolver_advance( ctx->ssresolver, now, ctx->selector );

Expand Down Expand Up @@ -730,7 +730,8 @@ after_credit( fd_snaprd_tile_t * ctx,

fd_sspeer_t best = fd_sspeer_selector_best( ctx->selector, 0, ULONG_MAX );
if( FD_UNLIKELY( !best.addr.l ) ) {
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS;
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS;
ctx->deadline_nanos = now+FD_SNAPRD_WAITING_FOR_PEERS_TIMEOUT_DEADLINE_NANOS;
break;
}

Expand All @@ -744,6 +745,11 @@ after_credit( fd_snaprd_tile_t * ctx,
ctx->metrics.full.bytes_total = ctx->local_in.full_snapshot_size;
ctx->state = FD_SNAPRD_STATE_READING_FULL_FILE;
} else {
if( FD_UNLIKELY( !ctx->config.do_download ) ) {
FD_LOG_ERR(( "Local snapshot `%s` is too old and downloading new snapshots is disabled. "
"Please enable downloading via [snapshots.download] and restart.", ctx->local_in.full_snapshot_path ) );
Comment on lines +749 to +750
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any existing precedent for LOG_ERR'ing here? This is a situation where the validator is not expected to make progress or start up correctly, but it's not really a "crashable" edge case...

So is there some precedent for "alert loudly and keep running", or do we prefer to crash? Presumably most operators run their validators under systemd or something similar, which will automatically restart the validator if it crashes or stops, and that won't give them any chance to correct the config.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separately, I'm not sure how I feel about asking the operator to enable downloading. Maybe they just need to get a newer snapshot some other way, external to our code. We should also probably log the local slot, cluster slot, and the max slot age so they can see those values.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with logging the local slot, cluster slot, and the max slot age. I'm not sure whether we have precedent for alerting loudly and continuing, or for crashing. We added this download config option because Agave had something similar: no_snapshot_fetch, which disabled downloading snapshots and only loaded from local snapshots if present.

}

if( FD_UNLIKELY( !ctx->config.incremental_snapshot_fetch ) ) send_expected_slot( stem, best.ssinfo.full.slot );

fd_sspeer_t best_incremental = fd_sspeer_selector_best( ctx->selector, 1, best.ssinfo.full.slot );
Expand All @@ -765,7 +771,8 @@ after_credit( fd_snaprd_tile_t * ctx,

fd_sspeer_t best = fd_sspeer_selector_best( ctx->selector, 1, ctx->http.full.slot );
if( FD_UNLIKELY( !best.addr.l ) ) {
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS_INCREMENTAL;
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS_INCREMENTAL;
ctx->deadline_nanos = now+FD_SNAPRD_WAITING_FOR_PEERS_TIMEOUT_DEADLINE_NANOS;
break;
}

Expand All @@ -790,9 +797,14 @@ after_credit( fd_snaprd_tile_t * ctx,
ctx->ack_cnt = 0UL;

if( FD_UNLIKELY( ctx->malformed ) ) {
fd_stem_publish( stem, 0UL, FD_SNAPSHOT_MSG_CTRL_RESET_FULL, 0UL, 0UL, 0UL, 0UL, 0UL );
ctx->state = FD_SNAPRD_STATE_FLUSHING_FULL_FILE_RESET;
ctx->malformed = 0;
if( FD_LIKELY( ctx->config.peer_selection ) ) {
fd_stem_publish( stem, 0UL, FD_SNAPSHOT_MSG_CTRL_RESET_FULL, 0UL, 0UL, 0UL, 0UL, 0UL );
ctx->state = FD_SNAPRD_STATE_FLUSHING_FULL_FILE_RESET;
ctx->malformed = 0;
} else {
FD_LOG_ERR(( "Local incremental snapshot `%s` is malformed and no snapshot sources are configured. "
"Please add snapshot sources and restart. ", ctx->local_in.incremental_snapshot_path ));
}
break;
}

Expand Down Expand Up @@ -822,6 +834,18 @@ after_credit( fd_snaprd_tile_t * ctx,
if( FD_UNLIKELY( ctx->ack_cnt<NUM_SNAP_CONSUMERS ) ) break;
ctx->ack_cnt = 0UL;

if( FD_UNLIKELY( ctx->malformed ) ) {
if( FD_LIKELY( ctx->config.peer_selection ) ) {
fd_stem_publish( stem, 0UL, FD_SNAPSHOT_MSG_CTRL_RESET_FULL, 0UL, 0UL, 0UL, 0UL, 0UL );
ctx->state = FD_SNAPRD_STATE_FLUSHING_FULL_FILE_RESET;
ctx->malformed = 0;
} else {
FD_LOG_ERR(( "Local full snapshot `%s` is malformed and no snapshot sources are configured. "
"Please add snapshot sources and restart. ", ctx->local_in.full_snapshot_path ));
}
break;
}

if( FD_LIKELY( !ctx->config.incremental_snapshot_fetch ) ) {
ctx->state = FD_SNAPRD_STATE_SHUTDOWN;
metrics_write( ctx ); /* ensures that shutdown state is written to metrics workspace before the tile actually shuts down */
Expand Down Expand Up @@ -920,7 +944,7 @@ before_frag( fd_snaprd_tile_t * ctx FD_PARAM_UNUSED,
return !( ( sig==FD_GOSSIP_UPDATE_TAG_CONTACT_INFO ||
sig==FD_GOSSIP_UPDATE_TAG_CONTACT_INFO_REMOVE ||
sig==FD_GOSSIP_UPDATE_TAG_SNAPSHOT_HASHES ) &&
( ctx->config.entrypoints_enabled || ctx->config.gossip_peers_enabled ) && ctx->peer_selection );
( ctx->config.entrypoints_enabled || ctx->config.gossip_peers_enabled ) && ctx->config.peer_selection );
}
return 0;
}
Expand Down Expand Up @@ -1091,12 +1115,6 @@ privileged_init( fd_topo_t * topo,

fd_memset( &ctx->metrics, 0, sizeof(ctx->metrics) );

/* By default, the snaprd tile selects peers and its initial state is
WAITING_FOR_PEERS. */
ctx->peer_selection = 1;
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS;
ctx->deadline_nanos = fd_log_wallclock() + FD_SNAPRD_WAITING_FOR_PEERS_TIMEOUT_DEADLINE_NANOS;

ctx->local_in.full_snapshot_fd = -1;
ctx->local_in.incremental_snapshot_fd = -1;
ctx->local_out.dir_fd = -1;
Expand Down Expand Up @@ -1152,41 +1170,52 @@ privileged_init( fd_topo_t * topo,
if( FD_UNLIKELY( !S_ISREG( incremental_stat.st_mode ) ) ) FD_LOG_ERR(( "incremental snapshot path `%s` is not a regular file", incremental_path ));
ctx->local_in.incremental_snapshot_size = (ulong)incremental_stat.st_size;
}
}

ctx->local_out.dir_fd = -1;
ctx->local_out.full_snapshot_fd = -1;
ctx->local_out.incremental_snapshot_fd = -1;

if( FD_UNLIKELY( tile->snaprd.maximum_local_snapshot_age==0U ) ) {
/* Disable peer selection if we are reading snapshots from disk
and there is no maximum local snapshot age set. Set the
initial state to READING_FULL_FILE to avoid peer selection
logic.

TODO: Why? Document in TOML. */
ctx->peer_selection = 0;
ctx->state = FD_SNAPRD_STATE_READING_FULL_FILE;
ctx->metrics.full.bytes_total = ctx->local_in.full_snapshot_size;
FD_LOG_NOTICE(( "reading full snapshot from local file `%s`", ctx->local_in.full_snapshot_path ));
if( FD_UNLIKELY( !tile->snaprd.entrypoints_enabled &&
!tile->snaprd.gossip_peers_enabled &&
!tile->snaprd.http.peers_cnt ) ) {
if( FD_UNLIKELY( ctx->local_in.full_snapshot_slot==ULONG_MAX ) ) {
FD_LOG_ERR(("No local full snapshots found in `%s` and no snapshot sources configured. "
"Please add snapshot sources and restart. ", tile->snaprd.snapshots_path ));
}
}

/* Set up download descriptors because even if we have local
if( FD_UNLIKELY( tile->snaprd.incremental_snapshot_fetch && ctx->local_in.incremental_snapshot_slot==ULONG_MAX ) ) {
FD_LOG_ERR(( "No local incremental snapshots found in `%s` and no snapshot sources configured. "
"Please add snapshot sources or set [snapshots.incremental_snapshots] to false and restart. ", tile->snaprd.snapshots_path ));
}

/* Disable peer selection when there are no configured snapshot
sources but local snapshots exist. The snaprd state machine is
configured to start in the READING_FULL_FILE state and does not
attempt to select peers to download snapshots from. */
FD_TEST( ctx->local_in.full_snapshot_slot!=ULONG_MAX );
ctx->config.peer_selection = 0;
ctx->state = FD_SNAPRD_STATE_READING_FULL_FILE;
ctx->metrics.full.bytes_total = ctx->local_in.full_snapshot_size;
FD_LOG_NOTICE(( "reading full snapshot from local file `%s`", ctx->local_in.full_snapshot_path ));
} else {
/* Set up download descriptors because even if we have local
snapshots, we may need to download new snapshots if the local
snapshots are too old. */
ctx->local_out.dir_fd = open( tile->snaprd.snapshots_path, O_DIRECTORY|O_CLOEXEC );
if( FD_UNLIKELY( -1==ctx->local_out.dir_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", tile->snaprd.snapshots_path, errno, fd_io_strerror( errno ) ));
ctx->local_out.dir_fd = open( tile->snaprd.snapshots_path, O_DIRECTORY|O_CLOEXEC );
if( FD_UNLIKELY( -1==ctx->local_out.dir_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", tile->snaprd.snapshots_path, errno, fd_io_strerror( errno ) ));

FD_TEST( fd_cstr_printf_check( ctx->local_out.full_snapshot_path, PATH_MAX, NULL, "%s/snapshot.tar.bz2-partial", tile->snaprd.snapshots_path ) );
ctx->local_out.full_snapshot_fd = openat( ctx->local_out.dir_fd, "snapshot.tar.bz2-partial", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, S_IRUSR|S_IWUSR );
if( FD_UNLIKELY( -1==ctx->local_out.full_snapshot_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", ctx->local_out.full_snapshot_path, errno, fd_io_strerror( errno ) ));
FD_TEST( fd_cstr_printf_check( ctx->local_out.full_snapshot_path, PATH_MAX, NULL, "%s/snapshot.tar.bz2-partial", tile->snaprd.snapshots_path ) );
ctx->local_out.full_snapshot_fd = openat( ctx->local_out.dir_fd, "snapshot.tar.bz2-partial", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, S_IRUSR|S_IWUSR );
if( FD_UNLIKELY( -1==ctx->local_out.full_snapshot_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", ctx->local_out.full_snapshot_path, errno, fd_io_strerror( errno ) ));

if( FD_LIKELY( tile->snaprd.incremental_snapshot_fetch ) ) {
FD_TEST( fd_cstr_printf_check( ctx->local_out.incremental_snapshot_path, PATH_MAX, NULL, "%s/incremental-snapshot.tar.bz2-partial", tile->snaprd.snapshots_path ) );
ctx->local_out.incremental_snapshot_fd = openat( ctx->local_out.dir_fd, "incremental-snapshot.tar.bz2-partial", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, S_IRUSR|S_IWUSR );
if( FD_UNLIKELY( -1==ctx->local_out.incremental_snapshot_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", ctx->local_out.incremental_snapshot_path, errno, fd_io_strerror( errno ) ));
} else {
ctx->local_out.incremental_snapshot_fd = -1;
if( FD_LIKELY( tile->snaprd.incremental_snapshot_fetch ) ) {
FD_TEST( fd_cstr_printf_check( ctx->local_out.incremental_snapshot_path, PATH_MAX, NULL, "%s/incremental-snapshot.tar.bz2-partial", tile->snaprd.snapshots_path ) );
ctx->local_out.incremental_snapshot_fd = openat( ctx->local_out.dir_fd, "incremental-snapshot.tar.bz2-partial", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, S_IRUSR|S_IWUSR );
if( FD_UNLIKELY( -1==ctx->local_out.incremental_snapshot_fd ) ) FD_LOG_ERR(( "open() failed `%s` (%i-%s)", ctx->local_out.incremental_snapshot_path, errno, fd_io_strerror( errno ) ));
}

/* By default, the snaprd tile selects peers and its initial state
is WAITING_FOR_PEERS. */
ctx->config.peer_selection = 1;
ctx->state = FD_SNAPRD_STATE_WAITING_FOR_PEERS;
ctx->deadline_nanos = fd_log_wallclock() + FD_SNAPRD_WAITING_FOR_PEERS_TIMEOUT_DEADLINE_NANOS;
}
}

Expand Down
Loading