diff --git a/.github/workflows/test.e2e.persistent.yml b/.github/workflows/test.e2e.persistent.yml index c2230d30aa45..a20758ff3030 100644 --- a/.github/workflows/test.e2e.persistent.yml +++ b/.github/workflows/test.e2e.persistent.yml @@ -28,7 +28,7 @@ jobs: run: ./scripts/build.sh -r - name: Run e2e tests with persistent network shell: bash - run: ./scripts/tests.e2e.persistent.sh ./build/avalanchego + run: E2E_SERIAL=1 ./scripts/tests.e2e.persistent.sh ./build/avalanchego - name: Upload testnet network dir uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/test.e2e.yml b/.github/workflows/test.e2e.yml index 75a1e35762d2..18ca3de8f17f 100644 --- a/.github/workflows/test.e2e.yml +++ b/.github/workflows/test.e2e.yml @@ -28,7 +28,7 @@ jobs: run: ./scripts/build.sh -r - name: Run e2e tests shell: bash - run: ./scripts/tests.e2e.sh ./build/avalanchego + run: E2E_SERIAL=1 ./scripts/tests.e2e.sh ./build/avalanchego - name: Upload testnet network dir uses: actions/upload-artifact@v3 with: diff --git a/scripts/tests.e2e.sh b/scripts/tests.e2e.sh index e90bb5d70a1f..fd78ec45093f 100755 --- a/scripts/tests.e2e.sh +++ b/scripts/tests.e2e.sh @@ -5,6 +5,7 @@ set -euo pipefail # e.g., # ./scripts/build.sh # ./scripts/tests.e2e.sh ./build/avalanchego +# E2E_SERIAL=1 ./scripts/tests.e2e.sh ./build/avalanchego if ! [[ "$0" =~ scripts/tests.e2e.sh ]]; then echo "must be run from repository root" exit 255 @@ -43,10 +44,28 @@ else fi ################################# -# - Execute in parallel (-p) with the ginkgo cli to minimize execution time. -# The test binary by itself isn't capable of running specs in parallel. +# Determine ginkgo args +GINKGO_ARGS="" +if [[ -n "${E2E_SERIAL:-}" ]]; then + # Specs will be executed serially. This supports running e2e tests in CI + # where parallel execution of tests that start new nodes beyond the + # initial set of validators could overload the free tier CI workers. + # Forcing serial execution in this test script instead of marking + # resource-hungry tests as serial supports executing the test suite faster + # on powerful development workstations. + echo "tests will be executed serially to minimize resource requirements" +else + # Enable parallel execution of specs defined in the test binary by + # default. This requires invoking the binary via the ginkgo cli + # since the test binary isn't capable of executing specs in + # parallel. + echo "tests will be executed in parallel" + GINKGO_ARGS="-p" +fi + +################################# # - Execute in random order to identify unwanted dependency -ginkgo -p -v --randomize-all ./tests/e2e/e2e.test -- ${E2E_ARGS} \ +ginkgo ${GINKGO_ARGS} -v --randomize-all ./tests/e2e/e2e.test -- ${E2E_ARGS} \ && EXIT_CODE=$? || EXIT_CODE=$? if [[ ${EXIT_CODE} -gt 0 ]]; then diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index b9c1b49dd3aa..faeda0f7554e 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -36,6 +36,10 @@ const ( // Enough for test/custom networks. DefaultConfirmTxTimeout = 20 * time.Second + // This interval should represent the upper bound of the time + // required to start a new node on a local test network. + DefaultNodeStartTimeout = 20 * time.Second + // A long default timeout used to timeout failed operations but // unlikely to induce flaking due to unexpected resource // contention. @@ -168,3 +172,29 @@ func Eventually(condition func() bool, waitFor time.Duration, tick time.Duration } } } + +// Add an ephemeral node that is only intended to be used by a single test. Its ID and +// URI are not intended to be returned from the Network instance to minimize +// accessibility from other tests. +func AddEphemeralNode(network testnet.Network, flags testnet.FlagsMap) testnet.Node { + require := require.New(ginkgo.GinkgoT()) + + node, err := network.AddEphemeralNode(ginkgo.GinkgoWriter, flags) + require.NoError(err) + + // Ensure node is stopped on teardown. It's configuration is not removed to enable + // collection in CI to aid in troubleshooting failures. + ginkgo.DeferCleanup(func() { + tests.Outf("Shutting down ephemeral node %s\n", node.GetID()) + require.NoError(node.Stop()) + }) + + return node +} + +// Wait for the given node to report healthy. +func WaitForHealthy(node testnet.Node) { + ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) + defer cancel() + require.NoError(ginkgo.GinkgoT(), testnet.WaitForHealthy(ctx, node)) +} diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 87008535fa9a..0e5a880ce5ae 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -26,6 +26,7 @@ import ( // ensure test packages are scanned by ginkgo _ "github.com/ava-labs/avalanchego/tests/e2e/banff" _ "github.com/ava-labs/avalanchego/tests/e2e/c" + _ "github.com/ava-labs/avalanchego/tests/e2e/faultinjection" _ "github.com/ava-labs/avalanchego/tests/e2e/p" _ "github.com/ava-labs/avalanchego/tests/e2e/static-handlers" _ "github.com/ava-labs/avalanchego/tests/e2e/x" diff --git a/tests/e2e/faultinjection/duplicate_node_id.go b/tests/e2e/faultinjection/duplicate_node_id.go new file mode 100644 index 000000000000..c80bf0fc3724 --- /dev/null +++ b/tests/e2e/faultinjection/duplicate_node_id.go @@ -0,0 +1,94 @@ +// Copyright (C) 2019-2023, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package faultinjection + +import ( + "context" + "fmt" + + ginkgo "github.com/onsi/ginkgo/v2" + + "github.com/stretchr/testify/require" + + "github.com/ava-labs/avalanchego/api/info" + "github.com/ava-labs/avalanchego/config" + "github.com/ava-labs/avalanchego/ids" + "github.com/ava-labs/avalanchego/tests/e2e" + "github.com/ava-labs/avalanchego/tests/fixture/testnet" + "github.com/ava-labs/avalanchego/utils/set" +) + +var _ = ginkgo.Describe("Duplicate node handling", func() { + require := require.New(ginkgo.GinkgoT()) + + ginkgo.It("should ensure that a given Node ID (i.e. staking keypair) can be used at most once on a network", func() { + network := e2e.Env.GetNetwork() + nodes := network.GetNodes() + + ginkgo.By("creating new node") + node1 := e2e.AddEphemeralNode(network, testnet.FlagsMap{}) + e2e.WaitForHealthy(node1) + + ginkgo.By("checking that the new node is connected to its peers") + checkConnectedPeers(nodes, node1) + + ginkgo.By("creating a second new node with the same staking keypair as the first new node") + node1Flags := node1.GetConfig().Flags + node2Flags := testnet.FlagsMap{ + config.StakingTLSKeyContentKey: node1Flags[config.StakingTLSKeyContentKey], + config.StakingCertContentKey: node1Flags[config.StakingCertContentKey], + // Construct a unique data dir to ensure the two nodes' data will be stored + // separately. Usually the dir name is the node ID but in this one case the nodes have + // the same node ID. + config.DataDirKey: fmt.Sprintf("%s-second", node1Flags[config.DataDirKey]), + } + node2 := e2e.AddEphemeralNode(network, node2Flags) + + ginkgo.By("checking that the second new node fails to become healthy before timeout") + err := testnet.WaitForHealthy(e2e.DefaultContext(), node2) + require.ErrorIs(err, context.DeadlineExceeded) + + ginkgo.By("stopping the first new node") + require.NoError(node1.Stop()) + + ginkgo.By("checking that the second new node becomes healthy within timeout") + e2e.WaitForHealthy(node2) + + ginkgo.By("checking that the second new node is connected to its peers") + checkConnectedPeers(nodes, node2) + }) +}) + +// Check that a new node is connected to existing nodes and vice versa +func checkConnectedPeers(existingNodes []testnet.Node, newNode testnet.Node) { + require := require.New(ginkgo.GinkgoT()) + + // Collect the node ids of the new node's peers + infoClient := info.NewClient(newNode.GetProcessContext().URI) + peers, err := infoClient.Peers(context.Background()) + require.NoError(err) + peerIDs := set.NewSet[ids.NodeID](len(existingNodes)) + for _, peer := range peers { + peerIDs.Add(peer.ID) + } + + newNodeID := newNode.GetID() + for _, existingNode := range existingNodes { + // Check that the existing node is a peer of the new node + require.True(peerIDs.Contains(existingNode.GetID())) + + // Check that the new node is a peer + infoClient := info.NewClient(existingNode.GetProcessContext().URI) + peers, err := infoClient.Peers(context.Background()) + require.NoError(err) + isPeer := false + for _, peer := range peers { + if peer.ID == newNodeID { + isPeer = true + break + } + } + require.True(isPeer) + } +} diff --git a/tests/fixture/testnet/common.go b/tests/fixture/testnet/common.go new file mode 100644 index 000000000000..ab983e893e46 --- /dev/null +++ b/tests/fixture/testnet/common.go @@ -0,0 +1,42 @@ +// Copyright (C) 2019-2023, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package testnet + +import ( + "context" + "errors" + "fmt" + "time" +) + +const ( + DefaultNodeTickerInterval = 50 * time.Millisecond +) + +var ErrNotRunning = errors.New("not running") + +// WaitForHealthy blocks until Node.IsHealthy returns true or an error (including context timeout) is observed. +func WaitForHealthy(ctx context.Context, node Node) error { + if _, ok := ctx.Deadline(); !ok { + return fmt.Errorf("unable to wait for health for node %q with a context without a deadline", node.GetID()) + } + ticker := time.NewTicker(DefaultNodeTickerInterval) + defer ticker.Stop() + + for { + healthy, err := node.IsHealthy(ctx) + if err != nil && !errors.Is(err, ErrNotRunning) { + return fmt.Errorf("failed to wait for health of node %q: %w", node.GetID(), err) + } + if healthy { + return nil + } + + select { + case <-ctx.Done(): + return fmt.Errorf("failed to wait for health of node %q before timeout: %w", node.GetID(), ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/tests/fixture/testnet/interfaces.go b/tests/fixture/testnet/interfaces.go index 98a16cc0ff35..2c1479ec48bd 100644 --- a/tests/fixture/testnet/interfaces.go +++ b/tests/fixture/testnet/interfaces.go @@ -4,6 +4,9 @@ package testnet import ( + "context" + "io" + "github.com/ava-labs/avalanchego/ids" "github.com/ava-labs/avalanchego/node" ) @@ -12,6 +15,7 @@ import ( type Network interface { GetConfig() NetworkConfig GetNodes() []Node + AddEphemeralNode(w io.Writer, flags FlagsMap) (Node, error) } // Defines node capabilities supportable regardless of how a network is orchestrated. @@ -19,4 +23,6 @@ type Node interface { GetID() ids.NodeID GetConfig() NodeConfig GetProcessContext() node.NodeProcessContext + IsHealthy(ctx context.Context) (bool, error) + Stop() error } diff --git a/tests/fixture/testnet/local/README.md b/tests/fixture/testnet/local/README.md index 83455fd219e6..fdfbbdb4d58b 100644 --- a/tests/fixture/testnet/local/README.md +++ b/tests/fixture/testnet/local/README.md @@ -152,7 +152,10 @@ HOME │ └── config.json // C-Chain config for all nodes ├── defaults.json // Default flags and configuration for network ├── genesis.json // Genesis for all nodes - └── network.env // Sets network dir env to simplify use of network + ├── network.env // Sets network dir env to simplify use of network + └── ephemeral // Parent directory for ephemeral nodes (e.g. created by tests) + └─ NodeID-FdxnAvr4jK9XXAwsYZPgWAHW2QnwSZ // Data dir for an ephemeral node + └── ... ``` diff --git a/tests/fixture/testnet/local/config.go b/tests/fixture/testnet/local/config.go index 8da3e3564ab2..19652e76a313 100644 --- a/tests/fixture/testnet/local/config.go +++ b/tests/fixture/testnet/local/config.go @@ -20,7 +20,6 @@ const ( DefaultNetworkStartTimeout = 2 * time.Minute DefaultNodeInitTimeout = 10 * time.Second DefaultNodeStopTimeout = 5 * time.Second - DefaultNodeTickerInterval = 50 * time.Millisecond ) // A set of flags appropriate for local testing. diff --git a/tests/fixture/testnet/local/network.go b/tests/fixture/testnet/local/network.go index c5d38da1584b..0d090d4ffe9f 100644 --- a/tests/fixture/testnet/local/network.go +++ b/tests/fixture/testnet/local/network.go @@ -24,17 +24,21 @@ import ( "github.com/ava-labs/avalanchego/utils/set" ) -// This interval was chosen to avoid spamming node APIs during -// startup, as smaller intervals (e.g. 50ms) seemed to noticeably -// increase the time for a network's nodes to be seen as healthy. -const networkHealthCheckInterval = 200 * time.Millisecond +const ( + // This interval was chosen to avoid spamming node APIs during + // startup, as smaller intervals (e.g. 50ms) seemed to noticeably + // increase the time for a network's nodes to be seen as healthy. + networkHealthCheckInterval = 200 * time.Millisecond + + defaultEphemeralDirName = "ephemeral" +) var ( - errInvalidNodeCount = errors.New("failed to populate local network config: non-zero node count is only valid for a network without nodes") - errInvalidKeyCount = errors.New("failed to populate local network config: non-zero key count is only valid for a network without keys") - errLocalNetworkDirNotSet = errors.New("local network directory not set - has Create() been called?") - errInvalidNetworkDir = errors.New("failed to write local network: invalid network directory") - errNotHealthyBeforeTimeout = errors.New("failed to see all nodes healthy before timeout") + errInvalidNodeCount = errors.New("failed to populate local network config: non-zero node count is only valid for a network without nodes") + errInvalidKeyCount = errors.New("failed to populate local network config: non-zero key count is only valid for a network without keys") + errLocalNetworkDirNotSet = errors.New("local network directory not set - has Create() been called?") + errInvalidNetworkDir = errors.New("failed to write local network: invalid network directory") + errMissingBootstrapNodes = errors.New("failed to add node due to missing bootstrap nodes") ) // Default root dir for storing networks and their configuration. @@ -102,6 +106,18 @@ func (ln *LocalNetwork) GetNodes() []testnet.Node { return nodes } +// Adds a backend-agnostic ephemeral node to the network +func (ln *LocalNetwork) AddEphemeralNode(w io.Writer, flags testnet.FlagsMap) (testnet.Node, error) { + if flags == nil { + flags = testnet.FlagsMap{} + } + return ln.AddLocalNode(w, &LocalNode{ + NodeConfig: testnet.NodeConfig{ + Flags: flags, + }, + }, true /* isEphemeral */) +} + // Starts a new network stored under the provided root dir. Required // configuration will be defaulted if not provided. func StartNetwork( @@ -266,7 +282,7 @@ func (ln *LocalNetwork) PopulateLocalNetworkConfig(networkID uint32, nodeCount i for _, node := range ln.Nodes { // Ensure the node is configured for use with the network and // knows where to write its configuration. - if err := ln.PopulateNodeConfig(node); err != nil { + if err := ln.PopulateNodeConfig(node, ln.Dir); err != nil { return err } } @@ -274,9 +290,10 @@ func (ln *LocalNetwork) PopulateLocalNetworkConfig(networkID uint32, nodeCount i return nil } -// Ensure the provided node has the configuration it needs to -// start. Requires that the network has valid genesis data. -func (ln *LocalNetwork) PopulateNodeConfig(node *LocalNode) error { +// Ensure the provided node has the configuration it needs to start. If the data dir is +// not set, it will be defaulted to [nodeParentDir]/[node ID]. Requires that the +// network has valid genesis data. +func (ln *LocalNetwork) PopulateNodeConfig(node *LocalNode, nodeParentDir string) error { flags := node.Flags // Set values common to all nodes @@ -298,7 +315,7 @@ func (ln *LocalNetwork) PopulateNodeConfig(node *LocalNode) error { dataDir := node.GetDataDir() if len(dataDir) == 0 { // NodeID will have been set by EnsureKeys - dataDir = filepath.Join(ln.Dir, node.NodeID.String()) + dataDir = filepath.Join(nodeParentDir, node.NodeID.String()) flags[config.DataDirKey] = dataDir } @@ -368,7 +385,7 @@ func (ln *LocalNetwork) WaitForHealthy(ctx context.Context, w io.Writer) error { } healthy, err := node.IsHealthy(ctx) - if err != nil && !errors.Is(err, errProcessNotRunning) { + if err != nil && !errors.Is(err, testnet.ErrNotRunning) { return err } if !healthy { @@ -383,7 +400,7 @@ func (ln *LocalNetwork) WaitForHealthy(ctx context.Context, w io.Writer) error { select { case <-ctx.Done(): - return errNotHealthyBeforeTimeout + return fmt.Errorf("failed to see all nodes healthy before timeout: %w", ctx.Err()) case <-ticker.C: } } @@ -632,3 +649,65 @@ func (ln *LocalNetwork) ReadAll() error { } return ln.ReadNodes() } + +func (ln *LocalNetwork) AddLocalNode(w io.Writer, node *LocalNode, isEphemeral bool) (*LocalNode, error) { + // Assume network configuration has been written to disk and is current in memory + + if node == nil { + // Set an empty data dir so that PopulateNodeConfig will know + // to set the default of `[network dir]/[node id]`. + node = NewLocalNode("") + } + + // Default to a data dir of [network-dir]/[node-ID] + nodeParentDir := ln.Dir + if isEphemeral { + // For an ephemeral node, default to a data dir of [network-dir]/[ephemeral-dir]/[node-ID] + // to provide a clear separation between nodes that are expected to expose stable API + // endpoints and those that will live for only a short time (e.g. a node started by a test + // and stopped on teardown). + // + // The data for an ephemeral node is still stored in the file tree rooted at the network + // dir to ensure that recursively archiving the network dir in CI will collect all node + // data used for a test run. + nodeParentDir = filepath.Join(ln.Dir, defaultEphemeralDirName) + } + + if err := ln.PopulateNodeConfig(node, nodeParentDir); err != nil { + return nil, err + } + + // Collect staking addresses of running nodes for use in bootstraping the new node + if err := ln.ReadNodes(); err != nil { + return nil, fmt.Errorf("failed to read local network nodes: %w", err) + } + + var ( + // Use dynamic port allocation. + httpPort uint16 = 0 + stakingPort uint16 = 0 + + bootstrapIPs = make([]string, 0, len(ln.Nodes)) + bootstrapIDs = make([]string, 0, len(ln.Nodes)) + ) + + for _, node := range ln.Nodes { + if len(node.StakingAddress) == 0 { + // Node is not running + continue + } + + bootstrapIPs = append(bootstrapIPs, node.StakingAddress) + bootstrapIDs = append(bootstrapIDs, node.NodeID.String()) + } + if len(bootstrapIDs) == 0 { + return nil, errMissingBootstrapNodes + } + + node.SetNetworkingConfigDefaults(httpPort, stakingPort, bootstrapIDs, bootstrapIPs) + + if err := node.WriteConfig(); err != nil { + return nil, err + } + return node, node.Start(w, ln.ExecPath) +} diff --git a/tests/fixture/testnet/local/node.go b/tests/fixture/testnet/local/node.go index 670f517cc383..2de516825677 100644 --- a/tests/fixture/testnet/local/node.go +++ b/tests/fixture/testnet/local/node.go @@ -27,10 +27,7 @@ import ( "github.com/ava-labs/avalanchego/utils/perms" ) -var ( - errProcessNotRunning = errors.New("process not running") - errNodeAlreadyRunning = errors.New("failed to start local node: node is already running") -) +var errNodeAlreadyRunning = errors.New("failed to start local node: node is already running") // Defines local-specific node configuration. Supports setting default // and node-specific values. @@ -186,13 +183,25 @@ func (n *LocalNode) Start(w io.Writer, defaultExecPath string) error { return err } + // Determine appropriate level of node description detail + nodeDescription := fmt.Sprintf("node %q", n.NodeID) + isEphemeralNode := filepath.Base(filepath.Dir(n.GetDataDir())) == defaultEphemeralDirName + if isEphemeralNode { + nodeDescription = "ephemeral " + nodeDescription + } + nonDefaultNodeDir := filepath.Base(n.GetDataDir()) != n.NodeID.String() + if nonDefaultNodeDir { + // Only include the data dir if its base is not the default (the node ID) + nodeDescription = fmt.Sprintf("%s with path: %s", nodeDescription, n.GetDataDir()) + } + go func() { if err := cmd.Wait(); err != nil { if err.Error() != "signal: killed" { - _, _ = fmt.Fprintf(w, "node %q finished with error: %v\n", n.NodeID, err) + _, _ = fmt.Fprintf(w, "%s finished with error: %v\n", nodeDescription, err) } } - _, _ = fmt.Fprintf(w, "node %q exited\n", n.NodeID) + _, _ = fmt.Fprintf(w, "%s exited\n", nodeDescription) }() // A node writes a process context file on start. If the file is not @@ -202,7 +211,7 @@ func (n *LocalNode) Start(w io.Writer, defaultExecPath string) error { return fmt.Errorf("failed to start local node: %w", err) } - _, err = fmt.Fprintf(w, "Started %s\n", n.NodeID) + _, err = fmt.Fprintf(w, "Started %s\n", nodeDescription) return err } @@ -256,7 +265,7 @@ func (n *LocalNode) Stop() error { } // Wait for the node process to stop - ticker := time.NewTicker(DefaultNodeTickerInterval) + ticker := time.NewTicker(testnet.DefaultNodeTickerInterval) defer ticker.Stop() ctx, cancel := context.WithTimeout(context.Background(), DefaultNodeStopTimeout) defer cancel() @@ -286,7 +295,7 @@ func (n *LocalNode) IsHealthy(ctx context.Context) (bool, error) { return false, fmt.Errorf("failed to determine process status: %w", err) } if proc == nil { - return false, errProcessNotRunning + return false, testnet.ErrNotRunning } // Check that the node is reporting healthy @@ -308,11 +317,11 @@ func (n *LocalNode) IsHealthy(ctx context.Context) (bool, error) { } } // Assume all other errors are not recoverable - return false, err + return false, fmt.Errorf("failed to query node health: %w", err) } func (n *LocalNode) WaitForProcessContext(ctx context.Context) error { - ticker := time.NewTicker(DefaultNodeTickerInterval) + ticker := time.NewTicker(testnet.DefaultNodeTickerInterval) defer ticker.Stop() ctx, cancel := context.WithTimeout(ctx, DefaultNodeInitTimeout)