Commit 7120e63a, authored by Brad Davidson, committed by Brad Davidson

Use etcd proxy to bootstrap control-plane-only nodes, if possible

Signed-off-by: Brad Davidson <brad.davidson@rancher.com> (cherry picked from commit f9403687) Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
parent 5d5d6ee1
......@@ -37,48 +37,53 @@ func (c *Cluster) Bootstrap(ctx context.Context, clusterReset bool) error {
return errors.Wrap(err, "failed to set datastore driver")
}
// Check if we need to bootstrap, and whether or not the managed database has already
// been initialized (created or joined an existing cluster). Note that nodes without
// a local datastore always need to bootstrap and never count as initialized.
// This also sets c.clientAccessInfo if c.config.JoinURL and c.config.Token are set.
shouldBootstrap, isInitialized, err := c.shouldBootstrapLoad(ctx)
if err != nil {
return errors.Wrap(err, "failed to check if bootstrap data has been initialized")
}
c.shouldBootstrap = shouldBootstrap
if c.managedDB != nil {
if !clusterReset {
isHTTP := c.config.JoinURL != "" && c.config.Token != ""
// For secondary servers, we attempt to connect and reconcile with the datastore.
// If that fails we fallback to the local etcd cluster start
if isInitialized && isHTTP && c.clientAccessInfo != nil {
if err := c.httpBootstrap(ctx); err == nil {
logrus.Info("Successfully reconciled with datastore")
if c.config.DisableETCD {
// secondary server with etcd disabled, start the etcd proxy so that we can attempt to use it
// when reconciling.
if err := c.startEtcdProxy(ctx); err != nil {
return errors.Wrap(err, "failed to start etcd proxy")
}
} else if isInitialized && !clusterReset {
// For secondary servers with etcd, first attempt to connect and reconcile using the join URL.
// This saves on having to start up a temporary etcd just to extract bootstrap data.
if c.clientAccessInfo != nil {
if err := c.httpBootstrap(ctx); err != nil {
logrus.Warnf("Unable to reconcile with remote datastore: %v", err)
} else {
logrus.Info("Successfully reconciled with remote datastore")
return nil
}
logrus.Warnf("Unable to reconcile with datastore: %v", err)
}
// In the case of etcd, if the database has been initialized, it doesn't
// need to be bootstrapped however we still need to check the database
// and reconcile the bootstrap data. Below we're starting a temporary
// instance of etcd in the event that etcd certificates are unavailable,
// reading the data, and comparing that to the data on disk, all the while
// starting normal etcd.
if isInitialized {
if err := c.reconcileEtcd(ctx); err != nil {
logrus.Fatalf("Failed to reconcile with temporary etcd: %v", err)
}
// Not a secondary server or failed to reconcile via join URL, start up a temporary etcd
// with the local datastore and use that to reconcile.
if err := c.reconcileEtcd(ctx); err != nil {
logrus.Fatalf("Failed to reconcile with temporary etcd: %v", err)
}
}
}
if c.shouldBootstrap {
if shouldBootstrap {
return c.bootstrap(ctx)
}
return nil
}
// shouldBootstrapLoad returns true if we need to load ControlRuntimeBootstrap data again and a second boolean
// indicating that the server has or has not been initialized, if etcd. This is controlled by a stamp file on
// disk that records successful bootstrap using a hash of the join token.
// shouldBootstrapLoad returns true if we need to load ControlRuntimeBootstrap data again and a
// second boolean indicating that the server has or has not been initialized, if etcd. This is
// controlled by a stamp file on disk that records successful bootstrap using a hash of the join
// token. This function also sets up the HTTP Bootstrap request handler and sets
// c.clientAccessInfo if join url and token are set.
func (c *Cluster) shouldBootstrapLoad(ctx context.Context) (bool, bool, error) {
opts := []clientaccess.ValidationOption{
clientaccess.WithUser("server"),
......@@ -88,7 +93,6 @@ func (c *Cluster) shouldBootstrapLoad(ctx context.Context) (bool, bool, error) {
// Non-nil managedDB indicates that the database is either initialized, initializing, or joining
if c.managedDB != nil {
c.config.Runtime.HTTPBootstrap = c.serveBootstrap()
isInitialized, err := c.managedDB.IsInitialized()
if err != nil {
return false, false, err
......@@ -121,9 +125,13 @@ func (c *Cluster) shouldBootstrapLoad(ctx context.Context) (bool, bool, error) {
if err != nil {
return false, false, errors.Wrap(err, "failed to validate token")
}
logrus.Infof("Managed %s cluster not yet initialized", c.managedDB.EndpointName())
c.clientAccessInfo = info
if c.config.DisableETCD {
logrus.Infof("Managed %s disabled on this node", c.managedDB.EndpointName())
} else {
logrus.Infof("Managed %s cluster not yet initialized", c.managedDB.EndpointName())
}
}
}
......@@ -441,13 +449,22 @@ func (c *Cluster) readBootstrapFromDisk() (*bytes.Buffer, error) {
func (c *Cluster) bootstrap(ctx context.Context) error {
c.joining = true
if c.config.Runtime.HTTPBootstrap != nil {
// We can only compare config when we have a server URL that we are joining against -
// if loading directly from the datastore we do not have any way to get the config
// from another server for comparison.
if c.managedDB != nil {
// Try to compare local config against the server we're joining.
if err := c.compareConfig(); err != nil {
return errors.Wrap(err, "failed to validate server configuration")
}
// Try to bootstrap from the datastore using the local etcd proxy.
if data, err := c.getBootstrapData(ctx, c.clientAccessInfo.Password); err != nil {
logrus.Debugf("Failed to get bootstrap data from etcd proxy: %v", err)
} else {
if err := c.ReconcileBootstrapData(ctx, bytes.NewReader(data), &c.config.Runtime.ControlRuntimeBootstrap, false); err != nil {
logrus.Debugf("Failed to reconcile bootstrap data from etcd proxy: %v", err)
} else {
return nil
}
}
// fall back to bootstrapping from the join URL
return c.httpBootstrap(ctx)
}
......@@ -472,7 +489,8 @@ func (c *Cluster) compareConfig() error {
}
serverConfig, err := agentClientAccessInfo.Get("/v1-" + version.Program + "/config")
if err != nil {
return err
logrus.Warnf("Skipping cluster configuration validation: %v", err)
return nil
}
clusterControl := &config.Control{}
if err := json.Unmarshal(serverConfig, clusterControl); err != nil {
......
......@@ -25,7 +25,6 @@ type Cluster struct {
joining bool
storageStarted bool
saveBootstrap bool
shouldBootstrap bool
cnFilterFunc func(...string) []string
}
......@@ -42,48 +41,6 @@ func (c *Cluster) Start(ctx context.Context) (<-chan struct{}, error) {
if c.config.DisableETCD {
ready := make(chan struct{})
defer close(ready)
// try to get /db/info urls first, for a current list of etcd cluster member client URLs
clientURLs, _, err := etcd.ClientURLs(ctx, c.clientAccessInfo, c.config.PrivateIP)
if err != nil {
return nil, err
}
// If we somehow got no error but also no client URLs, just use the address of the server we're joining
if len(clientURLs) == 0 {
clientURL, err := url.Parse(c.config.JoinURL)
if err != nil {
return nil, err
}
clientURL.Host = clientURL.Hostname() + ":2379"
clientURLs = append(clientURLs, clientURL.String())
logrus.Warnf("Got empty etcd ClientURL list; using server URL %s", clientURL)
}
etcdProxy, err := etcd.NewETCDProxy(ctx, c.config.SupervisorPort, c.config.DataDir, clientURLs[0], utilsnet.IsIPv6CIDR(c.config.ServiceIPRanges[0]))
if err != nil {
return nil, err
}
// immediately update the load balancer with all etcd addresses
// client URLs are a full URI, but the proxy only wants host:port
for i, c := range clientURLs {
u, err := url.Parse(c)
if err != nil {
return nil, errors.Wrap(err, "failed to parse etcd ClientURL")
}
clientURLs[i] = u.Host
}
etcdProxy.Update(clientURLs)
// start periodic endpoint sync goroutine
c.setupEtcdProxy(ctx, etcdProxy)
// remove etcd member if it exists
if err := c.managedDB.RemoveSelf(ctx); err != nil {
logrus.Warnf("Failed to remove this node from etcd members")
}
c.config.Runtime.EtcdConfig.Endpoints = strings.Split(c.config.Datastore.Endpoint, ",")
c.config.Runtime.EtcdConfig.TLSConfig = c.config.Datastore.BackendTLSConfig
return ready, nil
}
......@@ -142,6 +99,49 @@ func (c *Cluster) Start(ctx context.Context) (<-chan struct{}, error) {
return ready, nil
}
// startEtcdProxy starts an etcd load-balancer proxy, for control-plane-only nodes
// without a local datastore.
//
// The proxy is created with a default server address derived from the join URL
// (same hostname, port 2379). It is then updated with the host:port of every
// etcd client URL returned by etcd.ClientURLs, when that list is available and
// non-empty. Failing to retrieve the list is not fatal: a warning is logged and
// proxy setup continues.
//
// An error is returned if the join URL cannot be parsed, if the proxy cannot be
// created, or if one of the returned client URLs cannot be parsed.
func (c *Cluster) startEtcdProxy(ctx context.Context) error {
	defaultURL, err := url.Parse(c.config.JoinURL)
	if err != nil {
		return err
	}
	// Reuse the join server's hostname, but target port 2379 instead of the join port.
	defaultURL.Host = defaultURL.Hostname() + ":2379"

	// NOTE(review): ServiceIPRanges[0] is indexed without a length check; this
	// assumes the configuration always carries at least one service range.
	etcdProxy, err := etcd.NewETCDProxy(ctx, c.config.SupervisorPort, c.config.DataDir, defaultURL.String(), utilsnet.IsIPv6CIDR(c.config.ServiceIPRanges[0]))
	if err != nil {
		return err
	}

	// immediately update the load balancer with all etcd addresses
	// from /db/info, for a current list of etcd cluster member client URLs.
	clientURLs, _, err := etcd.ClientURLs(ctx, c.clientAccessInfo, c.config.PrivateIP)
	switch {
	case err != nil:
		logrus.Warnf("Failed to get etcd ClientURLs: %v", err)
	case len(clientURLs) == 0:
		// Distinguish "no error, no URLs" from a real failure so that the log
		// does not report a nil error.
		logrus.Warnf("Got empty etcd ClientURL list; etcd proxy will use default server URL %s", defaultURL)
	default:
		// client URLs are a full URI, but the proxy only wants host:port
		addresses := make([]string, 0, len(clientURLs))
		for _, clientURL := range clientURLs {
			u, err := url.Parse(clientURL)
			if err != nil {
				return errors.Wrap(err, "failed to parse etcd ClientURL")
			}
			addresses = append(addresses, u.Host)
		}
		etcdProxy.Update(addresses)
	}

	// start periodic endpoint sync goroutine
	c.setupEtcdProxy(ctx, etcdProxy)

	// remove etcd member if it exists; this is best-effort, so a failure is
	// only logged and does not abort proxy startup.
	if err := c.managedDB.RemoveSelf(ctx); err != nil {
		logrus.Warnf("Failed to remove this node from etcd members: %v", err)
	}

	// Publish the configured datastore endpoints and TLS settings as the
	// runtime etcd client configuration.
	c.config.Runtime.EtcdConfig.Endpoints = strings.Split(c.config.Datastore.Endpoint, ",")
	c.config.Runtime.EtcdConfig.TLSConfig = c.config.Datastore.BackendTLSConfig
	return nil
}
// startStorage starts the kine listener and configures the endpoints, if necessary.
// This calls into the kine endpoint code, which sets up the database client
// and unix domain socket listener if using an external database. In the case of an etcd
......
......@@ -198,8 +198,12 @@ func (c *Cluster) storageBootstrap(ctx context.Context) error {
attempts := 0
tokenKey := storageKey(normalizedToken)
return wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
return wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) (bool, error) {
attempts++
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
value, saveBootstrap, err := getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)
c.saveBootstrap = saveBootstrap
if err != nil {
......
......@@ -649,6 +649,13 @@ func (e *ETCD) Register(handler http.Handler) (http.Handler, error) {
// is being removed from the cluster.
if !e.config.DisableAPIServer {
e.config.Runtime.LeaderElectedClusterControllerStarts[version.Program+"-etcd"] = func(ctx context.Context) {
// ensure client is started, as etcd startup may not have handled this if this is a control-plane-only node
if e.client == nil {
if err := e.startClient(ctx); err != nil {
panic(errors.Wrap(err, "failed to start etcd client"))
}
}
registerEndpointsHandlers(ctx, e)
registerMemberHandlers(ctx, e)
registerSnapshotHandlers(ctx, e)
......@@ -1648,6 +1655,12 @@ func GetAPIServerURLsFromETCD(ctx context.Context, cfg *config.Control) ([]strin
// GetMembersClientURLs will list through the member lists in etcd and return
// back a combined list of client urls for each member in the cluster
func (e *ETCD) GetMembersClientURLs(ctx context.Context) ([]string, error) {
if e.client == nil {
if err := e.startClient(ctx); err != nil {
return nil, err
}
}
members, err := e.client.MemberList(ctx)
if err != nil {
return nil, err
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment