From 6ffea2225ddf5d47534b1b942a38b60044c042e3 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 15:28:19 +0100 Subject: [PATCH 1/7] Attempt to close failed streams If we have a failed write toward any of our connections, attempt to close the connection by returning "false" as in unsuccessful stream --- machine.go | 1 + poll.go | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/machine.go b/machine.go index 1d4939c1..2d4bb518 100644 --- a/machine.go +++ b/machine.go @@ -309,6 +309,7 @@ func (h *Headscale) notifyChangesToPeers(m *Machine) { Str("machine", m.Name). Str("peer", p.Name). Msgf("Peer %s does not appear to be polling", p.Name) + return } log.Trace(). Str("func", "notifyChangesToPeers"). diff --git a/poll.go b/poll.go index 60bfa9ea..c5da3a96 100644 --- a/poll.go +++ b/poll.go @@ -230,6 +230,7 @@ func (h *Headscale) PollNetMapStream( Str("channel", "pollData"). Err(err). Msg("Cannot write data") + return false } log.Trace(). Str("handler", "PollNetMapStream"). @@ -237,7 +238,7 @@ func (h *Headscale) PollNetMapStream( Str("channel", "pollData"). Int("bytes", len(data)). Msg("Data from pollData channel written successfully") - // TODO: Abstract away all the database calls, this can cause race conditions + // TODO(kradalby): Abstract away all the database calls, this can cause race conditions // when an outdated machine object is kept alive, e.g. db is update from // command line, but then overwritten. err = h.UpdateMachine(&m) @@ -276,6 +277,7 @@ func (h *Headscale) PollNetMapStream( Str("channel", "keepAlive"). Err(err). Msg("Cannot write keep alive message") + return false } log.Trace(). Str("handler", "PollNetMapStream"). @@ -283,7 +285,7 @@ func (h *Headscale) PollNetMapStream( Str("channel", "keepAlive"). Int("bytes", len(data)). Msg("Keep alive sent successfully") - // TODO: Abstract away all the database calls, this can cause race conditions + // TODO(kradalby): Abstract away all the database calls, this can cause race conditions // when an outdated machine object is kept alive, e.g. db is update from // command line, but then overwritten. err = h.UpdateMachine(&m) @@ -336,6 +338,7 @@ func (h *Headscale) PollNetMapStream( Str("channel", "update"). Err(err). Msg("Could not write the map response") + return false } log.Trace(). Str("handler", "PollNetMapStream"). @@ -347,7 +350,7 @@ func (h *Headscale) PollNetMapStream( // we sometimes end in a state were the update // is not picked up by a client and we use this // to determine if we should "force" an update. - // TODO: Abstract away all the database calls, this can cause race conditions + // TODO(kradalby): Abstract away all the database calls, this can cause race conditions // when an outdated machine object is kept alive, e.g. db is update from // command line, but then overwritten. err = h.UpdateMachine(&m) From ed728f57e06568eb4b618610fd353b4b9fe02a36 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 15:29:27 +0100 Subject: [PATCH 2/7] Remove WriteTimeout from HTTP Golangs built in HTTP server does not allow different HTTP timeout for different types of handlers, so we cannot have a write timeout as we attempt to do long polling (my bad). See linked article. Also removed redundant server declaration --- app.go | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/app.go b/app.go index dc398eb7..8be137cf 100644 --- a/app.go +++ b/app.go @@ -172,16 +172,18 @@ func (h *Headscale) Serve() error { r.GET("/apple/:platform", h.ApplePlatformConfig) var err error - timeout := 30 * time.Second - go h.watchForKVUpdates(5000) go h.expireEphemeralNodes(5000) s := &http.Server{ - Addr: h.cfg.Addr, - Handler: r, - ReadTimeout: timeout, - WriteTimeout: timeout, + Addr: h.cfg.Addr, + Handler: r, + ReadTimeout: 30 * time.Second, + // Go does not handle timeouts in HTTP very well, and there is + // no good way to handle streaming timeouts, therefore we need to + // keep this at unlimited and be careful to clean up connections + // https://blog.cloudflare.com/the-complete-guide-to-golang-net-http-timeouts/#aboutstreaming + WriteTimeout: 0, } if h.cfg.TLSLetsEncryptHostname != "" { @@ -194,13 +196,9 @@ func (h *Headscale) Serve() error { HostPolicy: autocert.HostWhitelist(h.cfg.TLSLetsEncryptHostname), Cache: autocert.DirCache(h.cfg.TLSLetsEncryptCacheDir), } - s := &http.Server{ - Addr: h.cfg.Addr, - TLSConfig: m.TLSConfig(), - Handler: r, - ReadTimeout: timeout, - WriteTimeout: timeout, - } + + s.TLSConfig = m.TLSConfig() + if h.cfg.TLSLetsEncryptChallengeType == "TLS-ALPN-01" { // Configuration via autocert with TLS-ALPN-01 (https://tools.ietf.org/html/rfc8737) // The RFC requires that the validation is done on port 443; in other words, headscale @@ -211,7 +209,6 @@ func (h *Headscale) Serve() error { // port 80 for the certificate validation in addition to the headscale // service, which can be configured to run on any other port. go func() { - log.Fatal(). Err(http.ListenAndServe(h.cfg.TLSLetsEncryptListen, m.HTTPHandler(http.HandlerFunc(h.redirect)))). Msg("failed to set up a HTTP server") From cefe2d5bccda4ad0a6d36f70c3466bff79bcacfa Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 15:30:41 +0100 Subject: [PATCH 3/7] Improve and clarify log entry --- poll.go | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/poll.go b/poll.go index c5da3a96..33b4b124 100644 --- a/poll.go +++ b/poll.go @@ -259,7 +259,7 @@ func (h *Headscale) PollNetMapStream( Str("machine", m.Name). Str("channel", "pollData"). Int("bytes", len(data)). - Msg("Machine updated successfully after sending pollData") + Msg("Machine entry in database updated successfully after sending pollData") return true case data := <-keepAliveChan: @@ -396,13 +396,33 @@ func (h *Headscale) PollNetMapStream( m.LastSeen = &now h.db.Save(&m) + log.Trace(). + Str("handler", "PollNetMapStream"). + Str("machine", m.Name). + Str("channel", "Done"). + Msg("Canceling keepAlive channel") cancelKeepAlive <- struct{}{} + log.Trace(). + Str("handler", "PollNetMapStream"). + Str("machine", m.Name). + Str("channel", "Done"). + Msg("Closing update channel") h.closeUpdateChannel(&m) close(pollDataChan) + log.Trace(). + Str("handler", "PollNetMapStream"). + Str("machine", m.Name). + Str("channel", "Done"). + Msg("Closing pollData channel") close(keepAliveChan) + log.Trace(). + Str("handler", "PollNetMapStream"). + Str("machine", m.Name). + Str("channel", "Done"). + Msg("Closing keepAliveChan channel") return false } From 39abc4e97317b64597147200ed313216c86e24d5 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 15:38:53 +0100 Subject: [PATCH 4/7] Clarify error messages for nodes that are not connected If a node does not have an update channel, it is probably not connected, clarify the log messages and make sure we dont print that it was updated successfully (continue, not return) --- machine.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/machine.go b/machine.go index 2d4bb518..1cd2d95f 100644 --- a/machine.go +++ b/machine.go @@ -308,8 +308,8 @@ func (h *Headscale) notifyChangesToPeers(m *Machine) { Str("func", "notifyChangesToPeers"). Str("machine", m.Name). Str("peer", p.Name). - Msgf("Peer %s does not appear to be polling", p.Name) - return + Msgf("Peer %s does not have an open update client, skipping.", p.Name) + continue } log.Trace(). Str("func", "notifyChangesToPeers"). @@ -380,11 +380,12 @@ func (h *Headscale) sendRequestOnUpdateChannel(m *tailcfg.Node) error { Msgf("Notified machine %s", m.Name) } } else { + err := errors.New("machine does not have an open update channel") log.Info(). Str("func", "requestUpdate"). Str("machine", m.Name). - Msgf("Machine %s does not appear to be polling", m.Name) - return errors.New("machine does not seem to be polling") + Msgf("Machine %s does not have an open update channel", m.Name) + return err } return nil } From 54daa0da23e44a5f9836485ae9e0e9582c7347a8 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 17:35:39 +0100 Subject: [PATCH 5/7] Fix spelling error --- poll.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poll.go b/poll.go index 33b4b124..73726610 100644 --- a/poll.go +++ b/poll.go @@ -400,7 +400,7 @@ func (h *Headscale) PollNetMapStream( Str("handler", "PollNetMapStream"). Str("machine", m.Name). Str("channel", "Done"). - Msg("Canceling keepAlive channel") + Msg("Cancelling keepAlive channel") cancelKeepAlive <- struct{}{} log.Trace(). From 78a0f3ca37e140f227cfd9c28331f32dff945fe4 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 18:39:09 +0100 Subject: [PATCH 6/7] Up ping timeout --- integration_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_test.go b/integration_test.go index f62ca1da..925fc4ca 100644 --- a/integration_test.go +++ b/integration_test.go @@ -433,7 +433,7 @@ func (s *IntegrationTestSuite) TestPingAllPeers() { command := []string{ "tailscale", "ping", "--timeout=1s", - "--c=20", + "--c=100", "--until-direct=true", ip.String(), } From d637a9c3024160b81b25047b92d7c2757ba3c1bf Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sat, 2 Oct 2021 22:56:48 +0100 Subject: [PATCH 7/7] Change ping count --- integration_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_test.go b/integration_test.go index 925fc4ca..b768fa59 100644 --- a/integration_test.go +++ b/integration_test.go @@ -433,7 +433,7 @@ func (s *IntegrationTestSuite) TestPingAllPeers() { command := []string{ "tailscale", "ping", "--timeout=1s", - "--c=100", + "--c=10", "--until-direct=true", ip.String(), }