Skip to content

Commit 7b3f829

Browse files
authored
chore(ui): Only rejoin new nodes and add back instrumentation (#16445)
1 parent 6c0f67d commit 7b3f829

File tree

12 files changed

+126
-74
lines changed

12 files changed

+126
-74
lines changed

‎docs/sources/shared/configuration.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ ui:
124124

125125
# How frequently to rejoin the cluster to address split brain issues.
126126
# CLI flag: -ui.rejoin-interval
127-
[rejoin_interval: <duration> | default = 15s]
127+
[rejoin_interval: <duration> | default = 3m]
128128

129129
# Number of initial peers to join from the discovered set.
130130
# CLI flag: -ui.cluster-max-join-peers
@@ -138,6 +138,10 @@ ui:
138138
# CLI flag: -ui.enable-ipv6
139139
[enable_ipv6: <boolean> | default = false]
140140

141+
# Enable debug logging for the UI.
142+
# CLI flag: -ui.debug
143+
[debug: <boolean> | default = false]
144+
141145
discovery:
142146
# List of peers to join the cluster. Supports multiple values separated by
143147
# commas. Each value can be a hostname, an IP address, or a DNS name (A/AAAA

‎go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ require (
5050
github.com/google/uuid v1.6.0
5151
github.com/gorilla/mux v1.8.1
5252
github.com/gorilla/websocket v1.5.3
53-
github.com/grafana/ckit v0.0.0-20250109002736-4ca45886e452
53+
github.com/grafana/ckit v0.0.0-20250226083311-4f9f4aacabb5
5454
github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2
5555
github.com/grafana/dskit v0.0.0-20241007172036-53283a0f6b41
5656
github.com/grafana/go-gelf/v2 v2.0.1
@@ -318,7 +318,7 @@ require (
318318
github.com/hashicorp/go-rootcerts v1.0.2 // indirect
319319
github.com/hashicorp/go-sockaddr v1.0.7 // indirect
320320
github.com/hashicorp/go-uuid v1.0.3 // indirect
321-
github.com/hashicorp/memberlist v0.5.2 // indirect
321+
github.com/hashicorp/memberlist v0.5.3 // indirect
322322
github.com/hashicorp/serf v0.10.1 // indirect
323323
github.com/huandu/xstrings v1.5.0 // indirect
324324
github.com/jcmturner/aescts/v2 v2.0.0 // indirect

‎go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,8 +621,8 @@ github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/z
621621
github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
622622
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
623623
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
624-
github.com/grafana/ckit v0.0.0-20250109002736-4ca45886e452 h1:d/pdVKdLSNUfHUlWsN39OqUI94XgWKOGJdi568yOXmc=
625-
github.com/grafana/ckit v0.0.0-20250109002736-4ca45886e452/go.mod h1:x6HpYv0+NXPJRBbDYA40IcxWHvrrKwgrMe1Mue172wE=
624+
github.com/grafana/ckit v0.0.0-20250226083311-4f9f4aacabb5 h1:EkW+rjr8zqiB4Jd7Gn5BmUhDz6PsZ0w33/4osKRd5x8=
625+
github.com/grafana/ckit v0.0.0-20250226083311-4f9f4aacabb5/go.mod h1:izhHi8mZ16lxMxsdlFjPHzkopbjKNdorTtitYyzAejY=
626626
github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 h1:qhugDMdQ4Vp68H0tp/0iN17DM2ehRo1rLEdOFe/gB8I=
627627
github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2/go.mod h1:w/aiO1POVIeXUQyl0VQSZjl5OAGDTL5aX+4v0RA1tcw=
628628
github.com/grafana/dskit v0.0.0-20241007172036-53283a0f6b41 h1:a4O59OU3FJZ+EJUVnlvvNTvdAc4uRN1P6EaGwqL9CnA=

‎pkg/loki/modules.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1954,7 +1954,7 @@ func (t *Loki) initDataObjExplorer() (services.Service, error) {
19541954

19551955
func (t *Loki) initUI() (services.Service, error) {
19561956
t.Cfg.UI = t.Cfg.UI.WithAdvertisePort(t.Cfg.Server.HTTPListenPort)
1957-
svc, err := ui.NewService(t.Cfg.UI, t.Server.HTTP, log.With(util_log.Logger, "component", "ui"))
1957+
svc, err := ui.NewService(t.Cfg.UI, t.Server.HTTP, log.With(util_log.Logger, "component", "ui"), prometheus.DefaultRegisterer)
19581958
if err != nil {
19591959
return nil, err
19601960
}

‎pkg/ui/config.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ type Config struct {
2121
ClusterMaxJoinPeers int `yaml:"cluster_max_join_peers"` // Number of initial peers to join from the discovered set.
2222
ClusterName string `yaml:"cluster_name"` // Name to prevent nodes without this identifier from joining the cluster.
2323
EnableIPv6 bool `yaml:"enable_ipv6"`
24+
Debug bool `yaml:"debug"`
2425
Discovery struct {
2526
JoinPeers []string `yaml:"join_peers"`
2627
} `yaml:"discovery"`
@@ -42,11 +43,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
4243
f.Var((*flagext.StringSlice)(&cfg.InfNames), "ui.interface", "Name of network interface to read address from.")
4344
f.StringVar(&cfg.NodeName, "ui.node-name", hostname, "Name to use for this node in the cluster.")
4445
f.StringVar(&cfg.AdvertiseAddr, "ui.advertise-addr", "", "IP address to advertise in the cluster.")
45-
f.DurationVar(&cfg.RejoinInterval, "ui.rejoin-interval", 15*time.Second, "How frequently to rejoin the cluster to address split brain issues.")
46+
f.DurationVar(&cfg.RejoinInterval, "ui.rejoin-interval", 3*time.Minute, "How frequently to rejoin the cluster to address split brain issues.")
4647
f.IntVar(&cfg.ClusterMaxJoinPeers, "ui.cluster-max-join-peers", 3, "Number of initial peers to join from the discovered set.")
4748
f.StringVar(&cfg.ClusterName, "ui.cluster-name", "", "Name to prevent nodes without this identifier from joining the cluster.")
4849
f.BoolVar(&cfg.EnableIPv6, "ui.enable-ipv6", false, "Enable using a IPv6 instance address.")
4950
f.Var((*flagext.StringSlice)(&cfg.Discovery.JoinPeers), "ui.discovery.join-peers", "List of peers to join the cluster. Supports multiple values separated by commas. Each value can be a hostname, an IP address, or a DNS name (A/AAAA and SRV records).")
51+
f.BoolVar(&cfg.Debug, "ui.debug", false, "Enable debug logging for the UI.")
5052
}
5153

5254
func (cfg Config) Validate() error {

‎pkg/ui/discovery.go

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,45 @@ import (
1212
)
1313

1414
func (s *Service) getBootstrapPeers() ([]string, error) {
15+
peers, err := s.discoverPeers()
16+
if err != nil {
17+
return nil, err
18+
}
19+
return selectRandomPeers(peers, s.cfg.ClusterMaxJoinPeers), nil
20+
}
21+
22+
// selectRandomPeers returns at most maxPeers entries picked at random from
// peers.
//
// A maxPeers of zero or less means "no limit": peers is returned as is. The
// same happens when peers holds fewer than maxPeers entries, because no
// subset can be taken. Otherwise peers is shuffled in place and its first
// maxPeers entries are returned, so the result shares its backing array with
// the (now reordered) input slice.
func selectRandomPeers(peers []string, maxPeers int) []string {
	// A negative limit would slip past an "== 0" check and then be used as a
	// slice bound below, which panics. Treat every non-positive value the same
	// way as zero and return the entire list.
	if maxPeers <= 0 || len(peers) < maxPeers {
		return peers
	}

	// We shuffle the list and return only a subset of the peers.
	rand.Shuffle(len(peers), func(i, j int) {
		peers[i], peers[j] = peers[j], peers[i]
	})
	return peers[:maxPeers]
}
34+
35+
func (s *Service) discoverNewPeers(prevPeers map[string]struct{}) ([]string, error) {
36+
peers, err := s.discoverPeers()
37+
if err != nil {
38+
return nil, err
39+
}
40+
41+
// Build list of new peers that weren't in previous list
42+
var newPeers []string
43+
for _, peer := range peers {
44+
if _, ok := prevPeers[peer]; !ok {
45+
newPeers = append(newPeers, peer)
46+
prevPeers[peer] = struct{}{}
47+
}
48+
}
49+
50+
return selectRandomPeers(newPeers, s.cfg.ClusterMaxJoinPeers), nil
51+
}
52+
53+
func (s *Service) discoverPeers() ([]string, error) {
1554
if len(s.cfg.Discovery.JoinPeers) == 0 {
1655
return nil, nil
1756
}
@@ -29,17 +68,7 @@ func (s *Service) getBootstrapPeers() ([]string, error) {
2968
}
3069

3170
// Return unique addresses.
32-
peers := uniq(addresses)
33-
// Here we return the entire list because we can't take a subset.
34-
if s.cfg.ClusterMaxJoinPeers == 0 || len(peers) < s.cfg.ClusterMaxJoinPeers {
35-
return peers, nil
36-
}
37-
38-
// We shuffle the list and return only a subset of the peers.
39-
rand.Shuffle(len(peers), func(i, j int) {
40-
peers[i], peers[j] = peers[j], peers[i]
41-
})
42-
return peers[:s.cfg.ClusterMaxJoinPeers], nil
71+
return uniq(addresses), nil
4372
}
4473

4574
func uniq(addresses []string) []string {

‎pkg/ui/service.go

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ import (
1818
"github.com/grafana/ckit/peer"
1919
"github.com/grafana/dskit/ring"
2020
"github.com/grafana/dskit/services"
21+
"github.com/prometheus/client_golang/prometheus"
2122
"golang.org/x/net/http2"
22-
23-
util_log "github.com/grafana/loki/v3/pkg/util/log"
2423
)
2524

2625
// This allows to rate limit the number of updates when the cluster is frequently changing (e.g. during rollout).
@@ -37,10 +36,11 @@ type Service struct {
3736

3837
cfg Config
3938
logger log.Logger
39+
reg prometheus.Registerer
4040
}
4141

42-
func NewService(cfg Config, router *mux.Router, logger log.Logger) (*Service, error) {
43-
addr, err := ring.GetInstanceAddr(cfg.AdvertiseAddr, cfg.InfNames, util_log.Logger, cfg.EnableIPv6)
42+
func NewService(cfg Config, router *mux.Router, logger log.Logger, reg prometheus.Registerer) (*Service, error) {
43+
addr, err := ring.GetInstanceAddr(cfg.AdvertiseAddr, cfg.InfNames, logger, cfg.EnableIPv6)
4444
if err != nil {
4545
return nil, err
4646
}
@@ -54,21 +54,30 @@ func NewService(cfg Config, router *mux.Router, logger log.Logger) (*Service, er
5454
},
5555
},
5656
}
57+
58+
if !cfg.Debug {
59+
logger = level.NewFilter(logger, level.AllowInfo())
60+
}
5761
advertiseAddr := fmt.Sprintf("%s:%d", cfg.AdvertiseAddr, cfg.AdvertisePort)
5862
node, err := ckit.NewNode(httpClient, ckit.Config{
59-
Name: cfg.NodeName,
60-
// TODO(cyriltovena): ckit debug logs are too verbose
61-
Log: level.NewFilter(logger, level.AllowInfo()),
63+
Name: cfg.NodeName,
64+
Log: logger,
6265
AdvertiseAddr: advertiseAddr,
6366
Label: cfg.ClusterName,
6467
})
6568
if err != nil {
6669
return nil, err
6770
}
71+
if reg != nil {
72+
if err := reg.Register(node.Metrics()); err != nil {
73+
return nil, err
74+
}
75+
}
6876

6977
svc := &Service{
7078
cfg: cfg,
7179
logger: logger,
80+
reg: reg,
7281
node: node,
7382
router: router,
7483
client: httpClient,
@@ -102,6 +111,10 @@ func (s *Service) run(ctx context.Context) error {
102111
level.Error(s.logger).Log("msg", "failed to bootstrap a fresh cluster with no peers", "err", err)
103112
}
104113
}
114+
newPeers := make(map[string]struct{})
115+
for _, p := range peers {
116+
newPeers[p] = struct{}{}
117+
}
105118

106119
var wg sync.WaitGroup
107120
if s.cfg.RejoinInterval > 0 {
@@ -116,15 +129,17 @@ func (s *Service) run(ctx context.Context) error {
116129
case <-ctx.Done():
117130
return
118131
case <-ticker.C:
119-
peers, err := s.getBootstrapPeers()
132+
peers, err := s.discoverNewPeers(newPeers)
120133
if err != nil {
121134
level.Warn(s.logger).Log("msg", "failed to get peers to join; will try again", "err", err)
122135
continue
123136
}
124-
level.Info(s.logger).Log("msg", "rejoining cluster", "peers_count", len(peers))
125-
if err := s.node.Start(peers); err != nil {
126-
level.Warn(s.logger).Log("msg", "failed to connect to peers; will try again", "err", err)
127-
continue
137+
if len(peers) > 0 {
138+
level.Info(s.logger).Log("msg", "rejoining cluster", "peers_count", len(newPeers))
139+
if err := s.node.Start(peers); err != nil {
140+
level.Warn(s.logger).Log("msg", "failed to connect to peers; will try again", "err", err)
141+
continue
142+
}
128143
}
129144
}
130145
}
@@ -142,6 +157,9 @@ func (s *Service) stop(_ error) error {
142157
if err := s.node.ChangeState(ctx, peer.StateTerminating); err != nil {
143158
level.Error(s.logger).Log("msg", "failed to change state to terminating", "err", err)
144159
}
160+
if s.reg != nil {
161+
s.reg.Unregister(s.node.Metrics())
162+
}
145163
return s.node.Stop()
146164
}
147165

‎production/helm/loki/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple,
1616
|------------|------|---------|
1717
| https://charts.min.io/ | minio(minio) | 5.4.0 |
1818
| https://grafana.github.io/helm-charts | grafana-agent-operator(grafana-agent-operator) | 0.5.1 |
19-
| https://grafana.github.io/helm-charts | rollout_operator(rollout-operator) | 0.23.0 |
19+
| https://grafana.github.io/helm-charts | rollout_operator(rollout-operator) | 0.24.0 |
2020

2121
Find more information in the Loki Helm Chart [documentation](https://grafana.com/docs/loki/next/installation/helm).
2222

‎production/helm/loki/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ loki:
286286
ui:
287287
discovery:
288288
join_peers:
289-
- '{{ include "loki.distributorFullname" . }}.{{ $.Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}'
289+
- '{{ include "loki.queryFrontendFullname" . }}.{{ $.Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}'
290290
{{- end }}
291291
{{- with .Values.loki.querier }}
292292
querier:

‎vendor/github.com/grafana/ckit/metrics.go

Lines changed: 16 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)