@@ -3,7 +3,9 @@ package builder
3
3
import (
4
4
"context"
5
5
"fmt"
6
+ "io"
6
7
"os"
8
+ "strings"
7
9
"sync"
8
10
"time"
9
11
@@ -136,16 +138,39 @@ func (b *Builder) running(ctx context.Context) error {
136
138
retries := backoff .New (ctx , b .cfg .BackoffConfig )
137
139
for retries .Ongoing () {
138
140
err := b .connectAndBuild (ctx )
139
- if err == nil || errors .Is (err , context .Canceled ) {
140
- break
141
+ if err != nil {
142
+ err := standardizeRPCError (err )
143
+
144
+ // When the builder is shutting down, we will get a context canceled error
145
+ if errors .Is (err , context .Canceled ) && b .State () != services .Running {
146
+ level .Debug (b .logger ).Log ("msg" , "builder is shutting down" )
147
+ break
148
+ }
149
+
150
+ // If the planner disconnects while we are sending/receive a message we get an EOF error.
151
+ // In this case we should reset the backoff and retry
152
+ if errors .Is (err , io .EOF ) {
153
+ level .Error (b .logger ).Log ("msg" , "planner disconnected. Resetting backoff and retrying" , "err" , err )
154
+ retries .Reset ()
155
+ continue
156
+ }
157
+
158
+ // Otherwise (e.g. failed to connect to the builder), we should retry
159
+ code := status .Code (err )
160
+ level .Error (b .logger ).Log ("msg" , "failed to connect and build. Retrying" , "retry" , retries .NumRetries (), "maxRetries" , b .cfg .BackoffConfig .MaxRetries , "code" , code .String (), "err" , err )
161
+ retries .Wait ()
162
+ continue
141
163
}
142
164
143
- level .Error (b .logger ).Log ("msg" , "failed to connect and build. Retrying" , "err" , err )
144
- retries .Wait ()
165
+ // We shouldn't get here. If we do, we better restart the builder.
166
+ // Adding a log line for visibility
167
+ level .Error (b .logger ).Log ("msg" , "unexpected end of connectAndBuild. Restarting builder" )
168
+ break
145
169
}
146
170
147
171
if err := retries .Err (); err != nil {
148
172
if errors .Is (err , context .Canceled ) {
173
+ // Edge case when the builder is shutting down while we check for retries
149
174
return nil
150
175
}
151
176
return fmt .Errorf ("failed to connect and build: %w" , err )
@@ -154,6 +179,31 @@ func (b *Builder) running(ctx context.Context) error {
154
179
return nil
155
180
}
156
181
182
+ // standardizeRPCError converts some gRPC errors we want to handle differently to standard errors.
183
+ // 1. codes.Canceled -> context.Canceled
184
+ // 2. codes.Unavailable with EOF -> io.EOF
185
+ // 3. All other errors are returned as is.
186
+ func standardizeRPCError (err error ) error {
187
+ if err == nil {
188
+ return nil
189
+ }
190
+
191
+ if st , ok := status .FromError (err ); ok {
192
+ switch st .Code () {
193
+ case codes .Canceled :
194
+ // Happens when the builder is shutting down, and we are sending/receiving a message
195
+ return context .Canceled
196
+ case codes .Unavailable :
197
+ // We want to handle this case as a retryable error that resets the backoff
198
+ if i := strings .LastIndex (st .Message (), "EOF" ); i != - 1 {
199
+ return io .EOF
200
+ }
201
+ }
202
+ }
203
+
204
+ return err
205
+ }
206
+
157
207
func (b * Builder ) plannerAddress () string {
158
208
if b .ringWatcher == nil {
159
209
return b .cfg .PlannerAddress
@@ -173,8 +223,7 @@ func (b *Builder) connectAndBuild(ctx context.Context) error {
173
223
return fmt .Errorf ("failed to create grpc dial options: %w" , err )
174
224
}
175
225
176
- // nolint:staticcheck // grpc.DialContext() has been deprecated; we'll address it before upgrading to gRPC 2.
177
- conn , err := grpc .DialContext (ctx , b .plannerAddress (), opts ... )
226
+ conn , err := grpc .NewClient (b .plannerAddress (), opts ... )
178
227
if err != nil {
179
228
return fmt .Errorf ("failed to dial bloom planner: %w" , err )
180
229
}
@@ -202,21 +251,17 @@ func (b *Builder) connectAndBuild(ctx context.Context) error {
202
251
}
203
252
204
253
func (b * Builder ) builderLoop (c protos.PlannerForBuilder_BuilderLoopClient ) error {
254
+ ctx := c .Context ()
255
+
205
256
// Send ready message to planner
206
257
if err := c .Send (& protos.BuilderToPlanner {BuilderID : b .ID }); err != nil {
207
258
return fmt .Errorf ("failed to send ready message to planner: %w" , err )
208
259
}
209
260
210
- for b .State () == services .Running {
211
- // When the planner connection closes, an EOF or "planner shutting down" error is returned.
212
- // When the builder is shutting down, a gRPC context canceled error is returned.
261
+ // Will break when planner<->builder connection is closed or when the builder is shutting down.
262
+ for ctx .Err () == nil {
213
263
protoTask , err := c .Recv ()
214
264
if err != nil {
215
- if status .Code (err ) == codes .Canceled {
216
- level .Debug (b .logger ).Log ("msg" , "builder loop context canceled" )
217
- return nil
218
- }
219
-
220
265
return fmt .Errorf ("failed to receive task from planner: %w" , err )
221
266
}
222
267
@@ -235,7 +280,7 @@ func (b *Builder) builderLoop(c protos.PlannerForBuilder_BuilderLoopClient) erro
235
280
continue
236
281
}
237
282
238
- newMetas , err := b .processTask (c . Context () , task )
283
+ newMetas , err := b .processTask (ctx , task )
239
284
if err != nil {
240
285
err = fmt .Errorf ("failed to process task: %w" , err )
241
286
}
@@ -249,8 +294,8 @@ func (b *Builder) builderLoop(c protos.PlannerForBuilder_BuilderLoopClient) erro
249
294
b .metrics .processingTask .Set (0 )
250
295
}
251
296
252
- level .Debug (b .logger ).Log ("msg" , "builder loop stopped" )
253
- return nil
297
+ level .Debug (b .logger ).Log ("msg" , "builder loop stopped" , "ctx_err" , ctx . Err () )
298
+ return ctx . Err ()
254
299
}
255
300
256
301
func (b * Builder ) logTaskCompleted (
0 commit comments