Skip to content

Commit 73cbbb0

Browse files
authored
feat: add retries for s3 ObjectExists calls (#14062)
**What this PR does / why we need it**: On close examination, Loki was found to retry S3 `GetObject` calls but not calls to `ObjectExists`. For transient failures (e.g. rate limiting), it makes sense to retry the request to determine whether the object actually exists. If the object is not found (that is, S3 answered the query successfully with a 404 return code, confirming that the object does not exist), no retries are attempted, because none are needed.
1 parent 5395daf commit 73cbbb0

File tree

3 files changed

+136
-12
lines changed

3 files changed

+136
-12
lines changed

‎docs/sources/shared/configuration.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,7 +1562,7 @@ backoff_config:
15621562
# CLI flag: -s3.max-backoff
15631563
[max_period: <duration> | default = 3s]
15641564

1565-
# Maximum number of times to retry when s3 get Object
1565+
# Maximum number of times to retry for s3 GetObject or ObjectExists
15661566
# CLI flag: -s3.max-retries
15671567
[max_retries: <int> | default = 5]
15681568

@@ -5430,7 +5430,7 @@ backoff_config:
54305430
# CLI flag: -<prefix>.storage.s3.max-backoff
54315431
[max_period: <duration> | default = 3s]
54325432

5433-
# Maximum number of times to retry when s3 get Object
5433+
# Maximum number of times to retry for s3 GetObject or ObjectExists
54345434
# CLI flag: -<prefix>.storage.s3.max-retries
54355435
[max_retries: <int> | default = 5]
54365436

‎pkg/storage/chunk/client/aws/s3_storage_client.go

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ func (cfg *S3Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
124124

125125
f.DurationVar(&cfg.BackoffConfig.MinBackoff, prefix+"s3.min-backoff", 100*time.Millisecond, "Minimum backoff time when s3 get Object")
126126
f.DurationVar(&cfg.BackoffConfig.MaxBackoff, prefix+"s3.max-backoff", 3*time.Second, "Maximum backoff time when s3 get Object")
127-
f.IntVar(&cfg.BackoffConfig.MaxRetries, prefix+"s3.max-retries", 5, "Maximum number of times to retry when s3 get Object")
127+
f.IntVar(&cfg.BackoffConfig.MaxRetries, prefix+"s3.max-retries", 5, "Maximum number of times to retry for s3 GetObject or ObjectExists")
128128
}
129129

130130
// Validate config and returns error on failure
@@ -307,16 +307,34 @@ func buckets(cfg S3Config) ([]string, error) {
307307
func (a *S3ObjectClient) Stop() {}
308308

309309
func (a *S3ObjectClient) ObjectExists(ctx context.Context, objectKey string) (bool, error) {
310-
err := instrument.CollectedRequest(ctx, "S3.ObjectExists", s3RequestDuration, instrument.ErrorCode, func(ctx context.Context) error {
311-
headObjectInput := &s3.HeadObjectInput{
312-
Bucket: aws.String(a.bucketFromKey(objectKey)),
313-
Key: aws.String(objectKey),
310+
var lastErr error
311+
312+
retries := backoff.New(ctx, a.cfg.BackoffConfig)
313+
for retries.Ongoing() {
314+
if ctx.Err() != nil {
315+
return false, errors.Wrap(ctx.Err(), "ctx related error during s3 objectExists")
314316
}
315-
_, err := a.S3.HeadObject(headObjectInput)
316-
return err
317-
})
318-
if err != nil {
319-
return false, err
317+
lastErr = instrument.CollectedRequest(ctx, "S3.ObjectExists", s3RequestDuration, instrument.ErrorCode, func(ctx context.Context) error {
318+
headObjectInput := &s3.HeadObjectInput{
319+
Bucket: aws.String(a.bucketFromKey(objectKey)),
320+
Key: aws.String(objectKey),
321+
}
322+
_, requestErr := a.S3.HeadObject(headObjectInput)
323+
return requestErr
324+
})
325+
if lastErr == nil {
326+
return true, nil
327+
}
328+
329+
if a.IsObjectNotFoundErr(lastErr) {
330+
return false, lastErr
331+
}
332+
333+
retries.Wait()
334+
}
335+
336+
if lastErr != nil {
337+
return false, lastErr
320338
}
321339

322340
return true, nil

‎pkg/storage/chunk/client/aws/s3_storage_client_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,112 @@ func Test_Hedging(t *testing.T) {
196196
}
197197
}
198198

199+
type MockS3Client struct {
200+
s3.S3
201+
HeadObjectFunc func(*s3.HeadObjectInput) (*s3.HeadObjectOutput, error)
202+
}
203+
204+
func (m *MockS3Client) HeadObject(input *s3.HeadObjectInput) (*s3.HeadObjectOutput, error) {
205+
return m.HeadObjectFunc(input)
206+
}
207+
208+
func Test_RetryLogic(t *testing.T) {
209+
for _, tc := range []struct {
210+
name string
211+
maxRetries int
212+
exists bool
213+
do func(c *S3ObjectClient) error
214+
}{
215+
{
216+
"get object with retries",
217+
3,
218+
true,
219+
func(c *S3ObjectClient) error {
220+
_, _, err := c.GetObject(context.Background(), "foo")
221+
return err
222+
},
223+
},
224+
{
225+
"object exists with retries",
226+
3,
227+
true,
228+
func(c *S3ObjectClient) error {
229+
_, err := c.ObjectExists(context.Background(), "foo")
230+
return err
231+
},
232+
},
233+
{
234+
"object doesn't exist with retries",
235+
3,
236+
false,
237+
func(c *S3ObjectClient) error {
238+
_, err := c.ObjectExists(context.Background(), "foo")
239+
return err
240+
},
241+
},
242+
} {
243+
t.Run(tc.name, func(t *testing.T) {
244+
callCount := atomic.NewInt32(0)
245+
246+
mockS3 := &MockS3Client{
247+
HeadObjectFunc: func(input *s3.HeadObjectInput) (*s3.HeadObjectOutput, error) {
248+
callNum := callCount.Inc()
249+
if !tc.exists {
250+
rfIn := awserr.NewRequestFailure(
251+
awserr.New("NotFound", "Not Found", nil), 404, "abc",
252+
)
253+
return nil, rfIn
254+
}
255+
256+
// Fail the first set of calls
257+
if int(callNum) <= tc.maxRetries-1 {
258+
time.Sleep(200 * time.Millisecond) // Simulate latency
259+
return nil, errors.New("simulated error on mock call")
260+
}
261+
262+
// Succeed on the last call
263+
return &s3.HeadObjectOutput{}, nil
264+
},
265+
}
266+
267+
c, err := NewS3ObjectClient(S3Config{
268+
AccessKeyID: "foo",
269+
SecretAccessKey: flagext.SecretWithValue("bar"),
270+
BackoffConfig: backoff.Config{MaxRetries: tc.maxRetries},
271+
BucketNames: "foo",
272+
Inject: func(next http.RoundTripper) http.RoundTripper {
273+
return RoundTripperFunc(func(req *http.Request) (*http.Response, error) {
274+
// Increment the call counter
275+
callNum := callCount.Inc()
276+
277+
// Fail the first set of calls
278+
if int(callNum) <= tc.maxRetries-1 {
279+
time.Sleep(200 * time.Millisecond) // Simulate latency
280+
return nil, errors.New("simulated error on call")
281+
}
282+
283+
// Succeed on the last call
284+
return &http.Response{
285+
StatusCode: http.StatusOK,
286+
Body: io.NopCloser(bytes.NewReader([]byte("object content"))),
287+
}, nil
288+
})
289+
},
290+
}, hedging.Config{})
291+
require.NoError(t, err)
292+
c.S3 = mockS3
293+
err = tc.do(c)
294+
if tc.exists {
295+
require.NoError(t, err)
296+
require.Equal(t, tc.maxRetries, int(callCount.Load()))
297+
} else {
298+
require.True(t, c.IsObjectNotFoundErr(err))
299+
require.Equal(t, 1, int(callCount.Load()))
300+
}
301+
})
302+
}
303+
}
304+
199305
func Test_ConfigRedactsCredentials(t *testing.T) {
200306
underTest := S3Config{
201307
AccessKeyID: "access key id",

0 commit comments

Comments
 (0)