Commit 3e1f2fc

caching: do not try to fill the gap in log results cache when the new query interval does not overlap the cached query interval (#9757)
**What this PR does / why we need it**:

Currently, when we find a relevant cached negative response for a logs query, we do the following:

* If the cached query completely covers the new query:
  * return an empty response.
* else:
  * fill the gaps on either/both sides of the cached query.

The problem with filling the gaps is that when the cached query does not overlap the new query at all, we have to extend the query beyond what was requested. However, logs queries have a limit on the number of lines we can send back in the response, so the extended query could return logs that were not requested, which then get filtered out by the [response extractor](https://github.com/grafana/loki/blob/b78d3f05525d8bcab13e621bc2e5851aadc8fc91/pkg/querier/queryrange/log_result_cache.go#L299), unexpectedly resulting in an empty response.

For example, suppose the query was cached for start=15, end=20 and we get a `backwards` query for start=5, end=10. To fill the gap, the query would be executed for start=5, end=15. Now, if the range 10-15 holds more logs than the query `limit`, the response extractor would filter out all the data and send back an empty response to the user.

This PR fixes the issue by making the following changes when handling a cache hit:

* If the cached query completely covers the new query:
  * return an empty response [_existing_].
* else if the cached query does not overlap with the new query:
  * run the new query exactly as requested.
  * If the new query results in an empty response and covers a longer interval than the cached query:
    * update the cache.
* else:
  * query the data for the missing intervals on both/either side [_existing_].
  * update the cache with the extended intervals if the new queries resulted in an empty response [_existing_].

**Special notes for your reviewer**:

We could further improve the handling of queries that do not overlap the cached query by selectively extending them based on the query direction and whether the cached query lies before or after the new query. For example, if the new query is a `backwards` query and `cachedQuery.End` < `newQuery.Start`, it should be okay to extend the query to run from `cachedQuery.End` to `newQuery.End` to fill the cache, since the query would fill the most relevant data first before hitting the limits. I did not want to complicate the fix, so I went without implementing this approach. We can revisit it later if we feel we need to improve our caching.

**Checklist**
- [x] Tests updated
- [x] `CHANGELOG.md` updated

---------

Co-authored-by: Travis Patterson <travis.patterson@grafana.com>
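The fix hinges on a plain interval-overlap check (the `overlap` helper added to `log_result_cache.go` in this PR). Below is a minimal, self-contained sketch of that check using the start=5/end=10 vs. start=15/end=20 example above; the `main` harness and Unix-second timestamps are illustrative only, not part of the PR:

```go
package main

import (
	"fmt"
	"time"
)

// overlap reports whether the intervals [aFrom, aThrough] and [bFrom, bThrough]
// share any portion of time; this mirrors the helper added in this PR.
func overlap(aFrom, aThrough, bFrom, bThrough time.Time) bool {
	if aFrom.After(bThrough) || bFrom.After(aThrough) {
		return false
	}
	return true
}

func main() {
	// Cached query covers [15, 20]; the new backwards query asks for [5, 10]
	// (illustrative Unix-second timestamps).
	cachedStart, cachedEnd := time.Unix(15, 0), time.Unix(20, 0)
	newStart, newEnd := time.Unix(5, 0), time.Unix(10, 0)

	// No overlap, so the middleware now runs the query exactly as requested
	// instead of extending it to [5, 15] to fill the gap in the cache.
	fmt.Println(overlap(newStart, newEnd, cachedStart, cachedEnd)) // false
}
```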
1 parent 0da4625 commit 3e1f2fc

File tree

4 files changed, +182 -90 lines changed


‎CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -56,6 +56,7 @@
 * [9495](https://github.com/grafana/loki/pull/9495) **thampiotr**: Promtail: Fix potential goroutine leak in file tailer.
 * [9650](https://github.com/grafana/loki/pull/9650) **ashwanthgoli**: Config: ensure storage config defaults apply to named stores.
 * [9629](https://github.com/grafana/loki/pull/9629) **periklis**: Fix duplicate label values from ingester streams.
+* [9757](https://github.com/grafana/loki/pull/9757) **sandeepsukhani**: Frontend Caching: Fix a bug in negative logs results cache causing Loki to unexpectedly send empty/incorrect results.
 * [9754](https://github.com/grafana/loki/pull/9754) **ashwanthgoli**: Fixes an issue with indexes becoming unqueriable if the index prefix is different from the one configured in the latest period config.
 * [9763](https://github.com/grafana/loki/pull/9763) **ssncferreira**: Fix the logic of the `offset` operator for downstream queries on instant query splitting of (range) vector aggregation expressions containing an offset.
 * [9773](https://github.com/grafana/loki/pull/9773) **ssncferreira**: Fix instant query summary statistic's `splits` corresponding to the number of subqueries a query is split into based on `split_queries_by_interval`.
```

‎pkg/querier/queryrange/log_result_cache.go

Lines changed: 90 additions & 64 deletions

```diff
@@ -187,77 +187,95 @@ func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedR
 	if cachedRequest.StartTs.UnixNano() <= lokiReq.StartTs.UnixNano() && cachedRequest.EndTs.UnixNano() >= lokiReq.EndTs.UnixNano() {
 		return result, nil
 	}
-	// we could be missing data at the start and the end.
-	// so we're going to fetch what is missing.
-	var (
-		startRequest, endRequest *LokiRequest
-		startResp, endResp       *LokiResponse
-		updateCache              bool
-		ok                       bool
-	)
-	g, ctx := errgroup.WithContext(ctx)
-
-	// if we're missing data at the start, start fetching from the start to the cached start.
-	if lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) {
-		g.Go(func() error {
-			startRequest = lokiReq.WithStartEndTime(lokiReq.GetStartTs(), cachedRequest.GetStartTs())
-			resp, err := l.next.Do(ctx, startRequest)
-			if err != nil {
-				return err
-			}
-			startResp, ok = resp.(*LokiResponse)
-			if !ok {
-				return fmt.Errorf("unexpected response type %T", resp)
-			}
-			return nil
-		})
-	}
-
-	// if we're missing data at the end, start fetching from the cached end to the end.
-	if lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) {
-		g.Go(func() error {
-			endRequest = lokiReq.WithStartEndTime(cachedRequest.GetEndTs(), lokiReq.GetEndTs())
-			resp, err := l.next.Do(ctx, endRequest)
-			if err != nil {
-				return err
-			}
-			endResp, ok = resp.(*LokiResponse)
-			if !ok {
-				return fmt.Errorf("unexpected response type %T", resp)
-			}
-			return nil
-		})
-	}
 
-	if err := g.Wait(); err != nil {
-		return nil, err
-	}
+	updateCache := false
+	// if the query does not overlap cached interval, do not try to fill the gap since it requires extending the queries beyond what is requested in the query.
+	// Extending the queries beyond what is requested could result in empty responses due to response limit set in the queries.
+	if !overlap(lokiReq.StartTs, lokiReq.EndTs, cachedRequest.StartTs, cachedRequest.EndTs) {
+		resp, err := l.next.Do(ctx, lokiReq)
+		if err != nil {
+			return nil, err
+		}
+		result = resp.(*LokiResponse)
 
-	// if we have data at the start, we need to merge it with the cached data if it's empty and update the cache.
-	// If it's not empty only merge the response.
-	if startResp != nil {
-		if isEmpty(startResp) {
-			cachedRequest = cachedRequest.WithStartEndTime(startRequest.GetStartTs(), cachedRequest.GetEndTs())
+		// if the response is empty and the query is larger than what is cached, update the cache
+		if isEmpty(result) && (lokiReq.EndTs.UnixNano()-lokiReq.StartTs.UnixNano() > cachedRequest.EndTs.UnixNano()-cachedRequest.StartTs.UnixNano()) {
+			cachedRequest = cachedRequest.WithStartEndTime(lokiReq.GetStartTs(), lokiReq.GetEndTs())
 			updateCache = true
-		} else {
-			if startResp.Status != loghttp.QueryStatusSuccess {
-				return startResp, nil
+		}
+	} else {
+		// we could be missing data at the start and the end.
+		// so we're going to fetch what is missing.
+		var (
+			startRequest, endRequest *LokiRequest
+			startResp, endResp       *LokiResponse
+		)
+		g, ctx := errgroup.WithContext(ctx)
+
+		// if we're missing data at the start, start fetching from the start to the cached start.
+		if lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) {
+			g.Go(func() error {
+				startRequest = lokiReq.WithStartEndTime(lokiReq.GetStartTs(), cachedRequest.GetStartTs())
+				resp, err := l.next.Do(ctx, startRequest)
+				if err != nil {
+					return err
+				}
+				var ok bool
+				startResp, ok = resp.(*LokiResponse)
+				if !ok {
+					return fmt.Errorf("unexpected response type %T", resp)
+				}
+				return nil
+			})
+		}
+
+		// if we're missing data at the end, start fetching from the cached end to the end.
+		if lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) {
+			g.Go(func() error {
+				endRequest = lokiReq.WithStartEndTime(cachedRequest.GetEndTs(), lokiReq.GetEndTs())
+				resp, err := l.next.Do(ctx, endRequest)
+				if err != nil {
+					return err
+				}
+				var ok bool
+				endResp, ok = resp.(*LokiResponse)
+				if !ok {
+					return fmt.Errorf("unexpected response type %T", resp)
+				}
+				return nil
+			})
+		}
+
+		if err := g.Wait(); err != nil {
+			return nil, err
+		}
+
+		// if we have data at the start, we need to merge it with the cached data if it's empty and update the cache.
+		// If it's not empty only merge the response.
+		if startResp != nil {
+			if isEmpty(startResp) {
+				cachedRequest = cachedRequest.WithStartEndTime(startRequest.GetStartTs(), cachedRequest.GetEndTs())
+				updateCache = true
+			} else {
+				if startResp.Status != loghttp.QueryStatusSuccess {
+					return startResp, nil
+				}
+				result = mergeLokiResponse(startResp, result)
 			}
-			result = mergeLokiResponse(extractLokiResponse(lokiReq.GetStartTs(), lokiReq.GetEndTs(), startResp), result)
 		}
-	}
 
-	// if we have data at the end, we need to merge it with the cached data if it's empty and update the cache.
-	// If it's not empty only merge the response.
-	if endResp != nil {
-		if isEmpty(endResp) {
-			cachedRequest = cachedRequest.WithStartEndTime(cachedRequest.GetStartTs(), endRequest.GetEndTs())
-			updateCache = true
-		} else {
-			if endResp.Status != loghttp.QueryStatusSuccess {
-				return endResp, nil
+		// if we have data at the end, we need to merge it with the cached data if it's empty and update the cache.
+		// If it's not empty only merge the response.
+		if endResp != nil {
+			if isEmpty(endResp) {
+				cachedRequest = cachedRequest.WithStartEndTime(cachedRequest.GetStartTs(), endRequest.GetEndTs())
+				updateCache = true
+			} else {
+				if endResp.Status != loghttp.QueryStatusSuccess {
+					return endResp, nil
+				}
+				result = mergeLokiResponse(endResp, result)
 			}
-			result = mergeLokiResponse(extractLokiResponse(lokiReq.GetStartTs(), lokiReq.GetEndTs(), endResp), result)
 		}
 	}
 
@@ -333,3 +351,11 @@ func emptyResponse(lokiReq *LokiRequest) *LokiResponse {
 		},
 	}
 }
+
+func overlap(aFrom, aThrough, bFrom, bThrough time.Time) bool {
+	if aFrom.After(bThrough) || bFrom.After(aThrough) {
+		return false
+	}
+
+	return true
+}
```
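For readers skimming the diff: in the new non-overlap branch, the cached negative entry is only refreshed when the new empty result spans a longer interval than the one already cached. A small standalone sketch of that rule (the `shouldUpdateCache` helper, `main` harness, and timestamps are illustrative, not code from this PR; the values mirror the req1/req3 case in the updated test):

```go
package main

import (
	"fmt"
	"time"
)

// shouldUpdateCache mirrors the condition used in the non-overlap branch:
// replace the cached negative entry only when the new query came back empty
// and covers a longer interval than the cached one.
func shouldUpdateCache(empty bool, newStart, newEnd, cachedStart, cachedEnd time.Time) bool {
	return empty && newEnd.Sub(newStart) > cachedEnd.Sub(cachedStart)
}

func main() {
	cachedStart, cachedEnd := time.Unix(30, 0), time.Unix(31, 0) // 1s cached negative entry
	newStart, newEnd := time.Unix(24, 0), time.Unix(29, 0)       // 5s non-overlapping query

	// Empty response over a wider interval replaces the cached entry.
	fmt.Println(shouldUpdateCache(true, newStart, newEnd, cachedStart, cachedEnd)) // true
	// Non-empty responses never update the negative cache.
	fmt.Println(shouldUpdateCache(false, newStart, newEnd, cachedStart, cachedEnd)) // false
}
```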

‎pkg/querier/queryrange/log_result_cache_test.go

Lines changed: 79 additions & 25 deletions

```diff
@@ -7,6 +7,8 @@ import (
 	"time"
 
 	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 	"github.com/weaveworks/common/user"
@@ -434,38 +436,54 @@ func Test_LogResultCacheDifferentRangeNonEmptyAndEmpty(t *testing.T) {
 	fake.AssertExpectations(t)
 }
 
-func Test_LogResultFillingGap(t *testing.T) {
+// Test_LogResultNonOverlappingCache tests the scenario where the cached query does not overlap with the new request
+func Test_LogResultNonOverlappingCache(t *testing.T) {
+	metrics := NewLogResultCacheMetrics(prometheus.NewPedanticRegistry())
+	mockCache := cache.NewMockCache()
 	var (
 		ctx = user.InjectOrgID(context.Background(), "foo")
 		lrc = NewLogResultCache(
 			log.NewNopLogger(),
 			fakeLimits{
 				splits: map[string]time.Duration{"foo": time.Minute},
 			},
-			cache.NewMockCache(),
-			nil,
+			mockCache,
 			nil,
 			nil,
+			metrics,
 		)
 	)
 
+	checkCacheMetrics := func(expectedHits, expectedMisses int) {
+		require.Equal(t, float64(expectedHits), testutil.ToFloat64(metrics.CacheHit))
+		require.Equal(t, float64(expectedMisses), testutil.ToFloat64(metrics.CacheMiss))
+	}
+
 	// data requested for just 1 sec, resulting in empty response
 	req1 := &LokiRequest{
 		StartTs: time.Unix(0, time.Minute.Nanoseconds()+30*time.Second.Nanoseconds()),
 		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+31*time.Second.Nanoseconds()),
 		Limit:   entriesLimit,
 	}
 
-	// data requested for just 1 sec, within the same split but couple seconds apart
+	// data requested for just 1 sec(non-overlapping), resulting in empty response
 	req2 := &LokiRequest{
-		StartTs: time.Unix(0, time.Minute.Nanoseconds()+35*time.Second.Nanoseconds()),
-		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+36*time.Second.Nanoseconds()),
+		StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
 		Limit:   entriesLimit,
 	}
 
+	// data requested for larger interval than req1(overlapping with req2), returns empty response
 	req3 := &LokiRequest{
-		StartTs: time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
-		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+26*time.Second.Nanoseconds()),
+		StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+29*time.Second.Nanoseconds()),
+		Limit:   entriesLimit,
+	}
+
+	// data requested for larger interval than req3(non-overlapping), returns non-empty response
+	req4 := &LokiRequest{
+		StartTs: time.Unix(0, time.Minute.Nanoseconds()+10*time.Second.Nanoseconds()),
+		EndTs:   time.Unix(0, time.Minute.Nanoseconds()+20*time.Second.Nanoseconds()),
 		Limit:   entriesLimit,
 	}
 
@@ -476,34 +494,49 @@ func Test_LogResultFillingGap(t *testing.T) {
 				Response: emptyResponse(req1),
 			},
 		},
-		// partial request being made for missing interval at the end
+		// req2 should do query for just its query range and should not update the cache
 		{
 			RequestResponse: queryrangebase.RequestResponse{
 				Request: &LokiRequest{
-					StartTs: time.Unix(0, time.Minute.Nanoseconds()+31*time.Second.Nanoseconds()),
-					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+36*time.Second.Nanoseconds()),
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
 					Limit:   entriesLimit,
 				},
-				Response: nonEmptyResponse(&LokiRequest{
-					StartTs: time.Unix(0, time.Minute.Nanoseconds()+31*time.Second.Nanoseconds()),
-					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+36*time.Second.Nanoseconds()),
+				Response: emptyResponse(&LokiRequest{
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
 					Limit:   entriesLimit,
-				}, time.Unix(31, 0), time.Unix(34, 0), lblFooBar), // data not present for actual query interval i.e req2
+				}),
 			},
 		},
-		// partial request being made for missing interval at the beginning
+		// req3 should do query for just its query range and should update the cache
 		{
 			RequestResponse: queryrangebase.RequestResponse{
 				Request: &LokiRequest{
-					StartTs: time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
-					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+30*time.Second.Nanoseconds()),
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+29*time.Second.Nanoseconds()),
+					Limit:   entriesLimit,
+				},
+				Response: emptyResponse(&LokiRequest{
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+24*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+29*time.Second.Nanoseconds()),
+					Limit:   entriesLimit,
+				}),
+			},
+		},
+		// req4 should do query for its query range. Data would be non-empty so cache should not be updated
+		{
+			RequestResponse: queryrangebase.RequestResponse{
+				Request: &LokiRequest{
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+10*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+20*time.Second.Nanoseconds()),
 					Limit:   entriesLimit,
 				},
 				Response: nonEmptyResponse(&LokiRequest{
-					StartTs: time.Unix(0, time.Minute.Nanoseconds()+25*time.Second.Nanoseconds()),
-					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+30*time.Second.Nanoseconds()),
+					StartTs: time.Unix(0, time.Minute.Nanoseconds()+10*time.Second.Nanoseconds()),
+					EndTs:   time.Unix(0, time.Minute.Nanoseconds()+20*time.Second.Nanoseconds()),
 					Limit:   entriesLimit,
-				}, time.Unix(27, 0), time.Unix(29, 0), lblFooBar), // data not present for actual query interval i.e req3
+				}, time.Unix(71, 0), time.Unix(79, 0), lblFooBar),
 			},
 		},
 	})
@@ -513,16 +546,37 @@ func Test_LogResultFillingGap(t *testing.T) {
 	resp, err := h.Do(ctx, req1)
 	require.NoError(t, err)
 	require.Equal(t, emptyResponse(req1), resp)
+	checkCacheMetrics(0, 1)
+	require.Equal(t, 1, mockCache.NumKeyUpdates())
 
-	// although the caching code would request for more data than the actual query, we should have empty response here since we
-	// do not have any data for the query we made
+	// req2 should not update the cache since it has same length as previously cached query
 	resp, err = h.Do(ctx, req2)
 	require.NoError(t, err)
-	require.Equal(t, mergeLokiResponse(emptyResponse(req1), emptyResponse(req2)), resp)
+	require.Equal(t, emptyResponse(req2), resp)
+	checkCacheMetrics(1, 1)
+	require.Equal(t, 1, mockCache.NumKeyUpdates())
 
+	// req3 should update the cache since it has larger length than previously cached query
	resp, err = h.Do(ctx, req3)
 	require.NoError(t, err)
-	require.Equal(t, mergeLokiResponse(emptyResponse(req1), emptyResponse(req3)), resp)
+	require.Equal(t, emptyResponse(req3), resp)
+	checkCacheMetrics(2, 1)
+	require.Equal(t, 2, mockCache.NumKeyUpdates())
+
+	// req4 returns non-empty response so it should not update the cache
+	resp, err = h.Do(ctx, req4)
+	require.NoError(t, err)
+	require.Equal(t, nonEmptyResponse(req4, time.Unix(71, 0), time.Unix(79, 0), lblFooBar), resp)
+	checkCacheMetrics(3, 1)
+	require.Equal(t, 2, mockCache.NumKeyUpdates())
+
+	// req2 should return back empty response from the cache, without updating the cache
+	resp, err = h.Do(ctx, req2)
+	require.NoError(t, err)
+	require.Equal(t, emptyResponse(req2), resp)
+	checkCacheMetrics(4, 1)
+	require.Equal(t, 2, mockCache.NumKeyUpdates())
+
 
 	fake.AssertExpectations(t)
 }
```
‎pkg/storage/chunk/cache/mock.go

Lines changed: 12 additions & 1 deletion

```diff
@@ -7,7 +7,13 @@ import (
 	"github.com/grafana/loki/pkg/logqlmodel/stats"
 )
 
+type MockCache interface {
+	Cache
+	NumKeyUpdates() int
+}
+
 type mockCache struct {
+	numKeyUpdates int
 	sync.Mutex
 	cache map[string][]byte
 }
@@ -17,6 +23,7 @@ func (m *mockCache) Store(_ context.Context, keys []string, bufs [][]byte) error
 	defer m.Unlock()
 	for i := range keys {
 		m.cache[keys[i]] = bufs[i]
+		m.numKeyUpdates++
 	}
 	return nil
 }
@@ -43,8 +50,12 @@ func (m *mockCache) GetCacheType() stats.CacheType {
 	return "mock"
 }
 
+func (m *mockCache) NumKeyUpdates() int {
+	return m.numKeyUpdates
+}
+
 // NewMockCache makes a new MockCache.
-func NewMockCache() Cache {
+func NewMockCache() MockCache {
 	return &mockCache{
 		cache: map[string][]byte{},
 	}
```