
Commit cbdd36a

owen-d and ashwanthgoli authored
feat: blockbuilder component (#14621)
Signed-off-by: Owen Diehl <ow.diehl@gmail.com>
Co-authored-by: Ashwanth Goli <iamashwanth@gmail.com>
1 parent d3d31f1 commit cbdd36a

22 files changed: +2300 / -45 lines


docs/sources/shared/configuration.md

Lines changed: 51 additions & 0 deletions
@@ -137,6 +137,57 @@ Pass the `-config.expand-env` flag at the command line to enable this way of set
 # itself to a key value store.
 [ingester: <ingester>]
 
+block_builder:
+  # How many flushes can happen concurrently
+  # CLI flag: -blockbuilder.concurrent-flushes
+  [concurrent_flushes: <int> | default = 1]
+
+  # How many workers to process writes, defaults to number of available cpus
+  # CLI flag: -blockbuilder.concurrent-writers
+  [concurrent_writers: <int> | default = 1]
+
+  # The targeted _uncompressed_ size in bytes of a chunk block. When this
+  # threshold is exceeded the head block will be cut and compressed inside the
+  # chunk.
+  # CLI flag: -blockbuilder.chunks-block-size
+  [chunk_block_size: <int> | default = 256KB]
+
+  # A target _compressed_ size in bytes for chunks. This is a desired size not
+  # an exact size, chunks may be slightly bigger or significantly smaller if
+  # they get flushed for other reasons (e.g. chunk_idle_period). A value of 0
+  # creates chunks with a fixed 10 blocks, a non zero value will create chunks
+  # with a variable number of blocks to meet the target size.
+  # CLI flag: -blockbuilder.chunk-target-size
+  [chunk_target_size: <int> | default = 1536KB]
+
+  # The algorithm to use for compressing chunks. (none, gzip, lz4-64k, snappy,
+  # lz4-256k, lz4-1M, lz4, flate, zstd)
+  # CLI flag: -blockbuilder.chunk-encoding
+  [chunk_encoding: <string> | default = "snappy"]
+
+  # The maximum duration of a timeseries chunk in memory. If a timeseries runs
+  # for longer than this, the current chunk will be flushed to the store and a
+  # new chunk created.
+  # CLI flag: -blockbuilder.max-chunk-age
+  [max_chunk_age: <duration> | default = 2h]
+
+  # The interval at which to run.
+  # CLI flag: -blockbuilder.interval
+  [interval: <duration> | default = 10m]
+
+  backoff_config:
+    # Minimum delay when backing off.
+    # CLI flag: -blockbuilder.backoff..backoff-min-period
+    [min_period: <duration> | default = 100ms]
+
+    # Maximum delay when backing off.
+    # CLI flag: -blockbuilder.backoff..backoff-max-period
+    [max_period: <duration> | default = 10s]
+
+    # Number of times to backoff and retry before failing.
+    # CLI flag: -blockbuilder.backoff..backoff-retries
+    [max_retries: <int> | default = 10]
+
 pattern_ingester:
   # Whether the pattern ingester is enabled.
   # CLI flag: -pattern-ingester.enabled
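
The `backoff_config` block above controls how the block builder retries transient failures; the controller code later in this commit consumes such settings as a dskit `backoff.Config`. As a rough illustration of what `min_period`, `max_period`, and `max_retries` translate to at runtime, here is a minimal, self-contained sketch of a dskit-style retry loop; the `doSomething` function and the literal values are illustrative placeholders, not part of the commit:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/grafana/dskit/backoff"
)

func main() {
	// Mirrors block_builder.backoff_config: min_period, max_period, max_retries.
	cfg := backoff.Config{
		MinBackoff: 100 * time.Millisecond,
		MaxBackoff: 10 * time.Second,
		MaxRetries: 10,
	}

	// Placeholder operation that always fails, to exercise the retry loop.
	doSomething := func() error { return errors.New("transient failure") }

	b := backoff.New(context.Background(), cfg)
	for b.Ongoing() {
		if err := doSomething(); err == nil {
			fmt.Println("succeeded after", b.NumRetries(), "retries")
			return
		}
		// Sleeps for an exponentially growing, jittered delay bounded by
		// MinBackoff/MaxBackoff; Ongoing() turns false after MaxRetries
		// or when the context is cancelled.
		b.Wait()
	}
	fmt.Println("gave up:", b.Err())
}
```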

pkg/blockbuilder/controller.go

Lines changed: 305 additions & 0 deletions
@@ -0,0 +1,305 @@
package blockbuilder

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/prometheus/model/labels"

	"github.com/grafana/dskit/backoff"

	"github.com/grafana/loki/v3/pkg/kafka"
	"github.com/grafana/loki/v3/pkg/kafka/partition"

	"github.com/grafana/loki/pkg/push"
)

// [min,max)
type Offsets struct {
	Min, Max int64
}

type Job struct {
	Partition int32
	Offsets   Offsets
}

// Interface required for interacting with queue partitions.
type PartitionController interface {
	Topic() string
	Partition() int32
	// Returns the highest committed offset from the consumer group
	HighestCommittedOffset(ctx context.Context) (int64, error)
	// Returns the highest available offset in the partition
	HighestPartitionOffset(ctx context.Context) (int64, error)
	// Returns the earliest available offset in the partition
	EarliestPartitionOffset(ctx context.Context) (int64, error)
	// Commits the offset to the consumer group.
	Commit(context.Context, int64) error
	// Process will run load batches at a time and send them to channel,
	// so it's advised to not buffer the channel for natural backpressure.
	// As a convenience, it returns the last seen offset, which matches
	// the final record sent on the channel.
	Process(context.Context, Offsets, chan<- []AppendInput) (int64, error)

	Close() error
}

// PartitionJobController loads a single job a time, bound to a given
// * topic
// * partition
// * offset_step_len: the number of offsets each job to contain. e.g. "10" could yield a job w/ min=15, max=25
//
// At a high level, it watches a source topic/partition (where log data is ingested) and a "committed" topic/partition.
// The "committed" partition corresponds to the offsets from the source partition which have been committed to object storage.
// In essence, the following loop is performed
//  1. load the most recent record from the "committed" partition. This contains the highest msg offset in the "source" partition
//     that has been committed to object storage. We'll call that $START_POS.
//  2. Create a job with `min=$START_POS+1,end=$START_POS+1+$STEP_LEN`
//  3. Sometime later when the job has been processed, we'll commit the final processed offset from the "source" partition (which
//     will be <= $END_POS) to the "committed" partition.
//
// NB(owen-d): In our case, "source" is the partition
//
//	containing log data and "committed" is the consumer group
type PartitionJobController struct {
	stepLen int64
	part    partition.ReaderIfc
	backoff backoff.Config
	decoder *kafka.Decoder
}

func NewPartitionJobController(
	controller partition.ReaderIfc,
	backoff backoff.Config,
) (*PartitionJobController, error) {
	decoder, err := kafka.NewDecoder()
	if err != nil {
		return nil, err
	}
	return &PartitionJobController{
		stepLen: 1000, // Default step length of 1000 offsets per job
		part:    controller,
		backoff: backoff,
		decoder: decoder,
	}, nil
}

func (l *PartitionJobController) HighestCommittedOffset(ctx context.Context) (int64, error) {
	return withBackoff(
		ctx,
		l.backoff,
		func() (int64, error) {
			return l.part.FetchLastCommittedOffset(ctx)
		},
	)
}

func (l *PartitionJobController) HighestPartitionOffset(ctx context.Context) (int64, error) {
	return withBackoff(
		ctx,
		l.backoff,
		func() (int64, error) {
			return l.part.FetchPartitionOffset(ctx, partition.KafkaEndOffset)
		},
	)
}

func (l *PartitionJobController) EarliestPartitionOffset(ctx context.Context) (int64, error) {
	return withBackoff(
		ctx,
		l.backoff,
		func() (int64, error) {
			return l.part.FetchPartitionOffset(ctx, partition.KafkaStartOffset)
		},
	)
}

func (l *PartitionJobController) Process(ctx context.Context, offsets Offsets, ch chan<- []AppendInput) (int64, error) {
	l.part.SetOffsetForConsumption(offsets.Min)

	var (
		lastOffset = offsets.Min - 1
		boff       = backoff.New(ctx, l.backoff)
		err        error
	)

	for boff.Ongoing() {
		var records []partition.Record
		records, err = l.part.Poll(ctx)
		if err != nil {
			boff.Wait()
			continue
		}

		if len(records) == 0 {
			// No more records available
			break
		}

		// Reset backoff on successful poll
		boff.Reset()

		converted := make([]AppendInput, 0, len(records))
		for _, record := range records {
			offset := records[len(records)-1].Offset
			if offset >= offsets.Max {
				break
			}
			lastOffset = offset

			stream, labels, err := l.decoder.Decode(record.Content)
			if err != nil {
				return 0, fmt.Errorf("failed to decode record: %w", err)
			}
			if len(stream.Entries) == 0 {
				continue
			}

			converted = append(converted, AppendInput{
				tenant:    record.TenantID,
				labels:    labels,
				labelsStr: stream.Labels,
				entries:   stream.Entries,
			})

			select {
			case ch <- converted:
			case <-ctx.Done():
				return 0, ctx.Err()
			}
		}
	}

	return lastOffset, err
}

// LoadJob(ctx) returns the next job by finding the most recent unconsumed offset in the partition
// Returns whether an applicable job exists, the job, and an error
func (l *PartitionJobController) LoadJob(ctx context.Context) (bool, Job, error) {
	// Read the most recent committed offset
	committedOffset, err := l.HighestCommittedOffset(ctx)
	if err != nil {
		return false, Job{}, err
	}

	earliestOffset, err := l.EarliestPartitionOffset(ctx)
	if err != nil {
		return false, Job{}, err
	}

	startOffset := committedOffset + 1
	if startOffset < earliestOffset {
		startOffset = earliestOffset
	}

	highestOffset, err := l.HighestPartitionOffset(ctx)
	if err != nil {
		return false, Job{}, err
	}
	if highestOffset == committedOffset {
		return false, Job{}, nil
	}

	// Create the job with the calculated offsets
	job := Job{
		Partition: l.part.Partition(),
		Offsets: Offsets{
			Min: startOffset,
			Max: min(startOffset+l.stepLen, highestOffset),
		},
	}

	return true, job, nil
}

// implement a dummy controller which can be parameterized to
// deterministically simulate partitions
type dummyPartitionController struct {
	topic            string
	partition        int32
	committed        int64
	highest          int64
	numTenants       int // number of unique tenants to simulate
	streamsPerTenant int // number of streams per tenant
	entriesPerOffset int // coefficient for entries per offset
}

// used in testing
// nolint:revive
func NewDummyPartitionController(topic string, partition int32, highest int64) *dummyPartitionController {
	return &dummyPartitionController{
		topic:            topic,
		partition:        partition,
		committed:        0, // always starts at zero
		highest:          highest,
		numTenants:       2, // default number of tenants
		streamsPerTenant: 2, // default streams per tenant
		entriesPerOffset: 1, // default entries per offset coefficient
	}
}

func (d *dummyPartitionController) Topic() string {
	return d.topic
}

func (d *dummyPartitionController) Partition() int32 {
	return d.partition
}

func (d *dummyPartitionController) HighestCommittedOffset(_ context.Context) (int64, error) {
	return d.committed, nil
}

func (d *dummyPartitionController) HighestPartitionOffset(_ context.Context) (int64, error) {
	return d.highest, nil
}

func (d *dummyPartitionController) Commit(_ context.Context, offset int64) error {
	d.committed = offset
	return nil
}

func (d *dummyPartitionController) Process(ctx context.Context, offsets Offsets, ch chan<- []AppendInput) (int64, error) {
	for i := int(offsets.Min); i < int(offsets.Max); i++ {
		batch := d.createBatch(i)
		select {
		case <-ctx.Done():
			return int64(i - 1), ctx.Err()
		case ch <- batch:
		}
	}
	return offsets.Max - 1, nil
}

// creates (tenants*streams) inputs
func (d *dummyPartitionController) createBatch(offset int) []AppendInput {
	result := make([]AppendInput, 0, d.numTenants*d.streamsPerTenant)
	for i := 0; i < d.numTenants; i++ {
		tenant := fmt.Sprintf("tenant-%d", i)
		for j := 0; j < d.streamsPerTenant; j++ {
			lbls := labels.Labels{
				{Name: "stream", Value: fmt.Sprintf("stream-%d", j)},
			}
			entries := make([]push.Entry, d.entriesPerOffset)
			for k := 0; k < d.entriesPerOffset; k++ {
				entries[k] = push.Entry{
					Timestamp: time.Now(),
					Line:      fmt.Sprintf("tenant=%d stream=%d line=%d offset=%d", i, j, k, offset),
				}
			}
			result = append(result, AppendInput{
				tenant:    tenant,
				labels:    lbls,
				labelsStr: lbls.String(),
				entries:   entries,
			})
		}
	}
	return result
}

func (d *dummyPartitionController) Close() error {
	return nil
}
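
The `withBackoff` helper called by `HighestCommittedOffset`, `HighestPartitionOffset`, and `EarliestPartitionOffset` is defined elsewhere in the package and is not part of this file's diff. A minimal sketch of what such a helper could look like, assuming a generic wrapper around dskit's backoff loop (the real helper in the commit may be shaped differently):

```go
package blockbuilder

import (
	"context"

	"github.com/grafana/dskit/backoff"
)

// Hypothetical sketch only: retry fn until it succeeds or the backoff policy
// gives up, returning the zero value of T on failure.
func withBackoff[T any](
	ctx context.Context,
	cfg backoff.Config,
	fn func() (T, error),
) (T, error) {
	var zero T

	boff := backoff.New(ctx, cfg)
	for boff.Ongoing() {
		res, err := fn()
		if err != nil {
			boff.Wait()
			continue
		}
		return res, nil
	}

	return zero, boff.Err()
}
```

`LoadJob` itself reduces to a small piece of offset arithmetic over the half-open window `[min, max)`: start one past the last committed offset, clamp up to the earliest offset still retained in the partition, and cap the window at `stepLen` offsets or the partition's end offset, whichever is smaller. The standalone restatement below walks through that arithmetic with concrete numbers; `nextJobWindow` is an illustrative helper, not part of the commit:

```go
package main

import "fmt"

// nextJobWindow restates LoadJob's window calculation for illustration only.
func nextJobWindow(committed, earliest, highest, stepLen int64) (lo, hi int64, ok bool) {
	if highest == committed {
		return 0, 0, false // nothing new to consume
	}
	start := committed + 1
	if start < earliest {
		start = earliest // retention may have removed offsets below `earliest`
	}
	end := start + stepLen
	if end > highest {
		end = highest
	}
	return start, end, true
}

func main() {
	// committed=499, earliest=0, highest=10000, stepLen=1000
	// => the next job covers offsets [500, 1500).
	lo, hi, ok := nextJobWindow(499, 0, 10000, 1000)
	fmt.Println(lo, hi, ok) // 500 1500 true
}
```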
