Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
scheduler job completion
  • Loading branch information
owen-d committed Dec 10, 2024
commit d7e90621712aeb8d9bb4d55edb258e4037c1198e
37 changes: 19 additions & 18 deletions pkg/blockbuilder/scheduler/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,31 +159,32 @@ func (q *JobQueue) MarkComplete(id string, status types.JobStatus) {
q.mu.Lock()
defer q.mu.Unlock()

jobMeta, ok := q.inProgress[id]
jobMeta, ok := q.existsLockLess(id)
if !ok {
level.Error(q.logger).Log("msg", "failed to mark job as complete", "job", id, "status", status)
return
}

// Update metadata for completion
jobMeta.Status = status
jobMeta.UpdateTime = time.Now()

// Add to completed buffer
if old, evicted := q.completed.Push(jobMeta); evicted {
// If the buffer is full, evict the oldest job and remove it from the status map to avoid leaks
delete(q.statusMap, old.ID())
switch jobMeta.Status {
case types.JobStatusInProgress:
// update & remove from in progress
delete(q.inProgress, id)
case types.JobStatusPending:
_, ok := q.pending.Remove(id)
if !ok {
level.Error(q.logger).Log("msg", "failed to remove job from pending queue", "job", id)
}
default:
level.Error(q.logger).Log("msg", "unknown job status, cannot mark as complete", "job", id, "status", status)
}

// Update status map and clean up
q.statusMap[id] = status
delete(q.inProgress, id)
jobMeta.Status = status
jobMeta.UpdateTime = time.Now()

// If the job failed, re-enqueue it with its original priority
if status == types.JobStatusFailed {
// Create new metadata for the re-enqueued job
newJobMeta := NewJobWithMetadata(jobMeta.Job, jobMeta.Priority)
q.pending.Push(newJobMeta)
q.statusMap[id] = types.JobStatusPending
// add it to the completed buffer, removing any evicted job from the statusMap
removal, evicted := q.completed.Push(jobMeta)
if evicted {
delete(q.statusMap, removal.ID())
}
}

Expand Down
23 changes: 19 additions & 4 deletions pkg/blockbuilder/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,17 +214,32 @@ func (s *BlockScheduler) HandleGetJob(ctx context.Context) (*types.Job, bool, er
}
}

func (s *BlockScheduler) HandleCompleteJob(_ context.Context, job *types.Job, success bool) error {
func (s *BlockScheduler) HandleCompleteJob(ctx context.Context, job *types.Job, success bool) (err error) {
logger := log.With(s.logger, "job", job.ID())

if success {
level.Info(logger).Log("msg", "job completed successfully")
s.queue.MarkComplete(job.ID(), types.JobStatusComplete)
return nil
// TODO(owen-d): do i need to increment offset here?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we need to commit job.Offsets().Max-1 given builders only consume upto but not including Max

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good

if err = s.offsetManager.Commit(
ctx,
job.Partition(),
job.Offsets().Max,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next jobs would start with job.Offsets().Max

Suggested change
job.Offsets().Max,
job.Offsets().Max-1,
); err == nil {
s.queue.MarkComplete(job.ID(), types.JobStatusComplete)
level.Info(logger).Log("msg", "job completed successfully")
return nil
}

level.Error(logger).Log("msg", "failed to commit offset", "err", err)
}

level.Error(logger).Log("msg", "job failed, re-enqueuing")
s.queue.MarkComplete(job.ID(), types.JobStatusFailed)
s.queue.pending.Push(
NewJobWithMetadata(
job,
DefaultPriority,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't failed jobs instead get a higher priority?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I didn't want to complicate the PR further by adding logic for priority discovery on retries. I think this is best left for a followup PR.

),
)
return nil
}

Expand Down
28 changes: 27 additions & 1 deletion pkg/blockbuilder/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/twmb/franz-go/pkg/kadm"

"github.com/grafana/loki/v3/pkg/blockbuilder/types"
"github.com/grafana/loki/v3/pkg/kafka/partition"
)

type testEnv struct {
Expand All @@ -18,9 +20,33 @@ type testEnv struct {
builder *Worker
}

type mockOffsetManager struct {
topic string
consumerGroup string
}

func (m *mockOffsetManager) Topic() string { return m.topic }
func (m *mockOffsetManager) ConsumerGroup() string { return m.consumerGroup }
func (m *mockOffsetManager) GroupLag(ctx context.Context, lookbackPeriod time.Duration) (map[int32]kadm.GroupMemberLag, error) {
return nil, nil
}
func (m *mockOffsetManager) FetchLastCommittedOffset(ctx context.Context, partition int32) (int64, error) {
return 0, nil
}
func (m *mockOffsetManager) FetchPartitionOffset(ctx context.Context, partition int32, position partition.SpecialOffset) (int64, error) {
return 0, nil
}
func (m *mockOffsetManager) Commit(ctx context.Context, partition int32, offset int64) error {
return nil
}

func newTestEnv(builderID string) *testEnv {
queue := NewJobQueue()
scheduler := NewScheduler(Config{}, queue, nil, log.NewNopLogger(), prometheus.NewRegistry())
mockOffsetMgr := &mockOffsetManager{
topic: "test-topic",
consumerGroup: "test-group",
}
scheduler := NewScheduler(Config{}, queue, mockOffsetMgr, log.NewNopLogger(), prometheus.NewRegistry())
transport := types.NewMemoryTransport(scheduler)
builder := NewWorker(builderID, transport)

Expand Down