Skip to content

Commit 8eca826

Browse files
rfrattoJStickler
andauthored
chore(storage/bloom): support simplifiable regexp matchers (#14622)
This adds support for basic regexps which can be simplified into a sequence of OR matchers, such as: * `key=~"value"` becomes `key="value"`. * `key=~"value1|value2"` becomes `key="value1"` or `key="value2"`. * `key=~".+"` checks for the presence of key. This is currently the only way to check if a key exists. Only the cases above are "officially" supported. However, we technically support basic concatenations and character classes due to how regexp/syntax parses and simplifies expressions such as `value1|value2` into `value[12]`. To prevent unbounded cardinality, we limit regexp expansion to 25 matchers; otherwise a regexp like `value[0-9][0-9][0-9][0-9]` would expand into 10,000 matchers (too many!). Closes grafana/loki-private#1106. Co-authored-by: J Stickler <julie.stickler@grafana.com>
1 parent 7b53f20 commit 8eca826

File tree

6 files changed

+425
-38
lines changed

6 files changed

+425
-38
lines changed

‎docs/sources/query/query_accceleration.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured
2626
Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria:
2727

2828
* The label filter expression uses **string equality**, such as `| key="value"`.
29+
* `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`.
30+
* _Basic_ regular expressions are automatically simplified into a supported expression:
31+
* `| key=~"value"` is converted to `| key="value"`.
32+
* `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`.
33+
* `| key=~".+"` checks for existence of `key`. `.*` is not supported.
2934
* The label filter expression is querying for structured metadata and not a stream label.
3035
* The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][].
3136

‎pkg/bloomgateway/processor_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func TestProcessor(t *testing.T) {
141141
}
142142

143143
matchers := []v1.LabelMatcher{
144-
v1.PlainLabelMatcher{
144+
v1.KeyValueMatcher{
145145
Key: "trace_id",
146146
Value: "nomatch",
147147
},
@@ -191,7 +191,7 @@ func TestProcessor(t *testing.T) {
191191
day: config.NewDayTime(truncateDay(now)),
192192
}
193193
matchers := []v1.LabelMatcher{
194-
v1.PlainLabelMatcher{
194+
v1.KeyValueMatcher{
195195
Key: "trace_id",
196196
Value: "nomatch",
197197
},
@@ -238,7 +238,7 @@ func TestProcessor(t *testing.T) {
238238
day: config.NewDayTime(truncateDay(now)),
239239
}
240240
matchers := []v1.LabelMatcher{
241-
v1.PlainLabelMatcher{
241+
v1.KeyValueMatcher{
242242
Key: "trace_id",
243243
Value: "nomatch",
244244
},

‎pkg/storage/bloom/v1/ast_extractor.go

Lines changed: 191 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
11
package v1
22

33
import (
4+
regexsyn "github.com/grafana/regexp/syntax"
5+
46
"github.com/prometheus/prometheus/model/labels"
57

68
"github.com/grafana/loki/v3/pkg/logql/log"
79
"github.com/grafana/loki/v3/pkg/logql/syntax"
10+
"github.com/grafana/loki/v3/pkg/util"
811
)
912

13+
// Simplifiable regexp expressions can quickly expand into very high
14+
// cardinality; we limit the number of matchers to prevent this. However,
15+
// since bloom tests are relatively cheap to test, we can afford to be a little
16+
// generous while still preventing excessive cardinality.
17+
//
18+
// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .. 9), while
19+
// `[0-9][0-9][0-9]` expands to 1000 matchers (000, 001, .., 999).
20+
const maxRegexMatchers = 200
21+
1022
// LabelMatcher represents bloom tests for key-value pairs, mapped from
1123
// LabelFilterExprs from the AST.
1224
type LabelMatcher interface{ isLabelMatcher() }
@@ -15,9 +27,13 @@ type LabelMatcher interface{ isLabelMatcher() }
1527
// mapped. Bloom tests for UnsupportedLabelMatchers must always pass.
1628
type UnsupportedLabelMatcher struct{}
1729

18-
// PlainLabelMatcher represents a direct key-value matcher. Bloom tests
19-
// must only pass if the key-value pair exists in the bloom.
20-
type PlainLabelMatcher struct{ Key, Value string }
30+
// KeyValueMatcher represents a direct key-value matcher. Bloom tests must only
31+
// pass if the key-value pair exists in the bloom.
32+
type KeyValueMatcher struct{ Key, Value string }
33+
34+
// KeyMatcher represents a key matcher. Bloom tests must only pass if the key
35+
// exists in the bloom.
36+
type KeyMatcher struct{ Key string }
2137

2238
// OrLabelMatcher represents a logical OR test. Bloom tests must only pass if
2339
// one of the Left or Right label matcher bloom tests pass.
@@ -54,21 +70,27 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
5470
switch filter := filter.(type) {
5571

5672
case *log.LineFilterLabelFilter:
57-
if filter.Type != labels.MatchEqual {
58-
return UnsupportedLabelMatcher{}
73+
if filter.Type == labels.MatchEqual {
74+
return KeyValueMatcher{
75+
Key: filter.Name,
76+
Value: filter.Value,
77+
}
78+
} else if filter.Type == labels.MatchRegexp {
79+
reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl)
80+
if err != nil {
81+
return UnsupportedLabelMatcher{}
82+
}
83+
return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify())
5984
}
6085

61-
return PlainLabelMatcher{
62-
Key: filter.Name,
63-
Value: filter.Value,
64-
}
86+
return UnsupportedLabelMatcher{}
6587

6688
case *log.StringLabelFilter:
6789
if filter.Type != labels.MatchEqual {
6890
return UnsupportedLabelMatcher{}
6991
}
7092

71-
return PlainLabelMatcher{
93+
return KeyValueMatcher{
7294
Key: filter.Name,
7395
Value: filter.Value,
7496
}
@@ -89,11 +111,169 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
89111
}
90112
}
91113

114+
// buildSimplifiedRegexMatcher builds a simplified label matcher from a regex.
115+
// reg may be mutated.
116+
func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher {
117+
switch reg.Op {
118+
case regexsyn.OpAlternate:
119+
util.ClearCapture(reg)
120+
121+
left := buildSimplifiedRegexMatcher(key, reg.Sub[0])
122+
if len(reg.Sub) == 1 {
123+
// This shouldn't be possible (even `warn|` has two subexpressions, where
124+
// the latter matches an empty string), but we have a length check here
125+
// anyway just to avoid a potential panic.
126+
return left
127+
}
128+
for _, sub := range reg.Sub[1:] {
129+
right := buildSimplifiedRegexMatcher(key, sub)
130+
left = OrLabelMatcher{Left: left, Right: right}
131+
}
132+
return left
133+
134+
case regexsyn.OpConcat:
135+
// OpConcat checks for the concatenation of two or more subexpressions. For
136+
// example, value1|value2 simplifies to value[12], with the two
137+
// subexpressions value and [12].
138+
//
139+
// We expand subexpressions back out into full matchers where possible, so
140+
// value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR
141+
// value2 .. OR value9.
142+
util.ClearCapture(reg)
143+
144+
matchers, ok := expandSubexpr(reg)
145+
if !ok || len(matchers) == 0 {
146+
return UnsupportedLabelMatcher{}
147+
}
148+
149+
var left LabelMatcher = KeyValueMatcher{Key: key, Value: matchers[0]}
150+
for _, matcher := range matchers[1:] {
151+
right := KeyValueMatcher{Key: key, Value: matcher}
152+
left = OrLabelMatcher{Left: left, Right: right}
153+
}
154+
return left
155+
156+
case regexsyn.OpCapture:
157+
util.ClearCapture(reg)
158+
return buildSimplifiedRegexMatcher(key, reg)
159+
160+
case regexsyn.OpLiteral:
161+
return KeyValueMatcher{
162+
Key: key,
163+
Value: string(reg.Rune),
164+
}
165+
166+
case regexsyn.OpPlus:
167+
if reg.Sub[0].Op == regexsyn.OpAnyChar || reg.Sub[0].Op == regexsyn.OpAnyCharNotNL { // .+
168+
return KeyMatcher{Key: key}
169+
}
170+
171+
return UnsupportedLabelMatcher{}
172+
173+
default:
174+
return UnsupportedLabelMatcher{}
175+
}
176+
}
177+
178+
func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) {
179+
switch reg.Op {
180+
case regexsyn.OpAlternate:
181+
util.ClearCapture(reg)
182+
183+
for _, sub := range reg.Sub {
184+
subPrefixes, ok := expandSubexpr(sub)
185+
if !ok {
186+
return nil, false
187+
} else if len(prefixes)+len(subPrefixes) > maxRegexMatchers {
188+
return nil, false
189+
}
190+
prefixes = append(prefixes, subPrefixes...)
191+
}
192+
return prefixes, true
193+
194+
case regexsyn.OpCharClass:
195+
// OpCharClass stores ranges of characters, so [12] is the range of bytes
196+
// []rune('1', '2'), while [15] is represented as []rune('1', '1', '5',
197+
// '5').
198+
//
199+
// To expand OpCharClass, we iterate over each pair of runes.
200+
if len(reg.Rune)%2 != 0 {
201+
// Invalid regexp; sequences should be even.
202+
return nil, false
203+
}
204+
205+
for i := 0; i < len(reg.Rune); i += 2 {
206+
start, end := reg.Rune[i+0], reg.Rune[i+1]
207+
for r := start; r <= end; r++ {
208+
prefixes = append(prefixes, string(r))
209+
if len(prefixes) > maxRegexMatchers {
210+
return nil, false
211+
}
212+
}
213+
}
214+
215+
return prefixes, true
216+
217+
case regexsyn.OpConcat:
218+
if len(reg.Sub) == 0 {
219+
return nil, false
220+
}
221+
222+
// We get the prefixes for each subexpression and then iteratively combine
223+
// them together.
224+
//
225+
// For the regexp [12][34]value (which concatenates [12], [34], and value):
226+
//
227+
// 1. We get the prefixes for [12], which are 1 and 2.
228+
// 2. We get the prefixes for [34], which are 3 and 4.
229+
// 3. We add the prefixes together to get 13, 14, 23, and 24.
230+
// 4. We get the prerfixes for value, which is value.
231+
// 5. Finally, we add the prefixes together to get 13value, 14value, 23value, and 24value.
232+
curPrefixes, ok := expandSubexpr(reg.Sub[0])
233+
if !ok {
234+
return nil, false
235+
}
236+
237+
for _, sub := range reg.Sub[1:] {
238+
subPrefixes, ok := expandSubexpr(sub)
239+
if !ok {
240+
return nil, false
241+
} else if len(curPrefixes)*len(subPrefixes) > maxRegexMatchers {
242+
return nil, false
243+
}
244+
245+
newPrefixes := make([]string, 0, len(curPrefixes)*len(subPrefixes))
246+
247+
for _, curPrefix := range curPrefixes {
248+
for _, subPrefix := range subPrefixes {
249+
newPrefixes = append(newPrefixes, curPrefix+subPrefix)
250+
}
251+
}
252+
253+
curPrefixes = newPrefixes
254+
}
255+
256+
return curPrefixes, true
257+
258+
case regexsyn.OpCapture:
259+
util.ClearCapture(reg)
260+
return expandSubexpr(reg)
261+
262+
case regexsyn.OpLiteral:
263+
prefixes = append(prefixes, string(reg.Rune))
264+
return prefixes, true
265+
266+
default:
267+
return nil, false
268+
}
269+
}
270+
92271
//
93272
// Implement marker types:
94273
//
95274

96275
func (UnsupportedLabelMatcher) isLabelMatcher() {}
97-
func (PlainLabelMatcher) isLabelMatcher() {}
276+
func (KeyValueMatcher) isLabelMatcher() {}
277+
func (KeyMatcher) isLabelMatcher() {}
98278
func (OrLabelMatcher) isLabelMatcher() {}
99279
func (AndLabelMatcher) isLabelMatcher() {}

0 commit comments

Comments
 (0)