Commit 41fafd8

feat: Apply patterns line length limit to json message key (#14296)
1 parent 1a4436c commit 41fafd8

3 files changed: +59 -24 lines changed

pkg/pattern/drain/drain.go

Lines changed: 3 additions & 6 deletions
@@ -153,11 +153,11 @@ func New(config *Config, format string, metrics *Metrics) *Drain {
 	var tokenizer LineTokenizer
 	switch format {
 	case FormatJSON:
-		tokenizer = newJSONTokenizer(config.ParamString)
+		tokenizer = newJSONTokenizer(config.ParamString, config.MaxAllowedLineLength)
 	case FormatLogfmt:
-		tokenizer = newLogfmtTokenizer(config.ParamString)
+		tokenizer = newLogfmtTokenizer(config.ParamString, config.MaxAllowedLineLength)
 	default:
-		tokenizer = newPunctuationTokenizer()
+		tokenizer = newPunctuationTokenizer(config.MaxAllowedLineLength)
 	}

 	d.idToCluster = createLogClusterCache(config.MaxClusters, func(int, *LogCluster) {
@@ -206,9 +206,6 @@ func (d *Drain) Train(content string, ts int64) *LogCluster {
 	if !d.limiter.Allow() {
 		return nil
 	}
-	if len(content) > d.config.MaxAllowedLineLength {
-		return nil
-	}
 	d.tokens, d.state = d.tokenizer.Tokenize(content, d.tokens, d.state)
 	return d.train(d.tokens, d.state, ts)
 }
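
Taken together with the tokenizer changes below, the effect of this file is that the per-line length gate moves out of Drain.Train and into each tokenizer's Tokenize. A minimal sketch of the new contract, written as a hypothetical test in the drain package (the constructors are unexported; it uses strings, testing, and testify's require like the existing tests). The test name and the 80-byte cap are illustrative, not values from this commit:

func TestTokenizerEnforcesLineLimit(t *testing.T) { // hypothetical name
	tok := newPunctuationTokenizer(80) // illustrative cap

	// 100 bytes exceeds the cap, so Tokenize short-circuits where
	// Drain.Train used to: it returns no tokens and no state, and
	// Train's subsequent d.tokenizer.Tokenize call yields nothing
	// to cluster.
	tokens, state := tok.Tokenize(strings.Repeat("x", 100), nil, nil)

	require.Nil(t, tokens)
	require.Nil(t, state)
}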

pkg/pattern/drain/line_tokenizer.go

Lines changed: 29 additions & 10 deletions
@@ -37,9 +37,10 @@ func (spacesTokenizer) Clone(tokens []string, _ interface{}) ([]string, interface{}) {
 type punctuationTokenizer struct {
 	includeDelimiters [128]rune
 	excludeDelimiters [128]rune
+	maxLineLength     int
 }

-func newPunctuationTokenizer() *punctuationTokenizer {
+func newPunctuationTokenizer(maxLineLength int) *punctuationTokenizer {
 	var included [128]rune
 	var excluded [128]rune
 	included['='] = 1
@@ -51,10 +52,15 @@ func newPunctuationTokenizer() *punctuationTokenizer {
 	return &punctuationTokenizer{
 		includeDelimiters: included,
 		excludeDelimiters: excluded,
+		maxLineLength:     maxLineLength,
 	}
 }

 func (p *punctuationTokenizer) Tokenize(line string, tokens []string, state interface{}) ([]string, interface{}) {
+	if len(line) > p.maxLineLength {
+		return nil, nil
+	}
+
 	if cap(tokens) == 0 {
 		tokens = make([]string, 0, 128)
 	}
@@ -190,18 +196,24 @@ func (splittingTokenizer) Clone(tokens []string, state interface{}) ([]string, interface{}) {
 }

 type logfmtTokenizer struct {
-	dec        *logfmt.Decoder
-	varReplace string
+	dec           *logfmt.Decoder
+	varReplace    string
+	maxLineLength int
 }

-func newLogfmtTokenizer(varReplace string) *logfmtTokenizer {
+func newLogfmtTokenizer(varReplace string, maxLineLength int) *logfmtTokenizer {
 	return &logfmtTokenizer{
-		dec:        logfmt.NewDecoder(nil),
-		varReplace: varReplace,
+		dec:           logfmt.NewDecoder(nil),
+		varReplace:    varReplace,
+		maxLineLength: maxLineLength,
 	}
 }

 func (t *logfmtTokenizer) Tokenize(line string, tokens []string, _ interface{}) ([]string, interface{}) {
+	if len(line) > t.maxLineLength {
+		return nil, nil
+	}
+
 	if cap(tokens) == 0 {
 		tokens = make([]string, 0, 64)
 	}
@@ -251,11 +263,12 @@ func (t *logfmtTokenizer) Clone(tokens []string, _ interface{}) ([]string, interface{}) {

 type jsonTokenizer struct {
 	*punctuationTokenizer
-	varReplace string
+	varReplace    string
+	maxLineLength int
 }

-func newJSONTokenizer(varReplace string) *jsonTokenizer {
-	return &jsonTokenizer{newPunctuationTokenizer(), varReplace}
+func newJSONTokenizer(varReplace string, maxLineLength int) *jsonTokenizer {
+	return &jsonTokenizer{newPunctuationTokenizer(maxLineLength), varReplace, maxLineLength}
 }

 func (t *jsonTokenizer) Tokenize(line string, tokens []string, state interface{}) ([]string, interface{}) {
@@ -272,7 +285,13 @@ func (t *jsonTokenizer) Tokenize(line string, tokens []string, state interface{}) ([]string, interface{}) {
 		return nil, nil
 	}

-	return t.punctuationTokenizer.Tokenize(unsafeString(found), tokens, state)
+	foundLine := unsafeString(found)
+
+	if len(foundLine) > t.maxLineLength {
+		return nil, nil
+	}
+
+	return t.punctuationTokenizer.Tokenize(foundLine, tokens, state)
 }

 func (t *jsonTokenizer) Join(tokens []string, state interface{}) string {
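
The json path is the one the commit title is about: the new check runs against foundLine, the message value extracted earlier in Tokenize (the tests below use a msg key), rather than only against the raw JSON envelope. A hedged sketch of that behavior as a hypothetical drain-package test; the 64-byte cap and the <_> placeholder are illustrative, and only the constructor and Tokenize signatures come from this diff:

func TestJSONLimitAppliesToMessageValue(t *testing.T) { // hypothetical name
	tok := newJSONTokenizer("<_>", 64) // illustrative placeholder and cap

	// The msg value alone is 100 bytes, so the len(foundLine) check
	// returns nil, nil before the punctuation pass ever runs.
	line := `{"msg":"` + strings.Repeat("x", 100) + `"}`

	tokens, state := tok.Tokenize(line, nil, nil)
	require.Nil(t, tokens)
	require.Nil(t, state)
}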

pkg/pattern/drain/line_tokenizer_test.go

Lines changed: 27 additions & 8 deletions
@@ -115,6 +115,14 @@ var testCases = []TestCase{
 			typeSplitting: {`!@£$%^&*()`},
 		},
 	},
+	{
+		name: "line length greater than max allowed length",
+		line: `09:17:38.033366 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics:  service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_counter.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_Metric.action_is_drop.reason_is_queue_full 0 1717060658 userid invalid`,
+		want: map[string][]string{
+			typePunctuation: []string(nil),
+			typeSplitting:   {`09:`, `17:`, `38.033366`, `▶`, `INFO`, ``, `route`, `ops`, `sending`, `to`, `dest`, `https:`, `//graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics:`, ``, `service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_counter.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_Metric.action_is_drop.reason_is_queue_full`, `0`, `1717060658`, `userid`, `invalid`},
+		},
+	},
 }

@@ -124,7 +132,7 @@ func TestTokenizer_Tokenize(t *testing.T) {
 	}{
 		{
 			name:      typePunctuation,
-			tokenizer: newPunctuationTokenizer(),
+			tokenizer: newPunctuationTokenizer(360),
 		},
 		{
 			name:      typeSplitting,
@@ -149,7 +157,7 @@ func TestTokenizer_TokenizeAndJoin(t *testing.T) {
 	}{
 		{
 			name:      typePunctuation,
-			tokenizer: newPunctuationTokenizer(),
+			tokenizer: newPunctuationTokenizer(DefaultConfig().MaxAllowedLineLength),
 		},
 		{
 			name:      typeSplitting,
@@ -168,7 +176,7 @@ func TestTokenizer_TokenizeAndJoin(t *testing.T) {
 }

 func BenchmarkSplittingTokenizer(b *testing.B) {
-	tokenizer := newPunctuationTokenizer()
+	tokenizer := newPunctuationTokenizer(DefaultConfig().MaxAllowedLineLength)

 	for _, tt := range testCases {
 		tc := tt
@@ -213,9 +221,13 @@ func TestLogFmtTokenizer(t *testing.T) {
 			line: `logger=sqlstore.metrics traceID=c933fefbe893411d3be8e1648d6bcf37 t=2024-07-10T16:00:15.564896897Z level=debug msg="query finished" status=success elapsedtime=1.324305ms <REDACTED> error=null`,
 			want: []string{"logger", "sqlstore.metrics", "traceID", "<_>", "t", "<_>", "level", "debug", "msg", "query finished", "status", "success", "elapsedtime", "1.324305ms", "<REDACTED>", "", "error", "null"},
 		},
+		{
+			line: `ts=2024-05-30T12:50:36.648377186Z caller=scheduler_processor.go:143 level=warn msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"error reading server preface: EOF\"" addr=10.0.151.101:9095 ip=127.0.0.1 userid=1234456`,
+			want: []string(nil),
+		},
 	}

-	tokenizer := newLogfmtTokenizer(param)
+	tokenizer := newLogfmtTokenizer(param, 250)

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -268,7 +280,7 @@ func TestLogFmtTokenizerJoin(t *testing.T) {
 		},
 	}

-	tokenizer := newLogfmtTokenizer("")
+	tokenizer := newLogfmtTokenizer("", DefaultConfig().MaxAllowedLineLength)

 	for _, tt := range tests {
 		t.Run("", func(t *testing.T) {
@@ -306,16 +318,23 @@ func TestJsonTokenizer(t *testing.T) {
 			want:    []string{"successfully", "discovered", "15", "agent", "IP", "addresses"},
 			pattern: "<_>successfully discovered 15 agent IP addresses<_>",
 		},
+		{
+			line:    `{"msg":{"actor":{"alternateId":"foo@grafana.com","displayName":"Foo bar","id":"dq23","type":"User"},"authenticationContext":{"authenticationStep":0,"externalSessionId":"123d"},"client":{"device":"Computer","geographicalContext":{"city":"Berlin","country":"DE","state":"Land Berlin"},"ipAddress":"0.0.0.0","userAgent":{"browser":"CHROME","os":"Mac OS X","rawUserAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"},"zone":"null"},"debugContext":{"debugData":{"authMethodFirstEnrollment":"123","authMethodFirstType":"foo","authMethodFirstVerificationTime":"2024-07-02T11:28:03.219Z","authMethodSecondEnrollment":"var","authMethodSecondType":"ddd","authMethodSecondVerificationTime":"2024-07-03T06:59:09.151Z","authnRequestId":"1","dtHash":"1","logOnlySecurityData":"{\"risk\":{\"level\":\"LOW\"},\"behaviors\":{\"New Geo-Location\":\"NEGATIVE\",\"New Device\":\"NEGATIVE\",\"New IP\":\"NEGATIVE\",\"New State\":\"NEGATIVE\",\"New Country\":\"NEGATIVE\",\"Velocity\":\"NEGATIVE\",\"New City\":\"NEGATIVE\"}}","requestId":"1","threatSuspected":"false","url":"/foo?"}},"displayMessage":"Evaluation of sign-on policy","eventType":"policy.evaluate_sign_on","legacyEventType":"app.oauth2.token.grant.refresh_token_success","outcome":{"reason":"Sign-on policy evaluation resulted in AUTHENTICATED","result":"ALLOW"},"published":"2024-07-03T09:19:59.973Z","request":{"ipChain":[{"geographicalContext":{"city":"Berlin","country":"Germany","geolocation":{"lat":52.5363,"lon":13.4169},"postalCode":"10435","state":"Land Berlin"},"ip":"95.90.234.241","version":"V4"}]},"securityContext":{"asNumber":3209,"asOrg":"kabel deutschland breitband customer 19","domain":"kabel-deutschland.de","isProxy":false,"isp":"vodafone gmbh"},"severity":"INFO","target":[{"alternateId":"Salesforce.com","detailEntry":{"signOnModeEvaluationResult":"AUTHENTICATED","signOnModeType":"SAML_2_0"},"displayName":"Salesforce.com","id":"0oa5sfmj3hz0mTgoW357","type":"AppInstance"},{"alternateId":"unknown","detailEntry":{"policyRuleFactorMode":"2FA"},"displayName":"Catch-all Rule","id":"1","type":"Rule"}],"transaction":{"detail":{},"id":"1","type":"WEB"},"context":[{"repo":{"id":27826205,"name":"hermanwahyudi/selenium","url":"https://api.github.com/repos/hermanwahyudi/selenium"},"payload":{"push_id":536863976,"size":1,"distinct_size":0,"ref":"refs/heads/master","head":"1b58dd4c4e14ea9cf5212b981774bd448a266c3c","before":"20b10e3a605bd177efff62f1130943774ac07bf3","commits":[{"sha":"1b58dd4c4e14ea9cf5212b981774bd448a266c3c","author":{"email":"2bb20d8a71fb7adbc1d6239cc9ff4130f26819dc@gmail.com","name":"Herman"},"message":"Update README.md","distinct":false,"url":"https://api.github.com/repos/hermanwahyudi/selenium/commits/1b58dd4c4e14ea9cf5212b981774bd448a266c3c"}]}},{"repo":{"id":27826205,"name":"hermanwahyudi/selenium","url":"https://api.github.com/repos/hermanwahyudi/selenium"},"payload":{"push_id":536863976,"size":1,"distinct_size":0,"ref":"refs/heads/master","head":"1b58dd4c4e14ea9cf5212b981774bd448a266c3c","before":"20b10e3a605bd177efff62f1130943774ac07bf3","commits":[{"sha":"1b58dd4c4e14ea9cf5212b981774bd448a266c3c","author":{"email":"2bb20d8a71fb7adbc1d6239cc9ff4130f26819dc@gmail.com","name":"Herman"},"message":"Update README.md","distinct":false,"url":"https://api.github.com/repos/hermanwahyudi/selenium/commits/1b58dd4c4e14ea9cf5212b981774bd448a266c3c"}]}}],"uuid":"1","version":"0"},"level":"info","type":"received event","time":"2024-07-03T09:19:59Z"}`,
+		want:    []string(nil),
+		pattern: "",
+		},
 	}

-	tokenizer := newJSONTokenizer(param)
+	tokenizer := newJSONTokenizer(param, DefaultConfig().MaxAllowedLineLength)

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			got, state := tokenizer.Tokenize(tt.line, nil, nil)
 			require.Equal(t, tt.want, got)
-			pattern := tokenizer.Join(got, state)
-			require.Equal(t, tt.pattern, pattern)
+			if len(got) == len(tt.want) && len(tt.want) != 0 {
+				pattern := tokenizer.Join(got, state)
+				require.Equal(t, tt.pattern, pattern)
+			}
 		})
 	}
 }
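
For reference, TestLogFmtTokenizer above pins the cap at 250 bytes, just under the length of the added scheduler line, to exercise the drop path. The same behavior at a smaller, illustrative cap, again as a hypothetical test that assumes nothing beyond the constructor and Tokenize signatures in this diff:

func TestLogfmtLimitDropsLongLines(t *testing.T) { // hypothetical name
	tok := newLogfmtTokenizer("<_>", 32) // illustrative cap

	// Under the cap: tokenized into alternating key/value tokens.
	got, _ := tok.Tokenize(`level=info msg=ok`, nil, nil)
	require.NotEmpty(t, got)

	// Over the cap (100 bytes > 32): dropped before decoding starts.
	dropped, _ := tok.Tokenize(strings.Repeat("key=value ", 10), nil, nil)
	require.Nil(t, dropped)
}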
