Skip to content

Commit ce09b2c

Browse files
new logic to remove consecutive newlines
1 parent 54dfab5 commit ce09b2c

File tree

6 files changed

+126
-157
lines changed

6 files changed

+126
-157
lines changed

‎internal/textutils/consecutive_newlines.go‎

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,51 @@ package textutils
22

33
import (
44
"unicode/utf8"
5-
6-
"github.com/JohannesKaufmann/html-to-markdown/v2/marker"
75
)
86

7+
// TrimConsecutiveNewlines collapses every run of three or more newlines in
// input down to two, so that at most one blank line separates two pieces of
// text.
//
// Spaces do not interrupt a run of newlines: a line consisting only of spaces
// still counts towards the run. Spaces directly in front of a newline that is
// kept are kept as well (trailing spaces can form a markdown hard line break),
// while spaces in front of a dropped newline are dropped together with it.
// Every other rune ends the run and is copied through unchanged; bytes that
// are not valid UTF-8 are copied through one at a time.
//
// input is never modified. The returned slice is nil when nothing was written
// to it, e.g. for empty input.
func TrimConsecutiveNewlines(input []byte) []byte {
	var result []byte

	// The spaces seen since the last non-space rune always form one contiguous
	// run, input[pendingStart:i]. Remembering where that run starts lets us
	// flush or discard it without first copying it into a side buffer.
	pendingStart := 0
	newlineCount := 0

	for i := 0; i < len(input); {
		r, size := utf8.DecodeRune(input[i:])
		next := i + size

		switch r {
		case ' ':
			// Leave the space pending: whether it is kept depends on what
			// follows it.
		case '\n':
			newlineCount++
			if newlineCount <= 2 {
				// Keep the newline together with the spaces in front of it.
				result = append(result, input[pendingStart:next]...)
			}
			// From the third newline on, the newline and the spaces in front
			// of it are skipped.
			pendingStart = next
		default:
			// Regular content ends the run of newlines; emit it together with
			// any spaces in front of it.
			newlineCount = 0
			result = append(result, input[pendingStart:next]...)
			pendingStart = next
		}

		i = next
	}

	// Spaces at the very end were never followed by anything that could have
	// flushed them, so append them now.
	return append(result, input[pendingStart:]...)
}
45+
46+
/*
947
func TrimConsecutiveNewlines(source []byte) []byte {
1048
// Some performance optimizations:
11-
// - If no replacement was done, we return the original slice and dont allocate.
49+
// - If no replacement was done, we return the original slice and don't allocate.
1250
// - We batch appends
1351
1452
var ret []byte
@@ -22,7 +60,7 @@ func TrimConsecutiveNewlines(source []byte) []byte {
2260
r, size := utf8.DecodeRune(source[i:])
2361
_ = size
2462
25-
isNewline := r == '\n' || r == marker.MarkerLineBreak
63+
isNewline := r == '\n' // || r == marker.MarkerLineBreak
2664
if isNewline {
2765
count += 1
2866
}
@@ -82,9 +120,10 @@ func TrimConsecutiveNewlines(source []byte) []byte {
82120
}
83121
84122
if ret == nil {
85-
// Huray, we did not do any allocations with make()
123+
// Hurray, we did not do any allocations with make()
86124
// and instead just return the original slice.
87125
return source
88126
}
89127
return ret
90128
}
129+
*/

‎internal/textutils/consecutive_newlines_test.go‎

Lines changed: 82 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -6,107 +6,69 @@ import (
66
)
77

88
func TestTrimConsecutiveNewlines(t *testing.T) {
9-
runs := []struct {
10-
desc string
11-
input []byte
12-
expected []byte
9+
tests := []struct {
10+
name string
11+
input string
12+
expected string
1313
}{
14-
{
15-
desc: "empty",
16-
input: []byte(""),
17-
expected: []byte(""),
18-
},
19-
{
20-
desc: "not needed",
21-
input: []byte("normal text"),
22-
expected: []byte("normal text"),
23-
},
24-
{
25-
desc: "also not needed",
26-
input: []byte("normal\n\ntext"),
27-
expected: []byte("normal\n\ntext"),
28-
},
14+
{"empty string", "", ""},
15+
{"single char", "a", "a"},
16+
{"simple text", "hello", "hello"},
17+
{"normal text without newlines", "hello this is a normal text", "hello this is a normal text"},
2918

30-
{
31-
desc: "just two newlines",
32-
input: []byte("\n\n"),
33-
expected: []byte("\n\n"),
34-
},
35-
{
36-
desc: "just three newlines",
37-
input: []byte("\n\n\n"),
38-
expected: []byte("\n\n"),
39-
},
40-
{
41-
desc: "just four newlines",
42-
input: []byte("\n\n\n\n"),
43-
expected: []byte("\n\n"),
44-
},
19+
// Single newline cases
20+
{"single newline", "a\nb", "a\nb"},
21+
{"single newline with spaces", "a \nb", "a \nb"},
22+
{"spaces after newline", "a\n b", "a\n b"},
4523

46-
{
47-
desc: "newlines before",
48-
input: []byte("\n\n\ntext"),
49-
expected: []byte("\n\ntext"),
50-
},
51-
{
52-
desc: "newlines after",
53-
input: []byte("text\n\n\n"),
54-
expected: []byte("text\n\n"),
55-
},
56-
{
57-
desc: "newlines before and after",
58-
input: []byte("\n\n\ntext\n\n\n"),
59-
expected: []byte("\n\ntext\n\n"),
60-
},
61-
{
62-
desc: "newlines between",
63-
input: []byte("before\n\n\nafter"),
64-
expected: []byte("before\n\nafter"),
65-
},
66-
{
67-
desc: "newlines between multiple times",
68-
input: []byte("1\n\n\n2\n\n\n3"),
69-
expected: []byte("1\n\n2\n\n3"),
70-
},
24+
// Double newline cases
25+
{"double newline", "a\n\nb", "a\n\nb"},
26+
{"double newline with spaces", "a \n\nb", "a \n\nb"},
27+
{"spaces between newlines", "a\n \nb", "a\n \nb"},
28+
{"spaces after double newline", "a\n\n b", "a\n\n b"},
7129

72-
{
73-
desc: "not needed the first time",
74-
input: []byte("abc\n\nabc\n\n\nabc"),
75-
expected: []byte("abc\n\nabc\n\nabc"),
76-
},
77-
{
78-
desc: "not needed the second time",
79-
input: []byte("abc\n\n\nabc\n\nabc"),
80-
expected: []byte("abc\n\nabc\n\nabc"),
81-
},
30+
// Triple+ newline cases
31+
{"triple newline", "a\n\n\nb", "a\n\nb"},
32+
{"quad newline", "a\n\n\n\nb", "a\n\nb"},
33+
{"triple newline with spaces", "a \n\n\nb", "a \n\nb"},
8234

83-
{
84-
desc: "with special characters",
85-
input: []byte("äöü\n\n\näöü"),
86-
expected: []byte("äöü\n\näöü"),
87-
},
88-
{
89-
desc: "space at end",
90-
input: []byte("a\n\n\nb "),
91-
expected: []byte("a\n\nb "),
92-
},
93-
{
94-
desc: "one newline at end",
95-
input: []byte("a\n\n\nb\n"),
96-
expected: []byte("a\n\nb\n"),
97-
},
98-
{
99-
desc: "two newlines at end",
100-
input: []byte("a\n\n\nb\n\n"),
101-
expected: []byte("a\n\nb\n\n"),
102-
},
35+
// Multiple segment cases
36+
{"multiple segments", "a\n\nb\n\nc", "a\n\nb\n\nc"},
37+
{"multiple segments with spaces", "a \n\nb \n\nc", "a \n\nb \n\nc"},
38+
39+
// Spaces at end of line
40+
{"hard-line-break followed by text", "a \nb", "a \nb"},
41+
{"hard-line-break followed by newline", "a \n\nb", "a \n\nb"},
42+
43+
// Edge cases
44+
{"only newlines", "\n\n\n", "\n\n"},
45+
{"only spaces", " ", " "},
46+
47+
{"leading and trailing newlines", "\n\n\ntext\n\n\n", "\n\ntext\n\n"},
48+
{"newlines and spaces", " \n \n \n \n ", " \n \n "},
49+
50+
{"leading spaces", " a", " a"},
51+
{"leading newline 1", "\na", "\na"},
52+
{"leading newline 2", "\n\na", "\n\na"},
53+
{"leading newline 3", "\n\n\na", "\n\na"},
54+
55+
{"trailing spaces", "a ", "a "},
56+
{"trailing newline 1", "a\n", "a\n"},
57+
{"trailing newlines 2", "a\n\n", "a\n\n"},
58+
{"trailing newlines 3", "a\n\n\n", "a\n\n"},
59+
60+
// UTF-8 cases
61+
{"german special chars", "äöü\n\n\näöü", "äöü\n\näöü"},
62+
{"utf8 chars", "🌟\n\n\n🌟\n\n\n🌟", "🌟\n\n🌟\n\n🌟"},
10363
}
10464

105-
for _, run := range runs {
106-
t.Run(run.desc, func(t *testing.T) {
107-
output := TrimConsecutiveNewlines(run.input)
108-
if !bytes.Equal(output, run.expected) {
109-
t.Errorf("expected %q but got %q", string(run.expected), string(output))
65+
for _, tt := range tests {
66+
t.Run(tt.name, func(t *testing.T) {
67+
got := string(TrimConsecutiveNewlines([]byte(tt.input)))
68+
if got != tt.expected {
69+
t.Errorf("\ninput: %q\nexpected: %q\ngot: %q",
70+
tt.input, tt.expected, got,
71+
)
11072
}
11173
})
11274
}
@@ -115,31 +77,43 @@ func TestTrimConsecutiveNewlines(t *testing.T) {
11577
func TestTrimConsecutiveNewlines_Allocs(t *testing.T) {
11678
const N = 1000
11779

118-
avg := testing.AllocsPerRun(N, func() {
119-
input := []byte("abc")
120-
output := TrimConsecutiveNewlines(input)
121-
_ = output
122-
})
123-
if avg != 0 {
124-
t.Errorf("with no newlines there should be no allocations but got %f", avg)
125-
}
80+
var avg float64
81+
/*
82+
avg = testing.AllocsPerRun(N, func() {
83+
input := []byte("abc")
84+
output := TrimConsecutiveNewlines(input)
85+
_ = output
86+
})
87+
if avg != 0 {
88+
t.Errorf("with no newlines there should be no allocations but got %f", avg)
89+
}
90+
91+
avg = testing.AllocsPerRun(N, func() {
92+
input := []byte("abc\n\nabc")
93+
output := TrimConsecutiveNewlines(input)
94+
_ = output
95+
})
96+
if avg != 0 {
97+
t.Errorf("with only two newlines there should be no allocations but got %f", avg)
98+
}
99+
*/
126100

127101
avg = testing.AllocsPerRun(N, func() {
128-
input := []byte("abc\n\nabc")
102+
input := []byte("abc\n\n\nabc")
129103
output := TrimConsecutiveNewlines(input)
130104
_ = output
131105
})
132-
if avg != 0 {
133-
t.Errorf("with only two newlines there should be no allocations but got %f", avg)
106+
if avg != 1 {
107+
t.Errorf("with three newlines there should be 1 allocation but got %f", avg)
134108
}
135109

136110
avg = testing.AllocsPerRun(N, func() {
137-
input := []byte("abc\n\n\nabc")
111+
input := []byte("abc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc\n\n\n\n\n\nabc")
138112
output := TrimConsecutiveNewlines(input)
139113
_ = output
140114
})
141-
if avg != 1 {
142-
t.Errorf("with trhee newlines there should be 1 allocation but got %f", avg)
115+
if avg != 3 {
116+
t.Errorf("with many newlines there should be 3 allocation but got %f", avg)
143117
}
144118
}
145119

‎plugin/commonmark/testdata/GoldenFiles/blockquote.out.md‎

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,6 @@
1313

1414
> Start Line
1515
>
16-
>
17-
>
18-
>
19-
>
20-
>
21-
>
22-
>
2316
> End Line
2417
2518
<!--large blockquote-->

‎plugin/commonmark/testdata/GoldenFiles/link.out.md‎

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -143,39 +143,15 @@ before [content](/) after
143143

144144
[Start Line
145145
\
146-
\
147-
\
148-
\
149-
\
150-
\
151-
\
152-
\
153146
End Line](/)
154147

155148
<!--newlines inside link-->
156149

157-
158-
159-
160-
161150
[newlines around the link content](/)
162151

163-
164-
165-
166-
167152
<!--multiline link inside a list item-->
168153

169154
- [first text
170-
\
171-
\
172-
\
173-
\
174-
\
175-
\
176-
\
177-
\
178-
\
179155
\
180156
second text](/)
181157

‎plugin/commonmark/testdata/GoldenFiles/list.out.md‎

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,6 @@ text between
116116

117117
<!--THE END-->
118118

119-
120-
121119
- List 7
122120

123121
* * *
@@ -136,13 +134,6 @@ text between
136134

137135
- Start Line
138136

139-
140-
141-
142-
143-
144-
145-
146137
End Line
147138

148139
<!--------------------------------------

‎plugin/commonmark/testdata/GoldenFiles/metadata.out.md‎

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,4 @@ A &amp; B
2626

2727
Start Line
2828

29-
30-
31-
32-
3329
End Line

0 commit comments

Comments
 (0)