Skip to content

Commit dfdcbe0

Browse files
committed
Lazily calculate WordCount, ReadingTime and FuzzyWordCount
This avoids having to execute these expensive operations for sites not using these values.

This commit sums up a set of word-counting and auto-summary related performance improvements. The effect of these depends on which features your site uses, but a benchmark from 4 Hugo sites in the wild shows promise:

```
benchmark           old ns/op       new ns/op       delta
BenchmarkHugo-4     21293005843     20032857342     -5.92%

benchmark           old allocs      new allocs      delta
BenchmarkHugo-4     65290922        65186032        -0.16%

benchmark           old bytes       new bytes       delta
BenchmarkHugo-4     9771213416      9681866464      -0.91%
```
1 parent ea0e8c5 commit dfdcbe0

File tree

7 files changed

+96
-50
lines changed

7 files changed

+96
-50
lines changed

‎helpers/content.go‎

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,19 +138,28 @@ func StripHTML(s string) string {
138138
// Walk through the string removing all tags
139139
b := bp.GetBuffer()
140140
defer bp.PutBuffer(b)
141-
142-
inTag := false
141+
var inTag, isSpace, wasSpace bool
143142
for _, r := range s {
144-
switch r {
145-
case '<':
143+
if !inTag {
144+
isSpace = false
145+
}
146+
147+
switch {
148+
case r == '<':
146149
inTag = true
147-
case '>':
150+
case r == '>':
148151
inTag = false
152+
case unicode.IsSpace(r):
153+
isSpace = true
154+
fallthrough
149155
default:
150-
if !inTag {
156+
if !inTag && (!isSpace || (isSpace && !wasSpace)) {
151157
b.WriteRune(r)
152158
}
153159
}
160+
161+
wasSpace = isSpace
162+
154163
}
155164
return b.String()
156165
}

‎helpers/content_test.go‎

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,22 @@ func TestStripHTML(t *testing.T) {
3434
}
3535
data := []test{
3636
{"<h1>strip h1 tag <h1>", "strip h1 tag "},
37-
{"<p> strip p tag </p>", " strip p tag \n"},
37+
{"<p> strip p tag </p>", " strip p tag "},
3838
{"</br> strip br<br>", " strip br\n"},
3939
{"</br> strip br2<br />", " strip br2\n"},
4040
{"This <strong>is</strong> a\nnewline", "This is a newline"},
4141
{"No Tags", "No Tags"},
42+
{`<p>Summary Next Line.
43+
<figure >
44+
45+
<img src="/not/real" />
46+
47+
48+
</figure>
49+
.
50+
More text here.</p>
51+
52+
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
4253
}
4354
for i, d := range data {
4455
output := StripHTML(d.input)

‎hugolib/page.go‎

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,10 @@ type Source struct {
107107
source.File
108108
}
109109
type PageMeta struct {
110-
WordCount int
111-
FuzzyWordCount int
112-
ReadingTime int
110+
wordCount int
111+
fuzzyWordCount int
112+
readingTime int
113+
pageMetaInit sync.Once
113114
Weight int
114115
}
115116

@@ -485,28 +486,48 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
485486
return int64(len(p.rawContent)), nil
486487
}
487488

489+
func (p *Page) WordCount() int {
490+
p.analyzePage()
491+
return p.wordCount
492+
}
493+
494+
func (p *Page) ReadingTime() int {
495+
p.analyzePage()
496+
return p.readingTime
497+
}
498+
499+
func (p *Page) FuzzyWordCount() int {
500+
p.analyzePage()
501+
return p.fuzzyWordCount
502+
}
503+
488504
func (p *Page) analyzePage() {
489-
if p.isCJKLanguage {
490-
p.WordCount = 0
491-
for _, word := range p.PlainWords() {
492-
runeCount := utf8.RuneCountInString(word)
493-
if len(word) == runeCount {
494-
p.WordCount++
495-
} else {
496-
p.WordCount += runeCount
505+
p.pageMetaInit.Do(func() {
506+
if p.isCJKLanguage {
507+
p.wordCount = 0
508+
for _, word := range p.PlainWords() {
509+
runeCount := utf8.RuneCountInString(word)
510+
if len(word) == runeCount {
511+
p.wordCount++
512+
} else {
513+
p.wordCount += runeCount
514+
}
497515
}
516+
} else {
517+
p.wordCount = helpers.TotalWords(p.Plain())
498518
}
499-
} else {
500-
p.WordCount = len(p.PlainWords())
501-
}
502519

503-
p.FuzzyWordCount = (p.WordCount + 100) / 100 * 100
520+
// TODO(bep) fuzzyWordCount is set directly in a test, so only compute it when it is still zero. Fix that.
521+
if p.fuzzyWordCount == 0 {
522+
p.fuzzyWordCount = (p.wordCount + 100) / 100 * 100
523+
}
504524

505-
if p.isCJKLanguage {
506-
p.ReadingTime = (p.WordCount + 500) / 501
507-
} else {
508-
p.ReadingTime = (p.WordCount + 212) / 213
509-
}
525+
if p.isCJKLanguage {
526+
p.readingTime = (p.wordCount + 500) / 501
527+
} else {
528+
p.readingTime = (p.wordCount + 212) / 213
529+
}
530+
})
510531
}
511532

512533
func (p *Page) permalink() (*url.URL, error) {

‎hugolib/pageSort_test.go‎

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ func TestLimit(t *testing.T) {
9595

9696
func TestPageSortReverse(t *testing.T) {
9797
p1 := createSortTestPages(10)
98-
assert.Equal(t, 0, p1[0].FuzzyWordCount)
99-
assert.Equal(t, 9, p1[9].FuzzyWordCount)
98+
assert.Equal(t, 0, p1[0].fuzzyWordCount)
99+
assert.Equal(t, 9, p1[9].fuzzyWordCount)
100100
p2 := p1.Reverse()
101-
assert.Equal(t, 9, p2[0].FuzzyWordCount)
102-
assert.Equal(t, 0, p2[9].FuzzyWordCount)
101+
assert.Equal(t, 9, p2[0].fuzzyWordCount)
102+
assert.Equal(t, 0, p2[9].fuzzyWordCount)
103103
// cached
104104
assert.True(t, probablyEqualPages(p2, p1.Reverse()))
105105
}
@@ -149,7 +149,7 @@ func createSortTestPages(num int) Pages {
149149
if i%2 == 0 {
150150
w = 10
151151
}
152-
pages[i].FuzzyWordCount = i
152+
pages[i].fuzzyWordCount = i
153153
pages[i].Weight = w
154154
pages[i].Description = "initial"
155155
}

‎hugolib/page_test.go‎

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -504,10 +504,13 @@ func checkPageContent(t *testing.T, page *Page, content string, msg ...interface
504504
}
505505

506506
func normalizeContent(c string) string {
507-
norm := strings.Replace(c, "\n", "", -1)
507+
norm := c
508+
norm = strings.Replace(norm, "\n", " ", -1)
508509
norm = strings.Replace(norm, " ", " ", -1)
509510
norm = strings.Replace(norm, " ", " ", -1)
510511
norm = strings.Replace(norm, " ", " ", -1)
512+
norm = strings.Replace(norm, "p> ", "p>", -1)
513+
norm = strings.Replace(norm, "> <", "> <", -1)
511514
return strings.TrimSpace(norm)
512515
}
513516

@@ -710,8 +713,8 @@ func TestPageWithShortCodeInSummary(t *testing.T) {
710713

711714
assertFunc := func(t *testing.T, ext string, p *Page) {
712715
checkPageTitle(t, p, "Simple")
713-
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. <figure > <img src=\"/not/real\" /> </figure>.\nMore text here.</p><p>Some more text</p>"), ext)
714-
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text", ext)
716+
checkPageContent(t, p, normalizeExpected(ext, "<p>Summary Next Line. \n<figure >\n \n <img src=\"/not/real\" />\n \n \n</figure>\n.\nMore text here.</p>\n\n<p>Some more text</p>\n"))
717+
checkPageSummary(t, p, "Summary Next Line. . More text here. Some more text")
715718
checkPageType(t, p, "page")
716719
checkPageLayout(t, p, "page/single.html", "_default/single.html", "theme/page/single.html", "theme/_default/single.html")
717720
}
@@ -793,7 +796,7 @@ func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
793796
testCommonResetState()
794797

795798
assertFunc := func(t *testing.T, ext string, p *Page) {
796-
if p.WordCount != 8 {
799+
if p.WordCount() != 8 {
797800
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 8, p.WordCount)
798801
}
799802
}
@@ -806,11 +809,10 @@ func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
806809
viper.Set("HasCJKLanguage", true)
807810

808811
assertFunc := func(t *testing.T, ext string, p *Page) {
809-
if p.WordCount != 15 {
812+
if p.WordCount() != 15 {
810813
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 15, p.WordCount)
811814
}
812815
}
813-
814816
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithAllCJKRunes)
815817
}
816818

@@ -820,15 +822,14 @@ func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
820822
viper.Set("HasCJKLanguage", true)
821823

822824
assertFunc := func(t *testing.T, ext string, p *Page) {
823-
if p.WordCount != 74 {
825+
if p.WordCount() != 74 {
824826
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
825827
}
826828

827829
if p.Summary != simplePageWithMainEnglishWithCJKRunesSummary {
828830
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
829831
simplePageWithMainEnglishWithCJKRunesSummary, p.Summary)
830832
}
831-
832833
}
833834

834835
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithMainEnglishWithCJKRunes)
@@ -839,15 +840,14 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
839840
viper.Set("HasCJKLanguage", true)
840841

841842
assertFunc := func(t *testing.T, ext string, p *Page) {
842-
if p.WordCount != 75 {
843+
if p.WordCount() != 75 {
843844
t.Fatalf("[%s] incorrect word count for content '%s'. expected %v, got %v", ext, p.plain, 74, p.WordCount)
844845
}
845846

846847
if p.Summary != simplePageWithIsCJKLanguageFalseSummary {
847848
t.Fatalf("[%s] incorrect Summary for content '%s'. expected %v, got %v", ext, p.plain,
848849
simplePageWithIsCJKLanguageFalseSummary, p.Summary)
849850
}
850-
851851
}
852852

853853
testAllMarkdownEnginesForPage(t, assertFunc, "simple", simplePageWithIsCJKLanguageFalse)
@@ -857,15 +857,15 @@ func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
857857
func TestWordCount(t *testing.T) {
858858

859859
assertFunc := func(t *testing.T, ext string, p *Page) {
860-
if p.WordCount != 483 {
860+
if p.WordCount() != 483 {
861861
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 483, p.WordCount)
862862
}
863863

864-
if p.FuzzyWordCount != 500 {
864+
if p.FuzzyWordCount() != 500 {
865865
t.Fatalf("[%s] incorrect word count. expected %v, got %v", ext, 500, p.WordCount)
866866
}
867867

868-
if p.ReadingTime != 3 {
868+
if p.ReadingTime() != 3 {
869869
t.Fatalf("[%s] incorrect min read. expected %v, got %v", ext, 3, p.ReadingTime)
870870
}
871871

‎hugolib/pagination_test.go‎

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func TestSplitPageGroups(t *testing.T) {
5555
// first group 10 in weight
5656
assert.Equal(t, 10, pg.Key)
5757
for _, p := range pg.Pages {
58-
assert.True(t, p.FuzzyWordCount%2 == 0) // magic test
58+
assert.True(t, p.fuzzyWordCount%2 == 0) // magic test
5959
}
6060
}
6161
} else {
@@ -70,7 +70,7 @@ func TestSplitPageGroups(t *testing.T) {
7070
// last should have 5 in weight
7171
assert.Equal(t, 5, pg.Key)
7272
for _, p := range pg.Pages {
73-
assert.True(t, p.FuzzyWordCount%2 != 0) // magic test
73+
assert.True(t, p.fuzzyWordCount%2 != 0) // magic test
7474
}
7575
}
7676
} else {
@@ -443,10 +443,10 @@ func TestPage(t *testing.T) {
443443
page21, _ := f2.page(1)
444444
page2Nil, _ := f2.page(3)
445445

446-
assert.Equal(t, 1, page11.FuzzyWordCount)
446+
assert.Equal(t, 3, page11.fuzzyWordCount)
447447
assert.Nil(t, page1Nil)
448448

449-
assert.Equal(t, 1, page21.FuzzyWordCount)
449+
assert.Equal(t, 3, page21.fuzzyWordCount)
450450
assert.Nil(t, page2Nil)
451451
}
452452

@@ -468,7 +468,7 @@ func createTestPages(num int) Pages {
468468
if i%2 == 0 {
469469
w = 10
470470
}
471-
pages[i].FuzzyWordCount = i
471+
pages[i].fuzzyWordCount = i + 2
472472
pages[i].Weight = w
473473
}
474474

‎hugolib/site_test.go‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ import (
3333
"github.com/stretchr/testify/require"
3434
)
3535

36+
func init() {
37+
// There is expected ERROR logging in tests that produces a lot of noise.
38+
jww.SetStdoutThreshold(jww.LevelCritical)
39+
}
40+
3641
const (
3742
pageSimpleTitle = `---
3843
title: simple template

0 commit comments

Comments
 (0)