Skip to content

Commit 9bbe068

Browse files
committed
add find top 10 word
1 parent a401bfa commit 9bbe068

1 file changed

Lines changed: 25 additions & 0 deletions

File tree

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
"""Print the ten most frequent word stems found in an HTML document.

The script reads ``../data/index.html``, extracts its visible text,
tokenizes it, lowercases every token, drops English stop words and
non-alphanumeric tokens, stems what is left with the Lancaster stemmer,
and prints the ten most common stems with their counts.

NOTE: ``nltk.word_tokenize`` and ``stopwords.words`` rely on NLTK data
packages (tokenizer models and the ``stopwords`` corpus) being installed.
"""
from collections import Counter

from bs4 import BeautifulSoup
import nltk
from nltk import LancasterStemmer
from nltk.corpus import stopwords

# Create the stemmer used to reduce each word to its stem.
ls = LancasterStemmer()

# Read the file and build the soup.
# The file is opened in binary mode so BeautifulSoup can detect the
# document's encoding itself instead of depending on the platform's default
# text encoding. The parser is named explicitly so the result does not vary
# with whichever optional parsers happen to be installed.
with open("../data/index.html", "rb") as infile:
    soup = BeautifulSoup(infile, "html.parser")

# Extract the text and tokenize it.
words = nltk.word_tokenize(soup.text)

# Convert to lowercase so the stop-word comparison is case-insensitive.
words = [w.lower() for w in words]

# Load the stop-word list once, as a set: calling stopwords.words() inside
# the comprehension would reload and linearly scan the list for every token.
stop_words = set(stopwords.words("english"))

# Remove stop words and non-alphanumeric tokens, then stem the remainder.
words = [ls.stem(w) for w in words if w not in stop_words and w.isalnum()]

# Count the stems and report the ten most frequent ones.
freqs = Counter(words)
print(freqs.most_common(10))

0 commit comments

Comments
 (0)