1 parent a401bfa commit 9bbe068Copy full SHA for 9bbe068
1 file changed
chapter3/Unit16/Unit16_FindTop10StopWord.py
@@ -0,0 +1,25 @@
# Print the ten most frequent word stems found in an HTML page, ignoring
# English stop words and tokens that are not purely alphanumeric.
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer
import nltk

# Create a stemmer.
ls = LancasterStemmer()

# Build the stop-word set once. Calling stopwords.words() inside the
# comprehension below would re-evaluate it for every token and test
# membership against a list instead of a set.
stop_words = set(stopwords.words("english"))

# Read the file and build the soup. The file is opened in binary mode so
# Beautiful Soup detects the document encoding itself instead of relying on
# the platform default; the parser is named explicitly so the result does not
# depend on which optional parsers happen to be installed.
with open("../data/index.html", "rb") as infile:
    soup = BeautifulSoup(infile, "html.parser")

# Extract the text and tokenize it.
words = nltk.word_tokenize(soup.text)

# Convert to lowercase.
words = [w.lower() for w in words]

# Drop stop words and non-alphanumeric tokens, then stem what remains.
words = [ls.stem(w) for w in words if w.isalnum() and w not in stop_words]

# Count the stems and report the ten most common.
freqs = Counter(words)
print(freqs.most_common(10))
0 commit comments