Skip to content

Commit e2a53be

Browse files
author
morvanzhou
committed
add tfidf visual
1 parent 9d97f00 commit e2a53be

3 files changed

Lines changed: 17 additions & 1 deletion

File tree

‎tf_idf.py‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
from collections import Counter
33
import itertools
4+
from visual import show_tfidf
45

56
docs = [
67
"it is a good day, I like to stay here",
@@ -133,4 +134,6 @@ def get_keywords(n=2):
133134
q = "I get a coffee cup"
134135
scores = docs_score(q)
135136
d_ids = scores.argsort()[-3:][::-1]
136-
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
137+
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
138+
139+
show_tfidf(tf_idf.T, [i2v[i] for i in range(len(i2v))], "tfidf_matrix")

‎tf_idf_sklearn.py‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from sklearn.feature_extraction.text import TfidfVectorizer
22
from sklearn.metrics.pairwise import cosine_similarity
3+
from visual import show_tfidf
34

45

56
docs = [
@@ -32,3 +33,6 @@
3233
res = res.ravel().argsort()[-3:]
3334
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))
3435

36+
37+
i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
38+
show_tfidf(tf_idf.todense(), [i2v[i] for i in range(len(i2v))], "tfidf_sklearn_matrix")

‎visual.py‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66
import utils
77

88

9+
def show_tfidf(tfidf, vocb, filename):
    """Draw a TF-IDF matrix as a heatmap, save it as a PNG and display it.

    Args:
        tfidf: 2-D array-like of TF-IDF weights. Each column is labelled
            with the matching entry of ``vocb``; rows are numbered from 1
            (presumably one row per document -- confirm against callers).
        vocb: Sequence of x-axis labels, one per column of ``tfidf``.
        filename: Base name (without extension) of the image written to
            ``./visual/results/``.

    Note:
        The output directory is not created here; it has to exist already.
    """
    # Layout is [n_rows, n_vocab]: columns are the vocabulary axis.
    # np.asarray turns an np.matrix (e.g. from a sparse ``.todense()``)
    # into a plain ndarray before plotting.
    tfidf = np.asarray(tfidf)
    n_rows, n_cols = tfidf.shape

    plt.imshow(tfidf, cmap="YlGn", vmin=tfidf.min(), vmax=tfidf.max())
    plt.xticks(np.arange(n_cols), vocb, fontsize=6, rotation=90)
    # One label per row: the label count must match the tick count, so it is
    # derived from n_rows (it was previously derived from the column count,
    # which breaks for any non-square matrix).
    plt.yticks(np.arange(n_rows), np.arange(1, n_rows + 1), fontsize=6)
    plt.tight_layout()
    plt.savefig("./visual/results/%s.png" % filename, format="png", dpi=500)
    plt.show()
17+
918
def show_w2v_word_embedding(model, data: utils.Dataset, path):
1019
word_emb = model.embeddings.get_weights()[0]
1120
for i in range(data.num_word):

0 commit comments

Comments
 (0)