jieba 分词西游记

发布时间：2023-12-17 22:07:26　作者：过过过过过过


import jieba

# Every alias the novel uses for a main character, mapped to one canonical
# name so that all mentions of that character are tallied together.
ALIASES = {
    "大圣": "孙悟空",
    "老孙": "孙悟空",
    "行者": "孙悟空",
    "孙大圣": "孙悟空",
    "孙行者": "孙悟空",
    "猴王": "孙悟空",
    "悟空": "孙悟空",
    "齐天大圣": "孙悟空",
    "猴子": "孙悟空",
    "师父": "唐僧",
    "三藏": "唐僧",
    "圣僧": "唐僧",
    "呆子": "猪八戒",
    "八戒": "猪八戒",
    "老猪": "猪八戒",
    "沙和尚": "沙僧",
    "妖精": "妖怪",
    "妖魔": "妖怪",
    "妖道": "妖怪",
    "佛祖": "如来",
    "三太子": "白马",
}

# How many of the most frequent words the script prints.
TOP_N = 20


def count_words(words):
    """Count word frequencies, merging character aliases.

    Args:
        words: Iterable of tokens (strings), e.g. the output of
            ``jieba.lcut``.

    Returns:
        A dict mapping each word (or the canonical name from ``ALIASES``)
        to its number of occurrences. Tokens of length 1 are skipped.
    """
    counts = {}
    for word in words:
        # Single-character tokens are ignored, as in the original script.
        if len(word) == 1:
            continue
        name = ALIASES.get(word, word)
        counts[name] = counts.get(name, 0) + 1
    return counts


def top_items(counts, limit=TOP_N):
    """Return up to ``limit`` ``(word, count)`` pairs, most frequent first.

    The sort is stable, so words with equal counts keep the order in which
    they were first seen. Slicing (rather than indexing ``range(limit)``)
    keeps short texts with fewer than ``limit`` distinct words from raising
    ``IndexError``.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Tokenise 西游记.txt with jieba and print the most frequent words."""
    # The context manager closes the file even if reading fails.
    with open("西游记.txt", "r", encoding='utf-8') as book:
        txt = book.read()
    words = jieba.lcut(txt)
    for word, count in top_items(count_words(words)):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()

 

仅扫描第一章的运行结果（整本书太大了）：

孙悟空 119
祖师 54
石猴 49
一个 22
什么 21
唐僧 21
弟子 18
老猴 15
神仙 14
我们 13
起来 13
大王 13
怎么 12
哪里 11
问道 10
地上 10
高兴 10
知道 9
本事 9
妖怪 9