Word frequency statistics for 西游记 (Journey to the West) with jieba

Published: 2023-12-29 10:44:52  Author: 苏将复何及

import jieba

Exclude words that are not character names

excludes = {"一个", "那里", "怎么", "我们", "不知", "和尚", "妖精", "两个", "甚么", "不是",
            "只见", "国王", "徒弟", "呆子", "如何", "这个", "大王", "原来", "不敢", "不曾",
            "闻言", "正是", "只是", "那怪", "出来", "一声", "真个", "小妖"}

# Read the novel; the source file is encoded in GB18030.
with open("西游记.txt", "r", encoding="gb18030") as f:
    txt = f.read()

Segment the text into words

words = jieba.lcut(txt)   # precise-mode segmentation, returns a list of tokens
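To see what jieba.lcut returns, here is a small, self-contained sketch on a made-up sentence (the sentence and the shown split are only illustrative; the exact tokens depend on jieba's dictionary and version):

import jieba

sample = "行者保护唐僧前往西天取经"   # illustrative sentence, not taken from 西游记.txt
print(jieba.lcut(sample))
# Expected shape: a list of strings, e.g. ['行者', '保护', '唐僧', '前往', '西天', '取经']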

Build a dictionary of word counts

counts = {}
for word in words:
    if len(word) == 1:
        # Skip single-character tokens; they are rarely names.
        continue
    elif word == "老孙" or word == "大圣" or word == "悟空":
        rword = "行者"          # merge aliases of Sun Wukong into 行者
    elif word == "师父" or word == "三藏" or word == "长老":
        rword = "唐僧"          # merge aliases of Tang Sanzang into 唐僧
    else:
        rword = word
    counts[rword] = counts.get(rword, 0) + 1
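The chain of elif branches above gets the job done; as a side note, the same alias merging can also be written with a lookup dictionary. This is only a sketch of an equivalent formulation (the aliases name is mine, not from the original post):

# Map each alias to its canonical name; unlisted words map to themselves.
aliases = {"老孙": "行者", "大圣": "行者", "悟空": "行者",
           "师父": "唐僧", "三藏": "唐僧", "长老": "唐僧"}

counts = {}
for word in words:
    if len(word) == 1:                      # skip single-character tokens
        continue
    rword = aliases.get(word, word)         # merge aliases, keep other words as-is
    counts[rword] = counts.get(rword, 0) + 1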

Remove the excluded words from the counts

for word in excludes:
    counts.pop(word, None)   # pop with a default avoids KeyError if a word never appeared
items = list(counts.items())

Sort from largest to smallest count

items.sort(key=lambda x: x[1], reverse=True)
for i in range(10):
    word, count = items[i]
    print("{0:<10}{1:>5}".format(word, count))