jieba分词--西游记

发布时间 2023-12-17 21:46:42 作者: 张舒情


import jieba
# Frequent words that are not character names; they are dropped from the
# ranking so that names are not crowded out of the top results.
excludes = {
    "一个", "那里", "怎么", "我们", "不知", "两个", "甚么", "不是", "只见", "原来",
    "如何", "这个", "不曾", "不敢", "闻言", "正是", "只是", "那怪", "出来", "一声",
    "真个", "不得", "这里", "今日", "那个", "取经", "却说", "如今", "三个", "这般",
    "就是", "不见", "铁棒", "认得", "不能", "不要", "果然", "上前", "有些", "性命",
}

# Alternative names and titles, mapped to one canonical name so that all
# mentions of a character are counted together.
aliases = {
    "师父": "唐僧",
    # 三藏 is Tang Sanzang, i.e. 唐僧; it was previously merged into 沙僧.
    "三藏": "唐僧",
    "老孙": "悟空",
    "大圣": "悟空",
    "孙行者": "悟空",
    "孙大圣": "悟空",
}


def count_words(words, excluded=None, alias_map=None):
    """Count occurrences of multi-character words, merging character aliases.

    Args:
        words: iterable of segmented tokens (strings).
        excluded: collection of canonical words to leave out of the result;
            defaults to the module-level ``excludes``.
        alias_map: mapping from alias to canonical name; defaults to the
            module-level ``aliases``.

    Returns:
        dict mapping each kept word to its number of occurrences, in order of
        first appearance.
    """
    if excluded is None:
        excluded = excludes
    if alias_map is None:
        alias_map = aliases

    counts = {}
    for word in words:
        # Single-character tokens are skipped entirely.
        if len(word) == 1:
            continue
        name = alias_map.get(word, word)
        # Filtering here (rather than deleting afterwards) cannot fail when an
        # excluded word never occurs in the text.
        if name in excluded:
            continue
        counts[name] = counts.get(name, 0) + 1
    return counts


def top_items(counts, limit=20):
    """Return up to ``limit`` ``(word, count)`` pairs, most frequent first.

    Ties keep the order in which the words were first inserted into
    ``counts``. Fewer than ``limit`` pairs are returned when ``counts`` is
    smaller than that.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main(path="西游记.txt", limit=20):
    """Segment the text at ``path`` with jieba and print the top words."""
    with open(path, "r", encoding="utf-8") as source:
        txt = source.read()
    counts = count_words(jieba.lcut(txt))
    for word, count in top_items(counts, limit):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()