jieba库西游记分词(13号)

发布时间 2023-12-19 20:22:31 作者: huang123456789
from collections import Counter

import jieba
def merge_synonyms(words, synonyms):
    """Replace each word with the canonical form of its synonym group.

    Args:
        words: Iterable of hashable tokens (strings in practice).
        synonyms: Iterable of sequences; the first element of each sequence
            is the canonical form that every member of that sequence is
            replaced with.

    Returns:
        A new list, in the same order as ``words``, where every word that
        appears in a synonym group is replaced by that group's first
        element. Words not found in any group are kept unchanged.
    """
    # Build the lookup table once so each word costs a single dict lookup
    # instead of a scan over every synonym group.
    canonical = {}
    for group in synonyms:
        for member in group:
            # setdefault keeps the mapping from the earliest group when a
            # word is listed in more than one group (first match wins).
            canonical.setdefault(member, group[0])
    return [canonical.get(word, word) for word in words]
def remove_stopwords(words, stopwords):
    """Return the words that are not stopwords, preserving order.

    Args:
        words: Iterable of hashable tokens (strings in practice).
        stopwords: Collection of tokens to drop. Any iterable is accepted,
            including one-shot iterators.

    Returns:
        A new list containing every element of ``words`` that is not in
        ``stopwords``.
    """
    if isinstance(stopwords, (set, frozenset, dict, str)):
        # Sets and dicts already give constant-time membership; a str is
        # left alone so its substring-style ``in`` semantics are unchanged.
        lookup = stopwords
    else:
        # Hash once up front: avoids a linear scan of the list for every
        # word and stops a one-shot iterator from being exhausted by the
        # first membership test.
        lookup = frozenset(stopwords)
    return [word for word in words if word not in lookup]

def main(path="E:\\示例文本文件\\西游记.txt", top_n=20):
    """Print the most frequent multi-character words of a text file.

    The file is segmented with ``jieba.cut`` in precise mode
    (``cut_all=False``), synonyms are merged onto their canonical form,
    stopwords are removed, tokens of length 1 or less are discarded, and
    the ``top_n`` most frequent remaining words are printed one per line
    as ``"<word> <count>"``.

    Args:
        path: Path of the text file to analyse. Defaults to the location
            the script was originally written for.
        top_n: Maximum number of words to print. Defaults to 20.
    """
    # NOTE(review): errors='ignore' silently drops any bytes the GB2312
    # codec cannot decode, so characters outside GB2312 vanish from the
    # text before segmentation. Confirm the file's real encoding.
    with open(path, "r", encoding='GB2312', errors='ignore') as file:
        text = file.read()

    synonyms = [
        ['孙悟空', '孙猴子'],
    ]
    # NOTE(review): every entry below is the empty string, which the
    # length filter further down would discard anyway, so this list has no
    # effect on the output as written. Presumably the real stopwords were
    # lost somewhere along the way; fill them in.
    stopwords = [
        '', '', '', '', '', '', '', '', '', '', '', ''
    ]

    # jieba.cut is consumed lazily; merge_synonyms iterates it exactly once,
    # so there is no need to materialise the token list first.
    merged_words = merge_synonyms(jieba.cut(text, cut_all=False), synonyms)
    filtered_words = remove_stopwords(merged_words, stopwords)

    # Single characters are ignored in the frequency count.
    word_freq = Counter(word for word in filtered_words if len(word) > 1)

    # most_common sorts by descending count and keeps first-seen order for
    # ties, and returns fewer than top_n items when not enough words exist.
    for word, count in word_freq.most_common(top_n):
        print("{} {}".format(word, count))
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()