python123——西游记相关的分词,出现次数最高的20个

发布时间 2023-12-13 09:18:11作者: 奇诺qwq

 

 

 

# Count how often each character of "Journey to the West" appears in the
# novel (filtering out articles, pronouns and other noise words) and print
# the 20 most frequent entries in descending order (textbook p.173).
import jieba

# Frequent multi-character words that are not character names; they are
# dropped from the tally before ranking.
excludes = {
    "一个", "那里", "怎么", "我们", "不知", "两个", "甚么", "不是", "只见", "原来",
    "如何", "这个", "不曾", "不敢", "闻言", "正是", "只是", "那怪", "出来", "一声",
    "真个", "不得", "这里", "今日", "那个", "取经", "却说", "如今", "三个", "这般",
    "就是", "不见", "铁棒", "认得", "不能", "不要", "果然", "上前", "有些", "性命",
}

# Alternative names mapped to the canonical name they are counted under.
# Words not listed here are counted under their own spelling.
aliases = {
    "师父": "唐僧",
    # 三藏 is Tang Sanzang, i.e. the monk himself, so it belongs to 唐僧
    # (the original script mistakenly merged it into 沙僧).
    "三藏": "唐僧",
    "老孙": "悟空",
    "大圣": "悟空",
    "孙行者": "悟空",
    "孙大圣": "悟空",
}

# Use a context manager so the file handle is closed deterministically.
with open("西游记.txt", "r", encoding="utf-8") as f:
    txt = f.read()

words = jieba.lcut(txt)

counts = {}
for word in words:
    # Single-character tokens are punctuation or particles, never a name.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

for word in excludes:
    # pop() with a default: a stop-word may simply not occur in the text,
    # which must not abort the script with a KeyError.
    counts.pop(word, None)

# sorted() is stable, so ties keep their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slicing tolerates texts that yield fewer than 20 distinct words.
for word, count in items[:20]:
    print("{0:<10}{1:>5}".format(word, count))

 

运行结果