Generating a word cloud (also called a text cloud) with WordCloud and jieba

Published: 2023-09-12 16:06:52  Author: 勋勋的大宝贝

The go-to Python library for generating word clouds is wordcloud. Install it with: pip install wordcloud
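For plain English text no segmentation is needed; wordcloud tokenizes and counts words on its own. A minimal sketch (the sample string and output filename are made up for illustration):

from wordcloud import WordCloud

text = "python wordcloud jieba data data analysis analysis analysis"
wc = WordCloud(background_color='white').generate(text)  # tokenize + count + layout in one call
wc.to_file('english_cloud.png')  # illustrative output filename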

wordcloud is built with English text in mind. To make a word cloud from Chinese text, the text must first be split into words, which is what the Chinese word segmentation library jieba is for. Install it with: pip install jieba
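To see what segmentation does, here is a quick sketch using the example sentence from jieba's own documentation:

import jieba

# lcut works like cut but returns a list instead of a generator
print(jieba.lcut("我来到北京清华大学"))   # ['我', '来到', '北京', '清华大学']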

 
# Purpose: generate a word cloud image

from wordcloud import WordCloud
import jieba
from collections import Counter
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt


"""
word.txt里的内容为:
Python
C逆向
C++逆向
C++逆向
C++逆向
C逆向
网络爬虫
数据解析
"""
# Read the file contents
with open('word.txt', 'r', encoding='utf-8') as f:
    words = f.read()

# Add custom words to jieba's dictionary (otherwise some compound terms may be split apart)
jieba.add_word("网络爬虫")
jieba.add_word("JS逆向")
jieba.add_word("APP逆向")
jieba.add_word("C逆向")
jieba.add_word("C++逆向")
jieba.add_word("网络数据")

# Segment the text with jieba
words_list_jieba = jieba.lcut(words)    # ['Python', '\n', 'C逆向', '\n', 'C++逆向', '\n', 'C++逆向', '\n', 'C++逆向', '\n', 'C逆向', '\n', '网络爬虫', '\n', '数据', '解析']

# Define the set of words to exclude (whitespace, punctuation, and common stop words)
excluded_words = ["", "\n", " ", "/", "——", "·",
                  "小说", "一雄", "一部", "如何", "什么", "可以", "只是", "那么", "不是",
                  "就是", "一个", "没有", "我们", "你们", "他们", "如果", "然后", "因为",
                  "所以", "这些", "那些", "这个", "那个", "这样", "那样", "一些", "很多",
                  "非常", "可能", "一定", "一直", "经常", "不断", "不只", "不要", "不得",
                  "不能", "无法", "没法", "必须", "应该", "需要", "工作", "生活", "家庭",
                  "朋友", "感觉", "思考", "想法", "方法", "原因", "结果", "可能性", "比较",
                  "不同", "相同", "重要", "容易", "困难", "简单", "复杂", "正确", "错误"]

# Filter out the excluded words
words_list = [x for x in words_list_jieba if x not in excluded_words]   # ['Python', 'C逆向', 'C++逆向', 'C++逆向', 'C++逆向', 'C逆向', '网络爬虫', '数据', '解析']

# Count word frequencies with Counter
word_counter = Counter(words_list)  # Counter({'C++逆向': 3, 'C逆向': 2, 'Python': 1, '网络爬虫': 1, '数据': 1, '解析': 1})
sorted_file = word_counter.most_common()    # [('C++逆向', 3), ('C逆向', 2), ('Python', 1), ('网络爬虫', 1), ('数据', 1), ('解析', 1)]
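
# (Optional) For longer texts, keep only the top N words so the cloud stays
# readable, e.g. sorted_file = word_counter.most_common(100)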

# Load an image to use as the mask (white pixels are treated as masked out;
# words are only drawn on the non-white areas)
mask = np.array(Image.open("1.png"))  # mask image

# Generate the word cloud, applying the mask
wordcloud = WordCloud(
    font_path='simhei.ttf',     # a font with Chinese glyphs is required, e.g. SimHei
    background_color='white',
    mask=mask).generate_from_frequencies(dict(sorted_file))

# Save the word cloud image
wordcloud_image_path = 'wordcloud.png'
wordcloud.to_file(wordcloud_image_path)


# # Render to a PIL image
# image = wordcloud.to_image()
# # Display the image
# image.show()


# # Display the word cloud with matplotlib
# plt.figure(figsize=(10, 8))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()
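
For ordinary running text you could skip the Counter step entirely and call wordcloud's generate() on the space-joined tokens. Keep in mind that generate() re-tokenizes the string on non-word characters by default, so a term like "C++逆向" would be split at the "+" signs; that is exactly why the script above passes explicit frequencies through generate_from_frequencies(). A sketch of the shorter path (the output filename is illustrative):

# Shorter alternative for ordinary text; not ideal for this data set,
# since generate() would break 'C++逆向' apart
text = ' '.join(words_list)
wc_simple = WordCloud(font_path='simhei.ttf', background_color='white', mask=mask).generate(text)
wc_simple.to_file('wordcloud_generate.png')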