K均值聚类

发布时间: 2023-09-13 17:39:39 作者: 孙犯困
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Compute Euclidean distances
def Distance(dataSet, centroids, k) -> np.ndarray:
    """Return the Euclidean distance from every sample to every centroid.

    Args:
        dataSet: Sequence of samples, each a sequence of numeric coordinates.
        centroids: Sequence of centroid coordinates, one row per centroid,
            with the same number of coordinates as the samples.
        k: Number of centroids. Each sample is expanded to ``k`` rows before
            the centroids are subtracted, so ``k`` must equal (or broadcast
            against) the number of rows in ``centroids``.

    Returns:
        A 2-D array whose entry ``[i, j]`` is the distance between sample
        ``i`` and centroid ``j``. An empty ``dataSet`` yields an array of
        shape ``(0, k)`` so that reductions along ``axis=1`` still work.

    Raises:
        ValueError: If the expanded samples and ``centroids`` cannot be
            broadcast against each other.
    """
    points = np.asarray(dataSet)
    if points.size == 0:
        return np.empty((0, k))

    # One row per sample; this also lets scalar samples act as 1-D points.
    points = points.reshape(len(points), -1)
    n_samples, n_features = points.shape

    # Broadcasting replaces the per-sample np.tile loop:
    # (n_samples, k, n_features) - (k, n_features).
    expanded = np.broadcast_to(
        points[:, np.newaxis, :], (n_samples, k, n_features)
    )
    diff = expanded - np.asarray(centroids)
    return np.sum(diff ** 2, axis=2) ** 0.5


# Update the centroids
def Update_cen(dataSet, centroids, k):
    """Assign each sample to its nearest centroid and recompute the centroids.

    Args:
        dataSet: Sequence of samples, each a sequence of numeric coordinates.
        centroids: Current centroid coordinates, one row per centroid.
        k: Number of centroids, forwarded to ``Distance``.

    Returns:
        A tuple ``(changed, newCentroids)``. ``newCentroids`` is a float
        array holding, for each centroid, the mean of the samples assigned
        to it. ``changed`` is ``newCentroids - centroids``; it is all zeros
        once the centroids have stopped moving.
    """
    # Distance from every sample to every centroid (2-D array).
    distance = Distance(dataSet, centroids, k)

    # Index of the closest centroid for each sample (row-wise argmin).
    minIndex = np.argmin(distance, axis=1)
    print("输出最小值索引", minIndex)

    # Mean of the samples in each group; the group key is the centroid index.
    clusterMeans = pd.DataFrame(dataSet).groupby(minIndex).mean()

    # groupby only produces rows for centroids that received samples. Start
    # from the previous centroids so an empty cluster keeps its position and
    # the result stays aligned row-for-row with ``centroids``.
    newCentroids = np.array(centroids, dtype=float)
    newCentroids[clusterMeans.index.to_numpy()] = clusterMeans.to_numpy()

    # Per-coordinate movement of every centroid.
    changed = newCentroids - centroids
    return changed, newCentroids


# k-means implementation
def kmeans(dataSet, k, init_centroids=None):
    """Cluster ``dataSet`` into ``k`` groups with the k-means iteration.

    Args:
        dataSet: Indexable sequence of samples, each a sequence of numeric
            coordinates.
        k: Number of clusters.
        init_centroids: Optional starting centroids (exactly ``k`` rows).
            When omitted, the built-in seed points are used if ``k`` matches
            their count; otherwise ``k`` distinct samples are drawn at
            random from ``dataSet``.

    Returns:
        A tuple ``(centroids, cluster)``. ``centroids`` is the list of final
        centroid coordinates in sorted order; ``cluster[i]`` is the list of
        samples whose nearest centroid is ``centroids[i]``.

    Raises:
        ValueError: If ``init_centroids`` does not contain exactly ``k``
            rows, or if ``k`` samples cannot be drawn from ``dataSet``.
    """
    # (1) Choose the starting centroids.
    defaultSeeds = [[2, 10], [5, 8], [1, 2], [35, 4], [15, 1]]
    if init_centroids is not None:
        if len(init_centroids) != k:
            raise ValueError(
                "init_centroids must contain exactly k=%d rows, got %d"
                % (k, len(init_centroids))
            )
        centroids = init_centroids
    elif k == len(defaultSeeds):
        # Keep the fixed seeds so the default run stays reproducible.
        centroids = defaultSeeds
    else:
        centroids = random.sample(list(dataSet), k)
    print("质心:", centroids)

    # (2) Re-assign samples and move the centroids until nothing changes.
    changed, newCentroids = Update_cen(dataSet, centroids, k)
    while np.any(changed):
        changed, newCentroids = Update_cen(dataSet, newCentroids, k)
    centroids = sorted(newCentroids.tolist())

    # (3) Group the samples by their nearest final (sorted) centroid.
    dis = Distance(dataSet, centroids, k)
    minIndex = np.argmin(dis, axis=1)
    cluster = [[] for _ in range(k)]
    for i, j in enumerate(minIndex):
        cluster[j].append(dataSet[i])

    # Report the grouping once, after every sample has been assigned.
    print("集群样本")
    print(cluster)

    return centroids, cluster


# Build the sample data set
def createDataSet():
    """Return the 50 hard-coded 2-D sample points used by the demo.

    Every call builds a fresh list of ``[x, y]`` integer pairs.
    """
    samples = [
        [37, 17], [16, 35], [47, 15], [48, 33], [30, 40],
        [37, 11], [34, 16], [47, 12], [34, 18], [2, 18],
        [36, 32], [10, 39], [43, 13], [2, 50], [26, 28],
        [49, 3], [43, 15], [34, 7], [24, 38], [32, 21],
        [36, 42], [12, 35], [1, 14], [22, 20], [48, 12],
        [2, 10], [20, 9], [43, 4], [50, 25], [5, 10],
        [5, 20], [33, 32], [48, 44], [26, 27], [21, 44],
        [42, 6], [26, 36], [49, 1], [1, 36], [25, 16],
        [43, 10], [46, 6], [35, 26], [21, 40], [12, 42],
        [33, 21], [5, 21], [48, 9], [49, 41], [1, 2],
    ]
    return samples

if __name__ == '__main__':
    # Demo: cluster the sample data set into k groups and plot the result.
    dataset = createDataSet()
    k = 5
    centroids, cluster = kmeans(dataset, k)
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)
    color_all = ['c', 'g', 'r', 'm', 'y', 'k', 'b']

    # Final centroids as red crosses.
    centroidPoints = np.array(centroids)
    plt.scatter(centroidPoints[:, 0], centroidPoints[:, 1],
                marker='x', color='red', label="质心")

    # One colour per cluster; wrap around if k exceeds the palette size.
    for i in range(k):
        members = np.array(cluster[i])
        if members.size == 0:
            # An empty cluster has no coordinates to plot.
            continue
        plt.scatter(members[:, 0], members[:, 1], marker='o',
                    color=color_all[i % len(color_all)], label="数据集")

    plt.show()
# 肘部法确认K值?