import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Compute Euclidean distances
def Distance(dataSet, centroids, k) -> np.ndarray:
    """Return the Euclidean distance from every sample to every centroid.

    Args:
        dataSet: Sequence of samples; each sample is a sequence of numeric
            coordinates (a 2-D, row-per-sample layout).
        centroids: Sequence of centroids with the same number of coordinates
            as the samples.
        k: Number of centroids. Kept for backward compatibility; the number
            of columns in the result is taken from ``centroids`` itself, so
            a mismatching ``k`` no longer causes a broadcasting error.

    Returns:
        Array of shape ``(len(dataSet), len(centroids))`` where element
        ``[i, j]`` is the distance between sample ``i`` and centroid ``j``.
        An empty ``dataSet`` yields an array of shape ``(0, len(centroids))``.
    """
    centers = np.atleast_2d(np.asarray(centroids, dtype=float))
    points = np.asarray(dataSet, dtype=float)
    if points.size == 0:
        # Keep the column axis so callers can still reduce along axis=1.
        return np.empty((0, centers.shape[0]))
    # Broadcasting (n, 1, d) - (1, m, d) gives every sample/centroid pair at
    # once, replacing the per-sample np.tile loop.
    diff = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    return np.sqrt(np.sum(diff ** 2, axis=2))
# Update the centroids
def Update_cen(dataSet, centroids, k):
    """Assign each sample to its nearest centroid and recompute the centroids.

    Args:
        dataSet: Sequence of samples (one row of coordinates per sample).
        centroids: Current centroids, one row per centroid.
        k: Number of centroids; forwarded to ``Distance``.

    Returns:
        A ``(changed, newCentroids)`` tuple of float arrays with the same
        shape as ``centroids``. ``newCentroids`` holds the mean of the samples
        assigned to each centroid, and ``changed`` is
        ``newCentroids - centroids``. A centroid that attracts no samples
        keeps its previous position (its row in ``changed`` is zero).
    """
    old_centroids = np.asarray(centroids, dtype=float)
    # Distance from every sample to every centroid.
    distance = Distance(dataSet, old_centroids, k)
    # Index of the nearest centroid for each sample (minimum along each row).
    minIndex = np.argmin(distance, axis=1)
    print("输出最小值索引", minIndex)
    # Group the samples by nearest centroid; the group means are the new
    # centroids. The result is indexed by the centroid labels actually seen.
    means = pd.DataFrame(dataSet).groupby(minIndex).mean()
    # Start from the old centroids so that a centroid with no assigned samples
    # keeps its row; subtracting a (<k)-row result from the k-row centroids
    # would otherwise fail with a shape mismatch.
    newCentroids = old_centroids.copy()
    newCentroids[means.index.values] = means.values
    # Per-coordinate movement of each centroid.
    changed = newCentroids - old_centroids
    return changed, newCentroids
# k-means implementation
def kmeans(dataSet, k, init_centroids=None, max_iter=300):
    """Cluster ``dataSet`` into ``k`` groups with the k-means algorithm.

    Args:
        dataSet: Sequence of samples (one row of coordinates per sample).
        k: Number of clusters.
        init_centroids: Optional sequence of ``k`` starting centroids. When
            omitted, the script's fixed seed centroids are used if ``k == 5``
            (this preserves the script's existing output); for any other ``k``
            the seeds are drawn with ``random.sample(dataSet, k)``.
        max_iter: Upper bound on the number of centroid updates, so that data
            which never settles (e.g. containing NaN) cannot loop forever.

    Returns:
        A ``(centroids, cluster)`` tuple. ``centroids`` is the list of final
        centroids sorted in ascending order. ``cluster`` is a list of ``k``
        lists; ``cluster[i]`` holds the samples nearest to ``centroids[i]``.

    Raises:
        ValueError: If ``init_centroids`` does not contain exactly ``k``
            entries (``random.sample`` also raises it when ``k`` exceeds the
            number of samples).
    """
    # (1) Choose the k starting centroids.
    default_seeds = [[2, 10], [5, 8], [1, 2], [35, 4], [15, 1]]
    if init_centroids is not None:
        centroids = [list(c) for c in init_centroids]
        if len(centroids) != k:
            raise ValueError(
                "init_centroids must contain exactly k=%d centroids, got %d"
                % (k, len(centroids))
            )
    elif k == len(default_seeds):
        centroids = default_seeds
    else:
        # The fixed seeds only fit k == 5; sample from the data otherwise.
        centroids = random.sample(list(dataSet), k)
    print("质心:", centroids)

    # (2) Re-assign samples and move the centroids until they stop moving.
    changed, newCentroids = Update_cen(dataSet, centroids, k)
    iterations = 1
    while np.any(changed) and iterations < max_iter:
        changed, newCentroids = Update_cen(dataSet, newCentroids, k)
        iterations += 1
    centroids = sorted(newCentroids.tolist())

    # (3) Assign every sample to its nearest final centroid. The indices refer
    # to the sorted centroid list returned to the caller.
    dis = Distance(dataSet, centroids, k)
    minIndex = np.argmin(dis, axis=1)
    cluster = [[] for _ in range(k)]
    for sample, nearest in zip(dataSet, minIndex):
        cluster[nearest].append(sample)
    # Debug output of the second cluster; skipped when there is none (k == 1)
    # instead of raising IndexError.
    if len(cluster) > 1:
        print("集群样本")
        print(cluster[1])
    return centroids, cluster
# Create the data set
def createDataSet():
    """Return the sample data set: a new list of 50 two-dimensional points.

    Each point is a ``[x, y]`` list of integers. A fresh list is built on
    every call, so callers may modify the result freely.
    """
    return [
        [37, 17], [16, 35], [47, 15], [48, 33], [30, 40],
        [37, 11], [34, 16], [47, 12], [34, 18], [2, 18],
        [36, 32], [10, 39], [43, 13], [2, 50], [26, 28],
        [49, 3], [43, 15], [34, 7], [24, 38], [32, 21],
        [36, 42], [12, 35], [1, 14], [22, 20], [48, 12],
        [2, 10], [20, 9], [43, 4], [50, 25], [5, 10],
        [5, 20], [33, 32], [48, 44], [26, 27], [21, 44],
        [42, 6], [26, 36], [49, 1], [1, 36], [25, 16],
        [43, 10], [46, 6], [35, 26], [21, 40], [12, 42],
        [33, 21], [5, 21], [48, 9], [49, 41], [1, 2],
    ]
if __name__ == '__main__':
    # Cluster the sample data set into k groups, print the result and plot it.
    dataset = createDataSet()
    k = 5
    centroids, cluster = kmeans(dataset, k)
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)
    color_all = ['c', 'g', 'r', 'm', 'y', 'k', 'b']
    # Transpose so that row 0 holds the x values and row 1 the y values.
    centroid_xy = np.array(centroids).T
    plt.scatter(list(centroid_xy[0]), list(centroid_xy[1]), marker='x', color='red', label="质心")
    for i, members in enumerate(cluster):
        if not members:
            # Nothing to draw; np.array([]).T[0] would raise IndexError.
            continue
        member_xy = np.array(members).T
        # Cycle through the palette so more than len(color_all) clusters
        # cannot index past its end.
        plt.scatter(list(member_xy[0]), list(member_xy[1]), marker='o',
                    color=color_all[i % len(color_all)], label="数据集")
    plt.show()
# TODO: use the elbow method to choose the value of k?
# Title: K-means clustering
# Published: 2023-09-13 17:39:39 | Author: 孙犯困