机器学习——K近邻算法-kd(简化因数据过过多而造成的搜索复杂度大)

发布时间 2023-11-25 19:21:34作者: cojames

kd树是为了减少搜索最近邻点的时间复杂度,一般来说可以使用穷举法,但是太耗时,因此采用平衡二叉树的思想来解决这个问题

"""
This is the implementation of Knn(KdTree),
which is accessible in https://github.com/FlameCharmander/MachineLearning,
accomplished by FlameCharmander,
and my csdn blog is https://blog.csdn.net/tudaodiaozhale,
contact me via 13030880@qq.com.
"""
import numpy as np
class Node:
    def __init__(self, data, lchild = None, rchild = None):
        self.data = data
        self.lchild = lchild
        self.rchild = rchild
 
class KdTree:
    def __init__(self):
        self.kdTree = None
 
    def create(self, dataSet, depth):   #创建kd树,返回根结点
        if (len(dataSet) > 0):
            m, n = np.shape(dataSet)    #求出样本行,列
            midIndex = int(m / 2) #中间数的索引位置
            axis = depth % n    #判断以哪个轴划分数据
            sortedDataSet = self.sort(dataSet, axis) #进行排序
            node = Node(sortedDataSet[midIndex]) #将节点数据域设置为中位数,具体参考下书本
            # print sortedDataSet[midIndex]
            leftDataSet = sortedDataSet[: midIndex] #将中位数的左边创建2改副本
            rightDataSet = sortedDataSet[midIndex+1 :]
            print(leftDataSet)
            print(rightDataSet)
            node.lchild = self.create(leftDataSet, depth+1) #将中位数左边样本传入来递归创建树
            node.rchild = self.create(rightDataSet, depth+1)
            return node
        else:
            return None
 
    def sort(self, dataSet, axis):  #采用冒泡排序,利用aixs作为轴进行划分
        sortDataSet = dataSet[:]    #由于不能破坏原样本,此处建立一个副本
        m, n = np.shape(sortDataSet)
        for i in range(m):
            for j in range(0, m - i - 1):
                if (sortDataSet[j][axis] > sortDataSet[j+1][axis]):
                    temp = sortDataSet[j]
                    sortDataSet[j] = sortDataSet[j+1]
                    sortDataSet[j+1] = temp
        print(sortDataSet)
        return sortDataSet
 
    def preOrder(self, node):
        if node != None:
            print("tttt->%s" % node.data)
            self.preOrder(node.lchild)
            self.preOrder(node.rchild)
 
    # def search(self, tree, x):
    #     node = tree
    #     depth = 0
    #     while (node != None):
    #         print node.data
    #         n = len(x)  #特征数
    #         axis = depth % n
    #         if x[axis] < node.data[axis]:
    #             node = node.lchild
    #         else:
    #             node = node.rchild
    #         depth += 1
    def search(self, tree, x):
        self.nearestPoint = None    #保存最近的点
        self.nearestValue = 0   #保存最近的值
        def travel(node, depth = 0):    #递归搜索
            if node != None:    #递归终止条件
                n = len(x)  #特征数
                axis = depth % n    #计算轴
                if x[axis] < node.data[axis]:   #如果数据小于结点,则往左结点找
                    travel(node.lchild, depth+1)
                else:
                    travel(node.rchild, depth+1)
 
                #以下是递归完毕后,往父结点方向回朔
                distNodeAndX = self.dist(x, node.data)  #目标和节点的距离判断
                if (self.nearestPoint == None): #确定当前点,更新最近的点和最近的值
                    self.nearestPoint = node.data
                    self.nearestValue = distNodeAndX
                elif (self.nearestValue > distNodeAndX):
                    self.nearestPoint = node.data
                    self.nearestValue = distNodeAndX
 
                print(node.data, depth, self.nearestValue, node.data[axis], x[axis])
                if (abs(x[axis] - node.data[axis]) <= self.nearestValue):  #确定是否需要去子节点的区域去找(圆的判断)
                    if x[axis] < node.data[axis]:
                        travel(node.rchild, depth+1)
                    else:
                        travel(node.lchild, depth + 1)
        travel(tree)
        return self.nearestPoint
 
    def dist(self, x1, x2): #欧式距离的计算
        return ((np.array(x1) - np.array(x2)) ** 2).sum() ** 0.5
 
dataSet = [[2, 3],
           [5, 4],
           [9, 6],
           [4, 7],
           [8, 1],
           [7, 2]]
x = [3, 4.5]
kdtree = KdTree()
tree = kdtree.create(dataSet, 0)
kdtree.preOrder(tree)
print(kdtree.search(tree, x))