基于oneAPI的并行算法实践

发布时间 2023-12-03 22:10:09作者: KazoeYakuman

本文介绍了利用oneAPI,使用SYCL编程实现并行算法,完成了矩阵乘法、归并排序、图像卷积三个任务的过程。

矩阵乘法

在此任务中,我们使用sycl编写并行计算的内核。为了提高局部计算效率,我们使用共享内存存储部分矩阵数据。


std::vector<std::vector<float>> matrixMultiply(const std::vector<std::vector<float>> &A, const std::vector<std::vector<float>> &B) {
    // Create a SYCL queue
    sycl::queue queue(sycl::default_selector{});

    // Flatten 2D matrices to 1D vectors
    std::vector<float> flatA;
    std::vector<float> flatB;
    for (const auto &row : A)
        flatA.insert(flatA.end(), row.begin(), row.end());
    for (const auto &row : B)
        flatB.insert(flatB.end(), row.begin(), row.end());

    // Create buffers for flattened matrices A, B, and C
    sycl::buffer<float, 1> bufferA(flatA.data(), sycl::range<1>(M * K));
    sycl::buffer<float, 1> bufferB(flatB.data(), sycl::range<1>(K * N));
    sycl::buffer<float, 1> bufferC(sycl::range<1>(M * N));

    // Submit a command group to the queue
    queue.submit([&](sycl::handler &cgh) {
        // Accessors for the buffers
        auto a = bufferA.get_access<sycl::access::mode::read>(cgh);
        auto b = bufferB.get_access<sycl::access::mode::read>(cgh);
        auto c = bufferC.get_access<sycl::access::mode::write>(cgh);

        // Shared memory for each work group
        sycl::accessor<float, 2, sycl::access::mode::read_write, sycl::access::target::local>
            localMem(sycl::range<2>(blockSize, blockSize), cgh);

        // Define a 2D range for global size
        sycl::range<2> globalSize(M, N);

        cgh.parallel_for<class matrixMultiplyKernel>(sycl::nd_range<2>(globalSize, sycl::range<2>(blockSize, blockSize)),
                                                     [=](sycl::nd_item<2> item) {
                                                         int row = item.get_global_id(0);
                                                         int col = item.get_global_id(1);

                                                         float sum = 0.0f;

                                                         // Loop over blocks
                                                         for (int kBlock = 0; kBlock < K; kBlock += blockSize) {
                                                             // Load block of A and B into shared memory
                                                             localMem[item.get_local_id(0)][item.get_local_id(1)] = a[row * K + kBlock + item.get_local_id(1)];
                                                             item.barrier(sycl::access::fence_space::local_space);
                                                             localMem[item.get_local_id(0)][item.get_local_id(1)] *= b[(kBlock + item.get_local_id(0)) * N + col];
                                                             item.barrier(sycl::access::fence_space::local_space);

                                                             // Perform block-level computation
                                                             for (int i = 0; i < blockSize; ++i) {
                                                                 sum += localMem[item.get_local_id(0)][i] * localMem[i][item.get_local_id(1)];
                                                             }
                                                             item.barrier(sycl::access::fence_space::local_space);
                                                         }

                                                         // Write result to global memory
                                                         c[row * N + col] = sum;
                                                     });
    }).wait();  // Wait for the command group to finish

    // Copy result from bufferC to a 2D vector and return
    std::vector<std::vector<float>> result(M, std::vector<float>(N, 0.0f));
    sycl::host_accessor resultAccessor(bufferC, sycl::read_only);
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            result[i][j] = resultAccessor[i * N + j];
        }
    }

    return result;
}

归并排序

归并排序是一种经典的排序算法,通过分治的思想将问题分解成小问题,然后将小问题的解合并起来。在数据规模较大时,我们可以引入并行计算,通过将排序任务划分成小块并使用并行计算资源来同时处理这些块,从而提高算法效率。


// Function to merge two sorted halves of a vector
// Merge the two sorted runs [left, middle] and [middle + 1, right] of `data`
// into `temp`, then copy the merged run back into `data`. Stable: on equal
// keys the element from the left run is taken first.
template <typename T>
void merge(std::vector<T> &data, std::vector<T> &temp, size_t left, size_t middle, size_t right) {
    size_t lo = left;        // cursor into the left run
    size_t hi = middle + 1;  // cursor into the right run

    // Fill every output slot, picking from whichever run is eligible.
    for (size_t out = left; out <= right; ++out) {
        const bool takeLeft =
            lo <= middle && (hi > right || data[lo] <= data[hi]);
        temp[out] = takeLeft ? data[lo++] : data[hi++];
    }

    // Write the merged run back into the original vector.
    for (size_t pos = left; pos <= right; ++pos) {
        data[pos] = temp[pos];
    }
}

// Recursive function for parallel merge sort
// Recursively sorts data[left..right] (inclusive) with merge sort, using
// `temp` as scratch space of the same size as `data`.
//
// The original version launched a single-work-item kernel whose body called
// this host function recursively and re-entered queue.submit() from device
// code; SYCL forbids all of that (no recursion, no runtime API calls, no
// std::vector in device code), so it could never compile or run. The
// recursion and the merge therefore execute on the host; the `queue`
// parameter is kept so the signature stays compatible with existing callers.
template <typename T>
void mergeSortRecursive(sycl::queue &queue, std::vector<T> &data, std::vector<T> &temp, size_t left, size_t right) {
    if (left >= right)
        return;  // a zero- or one-element range is already sorted

    // Midpoint computed without overflow even for very large indices
    const size_t middle = left + (right - left) / 2;

    // Sort each half, then merge the two sorted halves
    mergeSortRecursive(queue, data, temp, left, middle);
    mergeSortRecursive(queue, data, temp, middle + 1, right);
    merge(data, temp, left, middle, right);
}

// Parallel merge sort entry function
template <typename T>
void parallelMergeSort(std::vector<T> &data) {
    // Create a SYCL queue
    sycl::queue queue(sycl::default_selector{});

    // Create a temporary vector for merging
    std::vector<T> temp(data.size());

    // Perform parallel merge sort
    mergeSortRecursive(queue, data, temp, 0, data.size() - 1);
}

图像卷积

在本任务中,我们将图片分成许多小块,并行地进行卷积操作,提高了算法的效率。


// Function to perform image convolution using SYCL
// Convolves `image` with `kernel` on a SYCL device. Out-of-bounds kernel
// taps replicate the nearest edge pixel ("nearest" padding). Assumes
// `kernel` is square with odd side length and `image` is non-empty and
// rectangular.
std::vector<std::vector<float>> imageConvolution(const std::vector<std::vector<float>> &image,
                                                 const std::vector<std::vector<float>> &kernel) {
    const size_t imageHeight = image.size();
    const size_t imageWidth = image[0].size();
    const size_t kernelSize = kernel.size();
    const long padding = static_cast<long>(kernelSize / 2);

    // Create a SYCL queue
    sycl::queue queue(sycl::default_selector{});

    // Flatten both inputs: a vector<vector<float>> is not contiguous, so
    // image.data() / kernel.data() point at row *vectors*, not floats, and
    // cannot back a sycl::buffer<float, 2> (the original construction was
    // ill-typed). The flat copies provide contiguous float storage.
    std::vector<float> flatImage;
    flatImage.reserve(imageHeight * imageWidth);
    for (const auto &row : image)
        flatImage.insert(flatImage.end(), row.begin(), row.end());
    std::vector<float> flatKernel;
    flatKernel.reserve(kernelSize * kernelSize);
    for (const auto &row : kernel)
        flatKernel.insert(flatKernel.end(), row.begin(), row.end());

    sycl::buffer<float, 2> bufferImage(flatImage.data(), sycl::range<2>(imageHeight, imageWidth));
    sycl::buffer<float, 2> bufferKernel(flatKernel.data(), sycl::range<2>(kernelSize, kernelSize));

    // Buffer for the result image
    sycl::buffer<float, 2> bufferResult(sycl::range<2>(imageHeight, imageWidth));

    queue.submit([&](sycl::handler &cgh) {
        auto imageAccessor = bufferImage.get_access<sycl::access::mode::read>(cgh);
        auto kernelAccessor = bufferKernel.get_access<sycl::access::mode::read>(cgh);
        auto resultAccessor = bufferResult.get_access<sycl::access::mode::write>(cgh);

        cgh.parallel_for<class imageConvolutionKernel>(
            sycl::range<2>(imageHeight, imageWidth),
            [=](sycl::item<2> item) {
                float sum = 0.0f;

                // Iterate over the kernel taps
                for (size_t i = 0; i < kernelSize; ++i) {
                    for (size_t j = 0; j < kernelSize; ++j) {
                        // Signed coordinates: with size_t, a tap falling
                        // above/left of the image wrapped to a huge value,
                        // so clamp pinned it to the BOTTOM/RIGHT edge
                        // instead of 0 — wrong border replication.
                        long x = static_cast<long>(item[0]) + static_cast<long>(i) - padding;
                        long y = static_cast<long>(item[1]) + static_cast<long>(j) - padding;

                        // Nearest-edge padding
                        x = sycl::clamp(x, long(0), static_cast<long>(imageHeight) - 1);
                        y = sycl::clamp(y, long(0), static_cast<long>(imageWidth) - 1);

                        // Accumulate the convolution sum
                        sum += imageAccessor[static_cast<size_t>(x)][static_cast<size_t>(y)] *
                               kernelAccessor[i][j];
                    }
                }

                // Write the result to the buffer
                resultAccessor[item] = sum;
            });
    }).wait();  // Wait for the command group to finish

    // Copy result from buffer to a 2D vector and return
    std::vector<std::vector<float>> result(imageHeight, std::vector<float>(imageWidth, 0.0f));
    sycl::host_accessor resultAccessor(bufferResult, sycl::read_only);
    for (size_t i = 0; i < imageHeight; ++i) {
        for (size_t j = 0; j < imageWidth; ++j) {
            result[i][j] = resultAccessor[i][j];
        }
    }

    return result;
}