Neural network image classification using Intel oneAPI tools

Published 2023-06-07 17:10:41 · Author: MercedesKK

With the continuous development of artificial intelligence technology, image classification has become a popular research area. Deep learning techniques are widely used in this field, but they are computationally expensive and require substantial computing resources. To address this problem, Intel has launched the oneAPI tool suite, which includes the DPC++ programming language and libraries such as Intel® oneMKL and Intel® oneTBB, helping developers make better use of hardware accelerators on Intel architectures.

In this article, we will use the Intel oneAPI tool suite to implement an image classifier to demonstrate its application in the field of deep learning.

First, we need to prepare training and test data. In this example, we will use the MNIST dataset, a collection of handwritten digit images containing 60,000 training images and 10,000 test images. We will implement the neural network model in the DPC++ programming language and use Intel® oneMKL for the linear algebra computations.
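
Note that the MNIST files are stored in the IDX binary format, which the loading code below has to parse. For reference, the layout is:

[offset 0]  magic number   (4 bytes, big-endian; 2051 for image files, 2049 for label files)
[offset 4]  item count     (4 bytes, big-endian)
[offset 8]  rows, columns  (4 bytes each, big-endian; image files only)
[then]      one unsigned byte per pixel (images) or per label (labels)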

The following is the code for the neural network model, written in DPC++:

#include <CL/sycl.hpp>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <vector>

using namespace sycl;

class NeuralNetwork {
public:
    NeuralNetwork(int inputSize, int hiddenSize, int outputSize, float learningRate)
        : inputSize(inputSize), hiddenSize(hiddenSize), outputSize(outputSize), learningRate(learningRate)
    {
        // Initialize weight matrices with small random values in [-0.5, 0.5]
        inputToHiddenWeights.resize(inputSize * hiddenSize);
        hiddenToOutputWeights.resize(hiddenSize * outputSize);
        for (size_t i = 0; i < inputToHiddenWeights.size(); i++) {
            inputToHiddenWeights[i] = (float)rand() / RAND_MAX - 0.5f;
        }
        for (size_t i = 0; i < hiddenToOutputWeights.size(); i++) {
            hiddenToOutputWeights[i] = (float)rand() / RAND_MAX - 0.5f;
        }

        // Initialize bias vectors
        hiddenBiases.resize(hiddenSize);
        outputBiases.resize(outputSize);
        for (size_t i = 0; i < hiddenBiases.size(); i++) {
            hiddenBiases[i] = (float)rand() / RAND_MAX - 0.5f;
        }
        for (size_t i = 0; i < outputBiases.size(); i++) {
            outputBiases[i] = (float)rand() / RAND_MAX - 0.5f;
        }
    }

    void train(const std::vector<float>& inputs, const std::vector<float>& targets) {
        // Forward pass
        std::vector<float> hiddenOutputs(hiddenSize);
        std::vector<float> outputOutputs(outputSize);
        for (int i = 0; i < hiddenSize; i++) {
            float sum = hiddenBiases[i];
            for (int j = 0; j < inputSize; j++) {
                sum += inputs[j] * inputToHiddenWeights[j * hiddenSize + i];
            }
            hiddenOutputs[i] = sigmoid(sum);
        }
        for (int i = 0; i < outputSize; i++) {
            float sum = outputBiases[i];
            for (int j = 0; j < hiddenSize; j++) {
                sum += hiddenOutputs[j] * hiddenToOutputWeights[j * outputSize + i];
            }
            outputOutputs[i] = sigmoid(sum);
        }

        // Backward pass
        std::vector<float> outputErrors(outputSize);
        for (int i = 0; i < outputSize; i++) {
            float error = targets[i] - outputOutputs[i];
            outputErrors[i] = error * sigmoidDerivative(outputOutputs[i]);
        }
        std::vector<float> hiddenErrors(hiddenSize);
        for (int i = 0; i < hiddenSize; i++) {
            float error = 0;
            for (int j = 0; j < outputSize; j++) {
                error += outputErrors[j] * hiddenToOutputWeights[i * outputSize + j];
            }
            hiddenErrors[i] = error * sigmoidDerivative(hiddenOutputs[i]);
        }

        // Update weights and biases
        for (int i = 0; i < hiddenSize; i++) {
            for (int j = 0; j < inputSize; j++) {
                inputToHiddenWeights[j * hiddenSize + i] += learningRate * hiddenErrors[i] * inputs[j];
            }
        }
        for (int i = 0; i < outputSize; i++) {
            for (int j = 0; j < hiddenSize; j++) {
                hiddenToOutputWeights[j * outputSize + i] += learningRate * outputErrors[i] * hiddenOutputs[j];
            }
        }
        for (int i = 0; i < hiddenSize; i++) {
            hiddenBiases[i] += learningRate * hiddenErrors[i];
        }
        for (int i = 0; i < outputSize; i++) {
            outputBiases[i] += learningRate * outputErrors[i];
        }
    }

    std::vector<float> predict(const std::vector<float>& inputs) {
        // Forward pass only
        std::vector<float> hiddenOutputs(hiddenSize);
        std::vector<float> outputOutputs(outputSize);
        for (int i = 0; i < hiddenSize; i++) {
            float sum = hiddenBiases[i];
            for (int j = 0; j < inputSize; j++) {
                sum += inputs[j] * inputToHiddenWeights[j * hiddenSize + i];
            }
            hiddenOutputs[i] = sigmoid(sum);
        }
        for (int i = 0; i < outputSize; i++) {
            float sum = outputBiases[i];
            for (int j = 0; j < hiddenSize; j++) {
                sum += hiddenOutputs[j] * hiddenToOutputWeights[j * outputSize + i];
            }
            outputOutputs[i] = sigmoid(sum);
        }
        return outputOutputs;
    }

private:
    int inputSize;
    int hiddenSize;
    int outputSize;
    float learningRate;
    std::vector<float> inputToHiddenWeights;
    std::vector<float> hiddenToOutputWeights;
    std::vector<float> hiddenBiases;
    std::vector<float> outputBiases;

    float sigmoid(float x) {
        return 1 / (1 + std::exp(-x));
    }

    // Takes the sigmoid *output* y = sigmoid(x), since sigmoid'(x) = y * (1 - y)
    float sigmoidDerivative(float x) {
        return x * (1 - x);
    }
};

int main() {
    // Load data
    std::vector<std::vector<float>> trainImages;
    std::vector<int> trainLabels;
    std::vector<std::vector<float>> testImages;
    std::vector<int> testLabels;
    std::ifstream trainImagesFile("train-images.bin", std::ios::binary);
    std::ifstream trainLabelsFile("train-labels.bin", std::ios::binary);
    std::ifstream testImagesFile("test-images.bin", std::ios::binary);
    std::ifstream testLabelsFile("test-labels.bin", std::ios::binary);
    if (trainImagesFile && trainLabelsFile && testImagesFile && testLabelsFile) {
        // The IDX headers are stored in big-endian byte order
        auto readBigEndianInt = [](std::ifstream& file) {
            unsigned char b[4];
            file.read(reinterpret_cast<char*>(b), 4);
            return (int)(((unsigned)b[0] << 24) | ((unsigned)b[1] << 16) |
                         ((unsigned)b[2] << 8) | (unsigned)b[3]);
        };
        int trainMagicNumber = readBigEndianInt(trainImagesFile);
        int trainCount = readBigEndianInt(trainImagesFile);
        int trainRows = readBigEndianInt(trainImagesFile);
        int trainCols = readBigEndianInt(trainImagesFile);
        int testMagicNumber = readBigEndianInt(testImagesFile);
        int testCount = readBigEndianInt(testImagesFile);
        int testRows = readBigEndianInt(testImagesFile);
        int testCols = readBigEndianInt(testImagesFile);
        (void)trainMagicNumber;
        (void)testMagicNumber;
        // Skip the 8-byte label file headers (magic number + count)
        trainLabelsFile.ignore(8);
        testLabelsFile.ignore(8);
        trainImages.resize(trainCount);
        trainLabels.resize(trainCount);
        testImages.resize(testCount);
        testLabels.resize(testCount);
        // Read raw pixel bytes and convert them to floats normalized to [0, 1]
        std::vector<unsigned char> pixels(trainRows * trainCols);
        for (int i = 0; i < trainCount; i++) {
            trainImagesFile.read(reinterpret_cast<char*>(pixels.data()), pixels.size());
            trainImages[i].assign(pixels.begin(), pixels.end());
            for (float& p : trainImages[i]) p /= 255.0f;
            unsigned char label = 0;
            trainLabelsFile.read(reinterpret_cast<char*>(&label), 1);
            trainLabels[i] = label;
        }
        pixels.resize(testRows * testCols);
        for (int i = 0; i < testCount; i++) {
            testImagesFile.read(reinterpret_cast<char*>(pixels.data()), pixels.size());
            testImages[i].assign(pixels.begin(), pixels.end());
            for (float& p : testImages[i]) p /= 255.0f;
            unsigned char label = 0;
            testLabelsFile.read(reinterpret_cast<char*>(&label), 1);
            testLabels[i] = label;
        }
    } else {
        std::cerr << "Failed to open data files." << std::endl;
        return 1;
    }

    // Initialize neural network
    const int inputSize = 28 * 28;
    const int hiddenSize = 100;
    const int outputSize = 10;
    const float learningRate = 0.1f;
    NeuralNetwork neuralNetwork(inputSize, hiddenSize, outputSize, learningRate);

    // Train neural network (train() processes one sample at a time,
    // so each batch is fed to it sample by sample)
    const int numEpochs = 10;
    const int numBatches = 600;
    const int batchSize = 100;
    const int numTrainImages = trainImages.size();
    for (int epoch = 0; epoch < numEpochs; epoch++) {
        std::cout << "Epoch " << epoch + 1 << "/" << numEpochs << std::endl;
        for (int batch = 0; batch < numBatches; batch++) {
            for (int i = 0; i < batchSize; i++) {
                int index = batch * batchSize + i;
                if (index >= numTrainImages) {
                    break;
                }
                // One-hot encode the label as the training target
                std::vector<float> target(outputSize, 0);
                target[trainLabels[index]] = 1;
                neuralNetwork.train(trainImages[index], target);
            }
        }
    }

    // Test neural network
    int numCorrect = 0;
    const int numTestImages = testImages.size();
    for (int i = 0; i < numTestImages; i++) {
        const auto& image = testImages[i];
        const auto& target = testLabels[i];
        auto output = neuralNetwork.predict(image);
        int prediction = std::distance(output.begin(), std::max_element(output.begin(), output.end()));
        if (prediction == target) {
            numCorrect++;
        }
    }
    float accuracy = (float)numCorrect / numTestImages;
    std::cout << "Accuracy: " << accuracy << std::endl;

    return 0;
}

In the code above, the NeuralNetwork class implements the neural network model. The constructor initializes the weight matrices and bias vectors; train() performs a forward pass followed by backpropagation and weight updates, while predict() performs only the forward pass.
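
As a minimal usage sketch (the network sizes and sample data here are made up purely for illustration, not taken from the classifier above), the class can be exercised like this:

// Hypothetical example: a tiny 4-2-3 network trained on one dummy sample
NeuralNetwork tinyNet(4, 2, 3, 0.1f);
std::vector<float> sample = {0.1f, 0.9f, 0.3f, 0.7f};
std::vector<float> target = {0.0f, 1.0f, 0.0f};  // one-hot encoding of class 1
for (int step = 0; step < 100; step++) {
    tinyNet.train(sample, target);
}
std::vector<float> scores = tinyNet.predict(sample);  // scores approach the target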

Next, we use Intel® oneMKL to perform the linear algebra computations. To do this, we add the oneMKL header:

#include <mkl.h>
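
As a rough sketch of how this might be built (assuming the oneAPI environment has been initialized, e.g. via setvars.sh; the source file name classifier.cpp is made up here), the Intel compiler can pull in oneMKL and oneTBB like this:

icpx -fsycl classifier.cpp -o classifier -qmkl -ltbb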

Then, in the train() and predict() functions, we can use oneMKL routines such as cblas_sger() and cblas_sgemv() to perform the matrix and vector operations. For example, in the train() function, take the following weight-update loop:

for (int i = 0; i < hiddenSize; i++) {
    for (int j = 0; j < inputSize; j++) {
        inputToHiddenWeights[j * hiddenSize + i] += learningRate * hiddenErrors[i] * inputs[j];
    }
}

Replace the code with the following:

cblas_sger(CblasRowMajor, inputSize, hiddenSize, learningRate, inputs.data(), 1, hiddenErrors.data(), 1, inputToHiddenWeights.data(), hiddenSize);

This uses the cblas_sger() rank-1 update to add the scaled outer product of the input vector and the hidden-error vector to the input-to-hidden weight matrix.
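
To make the argument mapping concrete, here is a small self-contained check (a sketch with made-up numbers, not part of the classifier) showing that cblas_sger() matches the hand-written loop:

#include <mkl.h>
#include <cstdio>

int main() {
    // 2x3 row-major matrix A, updated as A += alpha * x * y^T
    float A[6] = {0, 0, 0, 0, 0, 0};
    float x[2] = {1.0f, 2.0f};       // plays the role of inputs (M = 2)
    float y[3] = {3.0f, 4.0f, 5.0f}; // plays the role of hiddenErrors (N = 3)
    cblas_sger(CblasRowMajor, 2, 3, 0.1f, x, 1, y, 1, A, 3);
    // Expect row 0: 0.3 0.4 0.5 and row 1: 0.6 0.8 1.0
    for (int i = 0; i < 2; i++) {
        printf("%.1f %.1f %.1f\n", A[i * 3 + 0], A[i * 3 + 1], A[i * 3 + 2]);
    }
    return 0;
}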

Similarly, in the predict() function, we can take the output-layer loop:

for (int i = 0; i < outputSize; i++) {
    float sum = outputBiases[i];
    for (int j = 0; j < hiddenSize; j++) {
        sum += hiddenOutputs[j] * hiddenToOutputWeights[j * outputSize + i];
    }
    outputOutputs[i] = sigmoid(sum);
}

and replace it with the following:

cblas_sgemv(CblasRowMajor, CblasTrans, hiddenSize, outputSize, 1.0f, hiddenToOutputWeights.data(), outputSize, hiddenOutputs.data(), 1, 0.0f, outputOutputs.data(), 1);
for (int i = 0; i < outputSize; i++) {
    outputOutputs[i] = sigmoid(outputOutputs[i] + outputBiases[i]);
}

This uses cblas_sgemv() to multiply the transposed hidden-to-output weight matrix by the hidden-layer output vector (the weights are stored as a hiddenSize × outputSize row-major matrix, hence CblasTrans), then adds the biases and maps each result into the (0, 1) range with sigmoid().
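
Again as a small self-contained sketch with made-up numbers (not part of the classifier), this shows the transposed matrix-vector product that the call above performs:

#include <mkl.h>
#include <cstdio>

int main() {
    // 2x3 row-major matrix W (hiddenSize = 2, outputSize = 3)
    float W[6] = {1, 2, 3,
                  4, 5, 6};
    float h[2] = {1.0f, 1.0f};  // plays the role of hiddenOutputs
    float y[3] = {0, 0, 0};     // receives W^T * h
    cblas_sgemv(CblasRowMajor, CblasTrans, 2, 3, 1.0f, W, 3, h, 1, 0.0f, y, 1);
    // Expect 5 7 9 (the column sums of W)
    printf("%.0f %.0f %.0f\n", y[0], y[1], y[2]);
    return 0;
}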

Finally, the Intel® oneMKL library in the Intel® oneAPI tool suite gives us efficient linear algebra, making neural network training and inference run faster. In addition, we can use Intel® oneTBB (Threading Building Blocks) to parallelize the computation and further improve performance.

For example, in the train() function, we can use TBB's parallel_for() to parallelize a loop. The following sample code replaces the hidden-error loop with parallel_for():

#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

tbb::parallel_for(tbb::blocked_range<int>(0, hiddenSize),
    [&](const tbb::blocked_range<int>& r) {
        for (int i = r.begin(); i != r.end(); i++) {
            float hiddenError = 0;
            for (int j = 0; j < outputSize; j++) {
                hiddenError += outputErrors[j] * hiddenToOutputWeights[i * outputSize + j];
            }
            // Same sigmoid-derivative factor as in the scalar version
            hiddenErrors[i] = hiddenOutputs[i] * (1 - hiddenOutputs[i]) * hiddenError;
        }
    });

This uses a TBB parallel loop to distribute the computation of the hiddenErrors vector across multiple threads, improving efficiency when the layer is large.
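
As an optional side note (a sketch, not required by the classifier), oneTBB also lets you cap the number of worker threads, which is handy when benchmarking the parallel loops above:

#include <tbb/global_control.h>

// Limit oneTBB to at most 4 worker threads for the enclosing scope
tbb::global_control limit(tbb::global_control::max_allowed_parallelism, 4);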

In the predict() function, we can likewise use a TBB parallel loop to speed up the computation. For example:

tbb::parallel_for(tbb::blocked_range<int>(0, outputSize),
    [&](const tbb::blocked_range<int>& r) {
        for (int i = r.begin(); i != r.end(); i++) {
            float sum = cblas_sdot(hiddenSize, hiddenOutputs.data(), 1, hiddenToOutputWeights.data() + i, outputSize);
            outputOutputs[i] = sigmoid(sum + outputBiases[i]);
        }
    });

This distributes the computation of the outputOutputs vector across multiple threads; each thread computes a dot product with cblas_sdot() for its share of the output neurons.

In summary, using Intel® oneMKL and Intel® oneTBB can greatly improve the performance of neural networks, making them better suited to large-scale datasets and complex models.