本文介绍了利用oneAPI,使用sycl编程实现并行算法,完成了矩阵乘法、归并排序、图像卷积三个任务的过程。
矩阵乘法
在此任务中,我们使用sycl编写并行计算的内核。为了提高局部计算效率,我们使用共享内存存储部分矩阵数据。
std::vector<std::vector<float>> matrixMultiply(const std::vector<std::vector<float>> &A, const std::vector<std::vector<float>> &B) {
// Create a SYCL queue
sycl::queue queue(sycl::default_selector{});
// Flatten 2D matrices to 1D vectors
std::vector<float> flatA;
std::vector<float> flatB;
for (const auto &row : A)
flatA.insert(flatA.end(), row.begin(), row.end());
for (const auto &row : B)
flatB.insert(flatB.end(), row.begin(), row.end());
// Create buffers for flattened matrices A, B, and C
sycl::buffer<float, 1> bufferA(flatA.data(), sycl::range<1>(M * K));
sycl::buffer<float, 1> bufferB(flatB.data(), sycl::range<1>(K * N));
sycl::buffer<float, 1> bufferC(sycl::range<1>(M * N));
// Submit a command group to the queue
queue.submit([&](sycl::handler &cgh) {
// Accessors for the buffers
auto a = bufferA.get_access<sycl::access::mode::read>(cgh);
auto b = bufferB.get_access<sycl::access::mode::read>(cgh);
auto c = bufferC.get_access<sycl::access::mode::write>(cgh);
// Shared memory for each work group
sycl::accessor<float, 2, sycl::access::mode::read_write, sycl::access::target::local>
localMem(sycl::range<2>(blockSize, blockSize), cgh);
// Define a 2D range for global size
sycl::range<2> globalSize(M, N);
cgh.parallel_for<class matrixMultiplyKernel>(sycl::nd_range<2>(globalSize, sycl::range<2>(blockSize, blockSize)),
[=](sycl::nd_item<2> item) {
int row = item.get_global_id(0);
int col = item.get_global_id(1);
float sum = 0.0f;
// Loop over blocks
for (int kBlock = 0; kBlock < K; kBlock += blockSize) {
// Load block of A and B into shared memory
localMem[item.get_local_id(0)][item.get_local_id(1)] = a[row * K + kBlock + item.get_local_id(1)];
item.barrier(sycl::access::fence_space::local_space);
localMem[item.get_local_id(0)][item.get_local_id(1)] *= b[(kBlock + item.get_local_id(0)) * N + col];
item.barrier(sycl::access::fence_space::local_space);
// Perform block-level computation
for (int i = 0; i < blockSize; ++i) {
sum += localMem[item.get_local_id(0)][i] * localMem[i][item.get_local_id(1)];
}
item.barrier(sycl::access::fence_space::local_space);
}
// Write result to global memory
c[row * N + col] = sum;
});
}).wait(); // Wait for the command group to finish
// Copy result from bufferC to a 2D vector and return
std::vector<std::vector<float>> result(M, std::vector<float>(N, 0.0f));
sycl::host_accessor resultAccessor(bufferC, sycl::read_only);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
result[i][j] = resultAccessor[i * N + j];
}
}
return result;
}
归并排序
归并排序是一种经典的排序算法,通过分治的思想将问题分解成小问题,然后将小问题的解合并起来。在数据规模较大时,我们可以引入并行计算,通过将排序任务划分成小块并使用并行计算资源来同时处理这些块,从而提高算法效率。
// Merge the two sorted runs data[left..middle] and data[middle+1..right]
// (inclusive bounds) into ascending order, using `temp` as scratch space of
// the same length as `data`. Equal elements are taken from the left run
// first, so the merge is stable.
template <typename T>
void merge(std::vector<T> &data, std::vector<T> &temp, size_t left, size_t middle, size_t right) {
    size_t lo = left;        // cursor into the left run
    size_t hi = middle + 1;  // cursor into the right run
    size_t out = left;       // next write position in temp
    // Take the smaller head element until one run is exhausted.
    while (lo <= middle && hi <= right) {
        temp[out++] = (data[hi] < data[lo]) ? data[hi++] : data[lo++];
    }
    // Drain whichever run still has leftovers (at most one loop executes).
    while (lo <= middle) {
        temp[out++] = data[lo++];
    }
    while (hi <= right) {
        temp[out++] = data[hi++];
    }
    // Copy the merged run back into the original vector.
    for (size_t pos = left; pos <= right; ++pos) {
        data[pos] = temp[pos];
    }
}
// Recursively sort data[left..right] (inclusive) with merge sort.
//
// The previous version launched a single-work-item kernel whose body called
// queue.submit(), recursed, and captured std::vectors by value — none of
// which is permitted inside a SYCL device function, so it could not compile
// or run. The divide and merge steps now run on the host; `queue` is kept in
// the signature so existing callers are unchanged (and so a device-side
// merge stage can be added later without an interface change).
template <typename T>
void mergeSortRecursive(sycl::queue &queue, std::vector<T> &data, std::vector<T> &temp, size_t left, size_t right) {
    (void)queue; // reserved for a future device-side merge
    if (left >= right)
        return; // zero or one element: already sorted
    // Midpoint computed without the (left + right) overflow risk.
    const size_t middle = left + (right - left) / 2;
    // Sort each half, then combine them with the stable merge helper.
    mergeSortRecursive(queue, data, temp, left, middle);
    mergeSortRecursive(queue, data, temp, middle + 1, right);
    merge(data, temp, left, middle, right);
}
// Entry point: sort `data` in ascending order using merge sort.
template <typename T>
void parallelMergeSort(std::vector<T> &data) {
    // Nothing to do for 0 or 1 elements. This guard also prevents the
    // size_t underflow of data.size() - 1 when data is empty, which would
    // otherwise pass a huge `right` bound into the recursion.
    if (data.size() < 2)
        return;
    // Create a SYCL queue on the default device.
    sycl::queue queue(sycl::default_selector{});
    // Scratch vector used by the merge step.
    std::vector<T> temp(data.size());
    // Sort the whole inclusive range [0, size - 1].
    mergeSortRecursive(queue, data, temp, 0, data.size() - 1);
}
图像卷积
在本任务中,我们将图片分成许多小块,并行地进行卷积操作,提高了算法的效率。
// 2D convolution of `image` with a square `kernel`, computed in parallel
// with one work-item per output pixel. Out-of-range taps use nearest-edge
// (clamp-to-border) padding.
// Assumes a non-empty rectangular image and an odd kernelSize (so that
// padding = kernelSize / 2 centres the kernel) — TODO confirm with callers.
std::vector<std::vector<float>> imageConvolution(const std::vector<std::vector<float>> &image,
                                                 const std::vector<std::vector<float>> &kernel) {
    const size_t imageHeight = image.size();
    const size_t imageWidth = image[0].size();
    const size_t kernelSize = kernel.size();
    const size_t padding = kernelSize / 2;
    // Create a SYCL queue on the default device.
    sycl::queue queue(sycl::default_selector{});
    // Flatten the nested vectors: a std::vector<std::vector<float>> is not
    // contiguous, so image.data() yields std::vector<float>* — passing it
    // straight to a float buffer (as the previous version did) cannot compile.
    std::vector<float> flatImage;
    flatImage.reserve(imageHeight * imageWidth);
    for (const auto &row : image)
        flatImage.insert(flatImage.end(), row.begin(), row.end());
    std::vector<float> flatKernel;
    flatKernel.reserve(kernelSize * kernelSize);
    for (const auto &row : kernel)
        flatKernel.insert(flatKernel.end(), row.begin(), row.end());
    // Device buffers for the image, the kernel, and the result.
    sycl::buffer<float, 2> bufferImage(flatImage.data(), sycl::range<2>(imageHeight, imageWidth));
    sycl::buffer<float, 2> bufferKernel(flatKernel.data(), sycl::range<2>(kernelSize, kernelSize));
    sycl::buffer<float, 2> bufferResult(sycl::range<2>(imageHeight, imageWidth));
    // Submit a command group to the queue.
    queue.submit([&](sycl::handler &cgh) {
        auto imageAccessor = bufferImage.get_access<sycl::access::mode::read>(cgh);
        auto kernelAccessor = bufferKernel.get_access<sycl::access::mode::read>(cgh);
        auto resultAccessor = bufferResult.get_access<sycl::access::mode::write>(cgh);
        cgh.parallel_for<class imageConvolutionKernel>(
            sycl::range<2>(imageHeight, imageWidth), [=](sycl::item<2> item) {
                float sum = 0.0f;
                // Iterate over every kernel tap.
                for (size_t i = 0; i < kernelSize; ++i) {
                    for (size_t j = 0; j < kernelSize; ++j) {
                        // Source coordinates must use SIGNED arithmetic: with
                        // size_t, item[0] + i - padding underflows near the
                        // top/left edge and the subsequent clamp then snaps the
                        // tap to the FAR edge instead of row/column 0.
                        long x = static_cast<long>(item[0]) + static_cast<long>(i) - static_cast<long>(padding);
                        long y = static_cast<long>(item[1]) + static_cast<long>(j) - static_cast<long>(padding);
                        // Nearest-edge padding.
                        x = sycl::clamp(x, long{0}, static_cast<long>(imageHeight) - 1);
                        y = sycl::clamp(y, long{0}, static_cast<long>(imageWidth) - 1);
                        // Accumulate the weighted sample.
                        sum += imageAccessor[static_cast<size_t>(x)][static_cast<size_t>(y)] * kernelAccessor[i][j];
                    }
                }
                // Write the convolved pixel.
                resultAccessor[item] = sum;
            });
    }).wait(); // Wait for the command group to finish.
    // Copy the result buffer back into a 2D host vector.
    std::vector<std::vector<float>> result(imageHeight, std::vector<float>(imageWidth, 0.0f));
    sycl::host_accessor resultAccessor(bufferResult, sycl::read_only);
    for (size_t i = 0; i < imageHeight; ++i) {
        for (size_t j = 0; j < imageWidth; ++j) {
            result[i][j] = resultAccessor[i][j];
        }
    }
    return result;
}
标签:std,sycl,并行算法,实践,queue,item,oneAPI,vector,size
From: https://www.cnblogs.com/kazoeyakuman/p/17873901.html