I am trying to allocate buffers A and B on GPU 0, allocate matching copies of them on GPU 1, run a kernel on GPU 1 that computes A + B, and store the result in a buffer D that I previously allocated on GPU 1.
My problem is that I cannot get this to work in Python, with either PyCUDA or CuPy.
I tried the code below with PyCUDA and it runs, but I cannot verify that it actually allocates the buffers on GPU 1:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
cuda.init()
array_size = 1000000
h_a = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_b = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_c = np.random.randint(0, 101, size=array_size).astype(np.float32)
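# GPU 0: create a context, allocate device buffers, and upload the host arrays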
dev0 = cuda.Device(0)
ctx0 = dev0.make_context()
ctx0.push()
d_a_gpu0 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu0 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu0 = cuda.mem_alloc(h_c.nbytes)
cuda.memcpy_htod(d_a_gpu0, h_a)
cuda.memcpy_htod(d_b_gpu0, h_b)
cuda.memcpy_htod(d_c_gpu0, h_c)
ctx0.pop()
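# GPU 1: create a second context and allocate the destination buffers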
dev1 = cuda.Device(1)
ctx1 = dev1.make_context()
ctx1.push()
d_a_gpu1 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu1 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu1 = cuda.mem_alloc(h_c.nbytes)
ctx1.pop()
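# Copy A and B from GPU 0 to GPU 1 with a peer-to-peer memcpy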
ctx0.push()
start_peer_transfer = time.time()
cuda.memcpy_peer(d_a_gpu1, dest_context=ctx1, src=d_a_gpu0, src_context=ctx0, size=h_a.nbytes)
cuda.memcpy_peer(d_b_gpu1, dest_context=ctx1, src=d_b_gpu0, src_context=ctx0, size=h_b.nbytes)
end_peer_transfer = time.time()
ctx0.pop()
kernel_code = """
__global__ void add_arrays(float *a, float *b, float *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1000000)
    {
        c[idx] = a[idx] + b[idx];
    }
}
"""
mod = SourceModule(kernel_code)
add_arrays = mod.get_function("add_arrays")
block_size = 256
grid_size = (array_size + block_size - 1) // block_size
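# Launch the kernel on GPU 0 and time it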
ctx0.push()
start_gpu0_kernel = time.time()
add_arrays(d_a_gpu0, d_b_gpu0, d_c_gpu0, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu0_kernel = time.time()
ctx0.pop()
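# Launch the same kernel on GPU 1 and time it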
ctx1.push()
start_gpu1_kernel = time.time()
add_arrays(d_a_gpu1, d_b_gpu1, d_c_gpu1, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu1_kernel = time.time()
ctx1.pop()
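# Copy both results back to the host and compare them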
h_c_result = np.empty_like(h_c)
h_d_result = np.empty_like(h_c)
ctx0.push()
cuda.memcpy_dtoh(h_c_result, d_c_gpu0)
ctx0.pop()
ctx1.push()
cuda.memcpy_dtoh(h_d_result, d_c_gpu1)
ctx1.pop()
comparison = np.allclose(h_c_result, h_d_result)
print("Results are identical:", comparison)
print(f"Time for cuda.memcpy_peer: {end_peer_transfer - start_peer_transfer:.6f} seconds")
print(f"Time for kernel execution on GPU 0: {end_gpu0_kernel - start_gpu0_kernel:.6f} seconds")
print(f"Time for kernel execution on GPU 1: {end_gpu1_kernel - start_gpu1_kernel:.6f} seconds")
d_a_gpu0.free()
d_b_gpu0.free()
d_c_gpu0.free()
d_a_gpu1.free()
d_b_gpu1.free()
d_c_gpu1.free()
ctx1.pop()
ctx0.pop()
For CuPy I tried the following code, which fails with an error on deviceCanAccessPeer:
import cupy as cp
import numpy as np
import time
try:
    with cp.cuda.Device(0):
        A = cp.random.rand(1000)
        B = cp.random.rand(1000)
        C = cp.empty_like(A)
        start_time = time.time()
        C = A + B
        C_cpu = cp.asnumpy(C)
        print("Device 0 computation time:", time.time() - start_time)
    if cp.cuda.runtime.getDeviceCount() > 1:
        cp.cuda.runtime.deviceCanAccessPeer(1, 0)
        cp.cuda.runtime.deviceEnablePeerAccess(1)
        with cp.cuda.Device(1):
            D = cp.empty_like(A)
            start_time = time.time()
            A_device1 = cp.empty_like(A)
            B_device1 = cp.empty_like(B)
            cp.cuda.runtime.memcpyPeer(A_device1.data.ptr, 1, A.data.ptr, 0, A.nbytes)
            cp.cuda.runtime.memcpyPeer(B_device1.data.ptr, 1, B.data.ptr, 0, B.nbytes)
            D = A_device1 + B_device1
            D_cpu = cp.asnumpy(D)
            print("Device 1 computation time:", time.time() - start_time)
        comparison = np.array_equal(C_cpu, D_cpu)
        print("Comparison result:", comparison)
except cp.cuda.runtime.CUDARuntimeError as e:
    print(f"CUDA Runtime Error: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
Sure, I can help you figure out how to transfer buffers between two GPUs.
The error you are hitting comes from the fact that, even though you call cuda.memcpy_peer and cp.cuda.runtime.memcpyPeer, you never enable peer access in the CuPy version, and you do not make the right context current in the critical parts of your code.
Here is how to do it correctly in PyCUDA and CuPy:
PyCUDA
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import time

# Initialize the driver explicitly; pycuda.autoinit is dropped because it
# creates its own context, which gets in the way of manual context management
cuda.init()
array_size = 1000000
h_a = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_b = np.random.randint(0, 101, size=array_size).astype(np.float32)
h_c = np.random.randint(0, 101, size=array_size).astype(np.float32)
dev0 = cuda.Device(0)
ctx0 = dev0.make_context()
ctx0.push()
d_a_gpu0 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu0 = cuda.mem_alloc(h_b.nbytes)
cuda.memcpy_htod(d_a_gpu0, h_a)
cuda.memcpy_htod(d_b_gpu0, h_b)
ctx0.pop()
dev1 = cuda.Device(1)
ctx1 = dev1.make_context()
ctx1.push()
d_a_gpu1 = cuda.mem_alloc(h_a.nbytes)
d_b_gpu1 = cuda.mem_alloc(h_b.nbytes)
d_c_gpu1 = cuda.mem_alloc(h_c.nbytes)
ctx1.pop()
# Enable peer access before any device-to-device transfers. PyCUDA exposes
# this per context: with the accessing context current, call the static
# Context.enable_peer_access(peer_context).
if dev0.can_access_peer(dev1):
    ctx0.push()
    cuda.Context.enable_peer_access(ctx1)
    ctx0.pop()
ctx0.push()
cuda.memcpy_peer(d_a_gpu1, dest_context=ctx1, src=d_a_gpu0, src_context=ctx0, size=h_a.nbytes)
cuda.memcpy_peer(d_b_gpu1, dest_context=ctx1, src=d_b_gpu0, src_context=ctx0, size=h_b.nbytes)
ctx0.pop()
kernel_code = """
__global__ void add_arrays(float *a, float *b, float *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1000000)
    {
        c[idx] = a[idx] + b[idx];
    }
}
"""
mod = SourceModule(kernel_code)
add_arrays = mod.get_function("add_arrays")
block_size = 256
grid_size = (array_size + block_size - 1) // block_size
ctx1.push()
start_gpu1_kernel = time.time()
add_arrays(d_a_gpu1, d_b_gpu1, d_c_gpu1, block=(block_size, 1, 1), grid=(grid_size, 1))
cuda.Context.synchronize()
end_gpu1_kernel = time.time()
ctx1.pop()
h_d_result = np.empty_like(h_c)
ctx1.push()
cuda.memcpy_dtoh(h_d_result, d_c_gpu1)
ctx1.pop()
# Cleanup (the buffer frees are unchanged); disable peer access from ctx0
ctx0.push()
cuda.Context.disable_peer_access(ctx1)
ctx0.pop()
print(f"Time for kernel execution on GPU 1: {end_gpu1_kernel - start_gpu1_kernel:.6f} seconds")
# ... (rest of the cleanup code)
CuPy
import cupy as cp
import numpy as np
import time

with cp.cuda.Device(0):
    A = cp.random.rand(1000).astype(cp.float32)
    B = cp.random.rand(1000).astype(cp.float32)

if cp.cuda.runtime.getDeviceCount() > 1:
    # Check peer-access availability in both directions
    if (cp.cuda.runtime.deviceCanAccessPeer(0, 1)
            and cp.cuda.runtime.deviceCanAccessPeer(1, 0)):
        # deviceEnablePeerAccess acts on the *current* device,
        # so switch devices to enable each direction
        with cp.cuda.Device(0):
            cp.cuda.runtime.deviceEnablePeerAccess(1)  # 0 -> 1
        with cp.cuda.Device(1):
            cp.cuda.runtime.deviceEnablePeerAccess(0)  # 1 -> 0
            A_device1 = cp.empty_like(A)  # allocated on device 1
            B_device1 = cp.empty_like(B)
            # Asynchronous copies on a dedicated stream
            with cp.cuda.Stream() as stream:
                A_device1.data.copy_from_device_async(A.data, A.nbytes)
                B_device1.data.copy_from_device_async(B.data, B.nbytes)
                stream.synchronize()  # wait for the copies to finish
            C_device1 = A_device1 + B_device1
            print(C_device1)
        with cp.cuda.Device(0):
            cp.cuda.runtime.deviceDisablePeerAccess(1)
        with cp.cuda.Device(1):
            cp.cuda.runtime.deviceDisablePeerAccess(0)
    else:
        print("GPU 0 and GPU 1 cannot access each other's memory")
Explanation:

- Peer access and checks: before using cuda.memcpy_peer or cp.cuda.runtime.memcpyPeer, you need to enable peer access. This is essential for both PyCUDA and CuPy. Also make sure the GPUs can actually reach each other via the deviceCanAccessPeer query; a standalone sketch of that check follows this list.
- Context management: PyCUDA requires explicit context management. Before performing any operation, make sure the correct device/context is current.
- Error checking: always include error checking! CUDA functions typically return error codes, and synchronization points such as cuda.Context.synchronize() (for PyCUDA) help you pinpoint where in your code things go wrong.
- Performance: consider CUDA streams for asynchronous memory transfers; they can improve performance, especially when you perform several transfer operations.
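Below is a minimal standalone sketch of that peer-access check in PyCUDA, assuming two GPUs. It relies on Device.can_access_peer and the static Context.enable_peer_access wrapper; note that the context that will do the accessing must be current when you enable access:

import pycuda.driver as cuda

cuda.init()
dev0, dev1 = cuda.Device(0), cuda.Device(1)
if dev0.can_access_peer(dev1) and dev1.can_access_peer(dev0):
    ctx0 = dev0.make_context()             # make_context pushes ctx0 (current)
    ctx1 = dev1.make_context()             # ctx1 is now current, above ctx0
    cuda.Context.enable_peer_access(ctx0)  # let the current ctx1 access ctx0 memory
    ctx1.pop()                             # ctx0 becomes current again
    cuda.Context.enable_peer_access(ctx1)  # let the current ctx0 access ctx1 memory
    ctx0.pop()
else:
    print("P2P is not supported between GPU 0 and GPU 1 on this machine")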
With these changes, your code should transfer data between the two GPUs correctly. Remember to disable peer access once you no longer need it, to avoid potential conflicts.
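As for not being able to prove that the buffers live on GPU 1: one simple check is to compare how much free memory the device reports before and after an allocation. This is a minimal sketch, assuming the ctx1 context from the code above; pycuda.driver.mem_get_info() returns (free, total) bytes for the current context:

ctx1.push()
free_before, total = cuda.mem_get_info()
probe = cuda.mem_alloc(100 * 1024 * 1024)  # hypothetical 100 MB probe allocation on GPU 1
free_after, _ = cuda.mem_get_info()
print(f"GPU 1 free memory dropped by {(free_before - free_after) / 1e6:.0f} MB")
probe.free()
ctx1.pop()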
Let me know if you have any other questions!
Tags: python, cuda, pycuda, multi-gpu, cupy From: 78773865