sách gpt4 ai đã đi

cuda - 让 CUDA Thrust 使用您选择的 CUDA 流

In lại 作者:行者123 更新时间:2023-12-04 22:04:45 38 4
mua khóa gpt4 Nike

查看 CUDA Thrust 代码中的内核启动,似乎它们总是使用默认流。我可以让 Thrust 使用我选择的流吗?我在 API 中遗漏了什么吗?

1 Câu trả lời

我想在 Thrust 1.8 发布后更新 talonmies 提供的答案,它引入了将 CUDA 执行流指示为的可能性

thrust::cuda::par.on(stream)

也可以看看

Thrust Release 1.8.0 .

在下面,我将重铸示例

False dependency issue for the Fermi architecture

在 CUDA Thrust API 方面。
#include 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include

#include
#include

#include "Utilities.cuh"

sử dụng không gian tên std;

#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };

int chính()
{
const int N = 6000000;

// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));

// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));

// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];

// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));

// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));

int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;

// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));

/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/

for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}

for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;

thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());

}

for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}

for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));

gpuErrchk(cudaDeviceSynchronize());

// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));

for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));

cudaDeviceReset();

// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++) {
//printf("%i %i\n", h_out[i], h_checkResults[i]);
sum += h_checkResults[i] - h_out[i];
}

cout << "Error between CPU and GPU: " << sum << endl;

delete[] h_in;
delete[] h_out;
delete[] h_checkResults;

trả về 0;
}

Utilities.cu Utilities.cuh 运行此类示例所需的文件保存在此 github page .

Visual Profiler 时间线显示了 CUDA Thrust 操作和内存传输的并发性

nhập mô tả hình ảnh ở đây

关于cuda - 让 CUDA Thrust 使用您选择的 CUDA 流,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24368197/

38 4 0
Bài viết được đề xuất: svn - 每个客户端的更新
Bài viết được đề xuất: AngularJS - 指令名称中是否可以有下划线?
Bài viết được đề xuất: scala - 在Scala中处理嵌套的可为空对象的惯用方法?
Bài viết được đề xuất: Java Scanner异常处理
行者123
Hồ sơ cá nhân

Tôi là một lập trình viên xuất sắc, rất giỏi!

Nhận phiếu giảm giá Didi Taxi miễn phí
Mã giảm giá Didi Taxi
Giấy chứng nhận ICP Bắc Kinh số 000000
Hợp tác quảng cáo: 1813099741@qq.com 6ren.com