CUDA Pipeline 同步机制

cuda::pipeline 在实现上是一个 proxy pattern:每个 thread 通过自己的 cuda::pipeline 对象(proxy)来访问共享的 pipeline_shared_state。
// Create this thread's pipeline proxy. make_pipeline() is a collective
// (block-wide) call; the shared state records how many participating threads
// declared themselves producers vs. consumers via `thread_role`.
// NOTE(review): `block`, `shared_state`, `thread_role`, `stages_count`,
// `fetch_batch`, `compute_batch`, `shared_offset`, `thread_idx`,
// `block_batch()`, `shared`, `global_in`, `global_out` and `compute()` are
// all declared outside this excerpt -- shown here only as usage context.
cuda::pipeline pipeline = cuda::make_pipeline(block, &shared_state, thread_role);
if (thread_role == cuda::pipeline_role::producer) {
// Only the producer threads schedule asynchronous memcpys:
// Blocks until a stage of the pipeline's ring buffer is free to be filled.
pipeline.producer_acquire();
// Map this batch onto its ring-buffer slot in shared memory.
size_t shared_idx = fetch_batch % stages_count;
size_t batch_idx = fetch_batch;
size_t global_batch_idx = block_batch(batch_idx) + thread_idx;
size_t shared_batch_idx = shared_offset[shared_idx] + thread_idx;
// Queue an asynchronous global->shared copy of one int into the current stage.
cuda::memcpy_async(shared + shared_batch_idx, global_in + global_batch_idx, sizeof(int), pipeline);
// (Author's question: "synchronous interface?") Per the libcu++ docs,
// producer_commit() only commits the memcpy_async operations queued into
// the current stage; it does NOT block.
pipeline.producer_commit();
}
if (thread_role == cuda::pipeline_role::consumer) {
// Only the consumer threads compute:
// (Author's question: "synchronous interface?") consumer_wait() DOES block,
// until the copies committed into the oldest pipeline stage have completed.
pipeline.consumer_wait();
size_t shared_idx = compute_batch % stages_count;
size_t global_batch_idx = block_batch(compute_batch) + thread_idx;
size_t shared_batch_idx = shared_offset[shared_idx] + thread_idx;
// Consume the staged shared-memory data, writing the result to global memory.
compute(global_out + global_batch_idx, *(shared + shared_batch_idx));
// Release the stage so producers can reuse it for a later batch.
pipeline.consumer_release();
}