Cuda Pipeline 同步机制

pipeline

它在实现上是一个 proxy pattern：共享状态保存在 pipeline_shared_state（通常位于 shared memory）中，而 cuda::pipeline 是每个 thread 访问该共享状态的轻量 proxy（每线程句柄），由 cuda::make_pipeline 返回。

// Acquire the per-thread pipeline handle. make_pipeline() is a collective call
// over the `block` group: every participating thread must reach it, and the
// shared_state records how many of those threads declared themselves
// producer vs. consumer via thread_role.
cuda::pipeline pipeline = cuda::make_pipeline(block, &shared_state, thread_role);
if (thread_role == cuda::pipeline_role::producer) {
  // Only the producer threads schedule asynchronous memcpys:
  // producer_acquire() blocks until a pipeline stage is free to be filled
  // (i.e. consumers have released it).
  pipeline.producer_acquire();
  size_t shared_idx = fetch_batch % stages_count;  // ring-buffer stage to fill
  size_t batch_idx = fetch_batch;
  size_t global_batch_idx = block_batch(batch_idx) + thread_idx;
  size_t shared_batch_idx = shared_offset[shared_idx] + thread_idx;
  // Enqueue an async global->shared copy bound to the acquired stage; the
  // copy may still be in flight when this call returns.
  cuda::memcpy_async(shared + shared_batch_idx, global_in + global_batch_idx, sizeof(int), pipeline);
  // Answering the original "synchronous interface?" note: no —
  // producer_commit() is non-blocking. It publishes the async operations
  // queued on the acquired stage so consumers can wait on them, then returns.
  pipeline.producer_commit();
}
if (thread_role == cuda::pipeline_role::consumer) {
  // Only the consumer threads compute:
  // Answering the original "synchronous interface?" note: yes from the
  // consumer's point of view — consumer_wait() blocks until the async copies
  // committed to the oldest in-flight stage have completed.
  pipeline.consumer_wait();
  size_t shared_idx = compute_batch % stages_count;
  size_t global_batch_idx = block_batch(compute_batch) + thread_idx;
  size_t shared_batch_idx = shared_offset[shared_idx] + thread_idx;
  compute(global_out + global_batch_idx, *(shared + shared_batch_idx));
  // Hand the consumed stage back to the producers for reuse.
  pipeline.consumer_release();
}

Revision #1
Created 11 January 2025 09:46:27 by Colin
Updated 12 January 2025 06:33:50 by Colin