Upload
trung
View
44
Download
3
Embed Size (px)
DESCRIPTION
GPU&CUDA Labwork Week 6. Bin ZHOU USTC: Fall 2012. 上机实验. 目的 自主进行CUDA程序优化尝试 工具 cuda 4.2; Linux 为主, windows 为辅 GTX 640 方法 上机实验. 实验内容. 1)合并访存实验 2) Warp 实验 3)Bank Conflict 实验 提供示意伪代码. 声明. 伪代码不保证可用,只是示例,请自己写好代码 - PowerPoint PPT Presentation
Citation preview
GPU&CUDA Labwork Week6Bin ZHOU
USTC: Fall 2012
目的 自主进行 CUDA 程序优化尝试 工具 cuda 4.2 ; Linux 为主, windows 为辅 GTX 640 方法 上机实验
上机实验
1 )合并访存实验
2 ) Warp 实验
3 ) Bank Conflict 实验
提供示意伪代码
实验内容
声明
伪代码不保证可用,只是示例,请自己写好代码
After 4 weeks of lab work, students should know basic CUDA programming well, so only minimum support will be provided for basic questions!
合并访存实验 Step1: 实现矩阵相加,使用 A 方式 A[N] = B[N] + C[N]; N > 32768 ( 伪代码 ) Stride = M; __global__ kernel(float *A, float *B, float *C) { base = blockIdx.x*blockDim.x; offset = threadIdx.x * Stride; for (int i = 0; i < Stride; i++) A[base+offset+i] = B[base+offset+i] + C[base+offset+i]; } 每个线程做 M 个数据,注意 block 的大小和数量; 测试性能;记录时间;
Step 2: 使用 B 模式 A[N]= B[N]+ C[N]; N > 32768 ( 伪代码 ) __global__ kernel(float *A,float *B,float *C) { base = blockIdx.x*blockDim.x; offset = threadIdx.x; A[base+offset] = B[base+offset] + C[base+offset]; } 每个线程做 1 个数据,注意 block 的大小和数量; 测试性能
合并内存访问 (实例) float3 需要 12 bytes: float3 a = d_in[threadIdx.x];
每个线程需要执行 3 次读取操作,访问的是不连续的空间
7
合并内存访问 (实例 )利用 shared memory 实现 coalescing
每个 block 256 个线程,则每个 block 需要 sizeof(float3) × 256 bytes 的 shared memory
每个线程读取 3 个 float 地址的偏移量: 0,(threads/block),2*(threads/block)
Step3 与前两步类似,起始地址从512 开始
合并内存访问 (实例 )具体代码
地址的偏移量:threadIdx.x,
threadIdx.x+256,
threadIdx.x+512,
Warp 实验 以上代码,奇数号线程做加法,偶数号做减法;并加入延迟 M 足够大 __global__ kernel(float *A, float *B, float *C) { base = blockIdx.x*blockDim.x; offset = threadIdx.x; if (threadIdx.x%2 == 1) { for (x = 0; x < M; x++) k = k + 5; A[base+offset] = B[base+offset] + C[base+offset] + k; } else { for (x = 0; x < M; x++) k = k + 5; A[base+offset] = B[base+offset] + C[base+offset] - k; } }
以上代码,奇数号 Warp 做加法,偶数号做减法;并加入延迟 M 足够大 __global__ kernel(float *A,float *B,float *C) { base = blockIdx.x*blockDim.x; offset = threadIdx.x; warpId = threadIdx.x / 32; if (warpId %2 == 1) { for (x =0;x < M; x ++) k = k + 5; A[base+offset] = B[base+offset] + C[base+offset] + k; } else { for (x =0;x < M; x ++) k = k + 5; A[base+offset] = B[base+offset] + C[base+offset] - k; } }
Bank Conflict A = BT 矩阵转置 __global__ void transposeNaive(float *odata, float *idata, int width, int height) { int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; int index_in = xIndex + width * yIndex; int index_out = yIndex + height * xIndex; odata[index_out] = idata[index_in]; }
Coalesced but with conflict __global__ void transposeCoalesced(float *odata, float *idata,
int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x; yIndex = blockIdx.x * TILE_DIM + threadIdx.y; int index_out = xIndex + (yIndex)*height;
tile[threadIdx.y][threadIdx.x] = idata[index_in];
__syncthreads();
odata[index_out] = tile[threadIdx.x][threadIdx.y]; }
Remove Conflict
Padding Shared memory
__shared__ float tile[TILE_DIM][TILE_DIM+1];