可以发现，卷积核（滤波器）F 通常很小，在整个卷积内核函数的执行过程中不会改变，并且所有线程都以相同的顺序访问其元素，因此可以考虑将其存储在常量内存中。如前所述，常量内存与全局内存的区别在于：线程不能修改常量内存变量的值，并且常量内存的容量非常小，目前为 64 KB。
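假设已经在主机代码里分配好 F_h 的内存并填入了滤波器系数。首先在设备端以全局变量的形式声明常量内存数组，例如 `__constant__ float F_c[2*FILTER_RADIUS+1][2*FILTER_RADIUS+1];`，然后通过 cudaMemcpyToSymbol() 将 F_h 从主机内存传输到设备常量内存中，例如 `cudaMemcpyToSymbol(F_c, F_h, (2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)*sizeof(float));`。内核函数以全局变量的形式访问常量内存变量，因此不需要把它们的指针作为参数传递给内核函数。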
/**
 * Tiled 2D convolution in which each block stages its input tile (output
 * tile plus halo) in shared memory and then computes one output element
 * per thread.
 *
 * Launch layout: OUT_TILE_DIM x OUT_TILE_DIM threads per block; block
 * (bx, by) produces the output tile whose upper-left element is
 * (by * OUT_TILE_DIM, bx * OUT_TILE_DIM).
 *
 * Shared memory: one IN_TILE_DIM x IN_TILE_DIM float tile per block.
 *
 * Relies on names defined outside this function: OUT_TILE_DIM, IN_TILE_DIM,
 * FILTER_RADIUS and the filter array F_c (indexed as F_c[row][col]).
 * NOTE(review): the indexing below assumes
 * IN_TILE_DIM >= OUT_TILE_DIM + 2 * FILTER_RADIUS and that F_c holds
 * (2 * FILTER_RADIUS + 1)^2 coefficients -- confirm against the definitions.
 *
 * @param N      input image, row-major, width * height floats
 * @param P      output image, row-major, width * height floats
 * @param width  number of columns in N and P
 * @param height number of rows in N and P
 */
__global__ void convolution_tiled_2D_constant_mem_kernel_2(  // OUT_TILE_DIM^2 threads per block
    float *N, float *P, int width, int height) {
  // Output element owned by this thread.
  int col = blockIdx.x * OUT_TILE_DIM + threadIdx.x;
  int row = blockIdx.y * OUT_TILE_DIM + threadIdx.y;

  // Image coordinates of shared-tile element [0][0]. Computed in signed
  // arithmetic so the halo of the first block row/column is negative rather
  // than an unsigned wraparound.
  const int tile_origin_col = static_cast<int>(blockIdx.x) * OUT_TILE_DIM - FILTER_RADIUS;
  const int tile_origin_row = static_cast<int>(blockIdx.y) * OUT_TILE_DIM - FILTER_RADIUS;

  // The input tile is larger than the thread block, so each thread may need
  // to load multiple elements into shared memory: it strides over the tile
  // in steps of OUT_TILE_DIM in both dimensions.
  __shared__ float N_s[IN_TILE_DIM][IN_TILE_DIM];
  for (int i = threadIdx.y; i < IN_TILE_DIM; i += OUT_TILE_DIM) {
    for (int j = threadIdx.x; j < IN_TILE_DIM; j += OUT_TILE_DIM) {
      int in_col = tile_origin_col + j;
      int in_row = tile_origin_row + i;
      if (in_row >= 0 && in_row < height && in_col >= 0 && in_col < width) {
        N_s[i][j] = N[in_row * width + in_col];
      } else {
        // Cells outside the image contribute zero to the convolution.
        N_s[i][j] = 0.0f;
      }
    }
  }
  // Every thread in the block must finish loading before any thread reads
  // its neighbours' tile elements.
  __syncthreads();

  // Calculate output elements.
  if (threadIdx.x < OUT_TILE_DIM && threadIdx.y < OUT_TILE_DIM &&
      row < height && col < width) {
    float Pvalue = 0.0f;
    for (int fRow = 0; fRow < 2 * FILTER_RADIUS + 1; fRow++) {
      for (int fCol = 0; fCol < 2 * FILTER_RADIUS + 1; fCol++) {
        // Tile element [0][0] is already shifted by -FILTER_RADIUS relative
        // to this block's first output, so no extra offset is needed here.
        Pvalue += F_c[fRow][fCol] * N_s[threadIdx.y + fRow][threadIdx.x + fCol];
      }
    }
    P[row * width + col] = Pvalue;
  }
}
7.5 Tiled Convolution Using Caches for Halo Cells
/**
 * Tiled 2D convolution that keeps only the block's own (halo-free) tile in
 * shared memory. Neighbours that fall inside the tile are read from shared
 * memory; halo cells outside the tile are read directly from N in global
 * memory, and cells outside the image are skipped (treated as zero).
 *
 * Launch layout: exactly OUT_TILE_DIM x OUT_TILE_DIM threads per block.
 * Each thread loads one tile element and produces one output element, so a
 * larger block would write past the shared tile.
 *
 * Shared memory: one OUT_TILE_DIM x OUT_TILE_DIM float tile per block.
 *
 * Relies on names defined outside this function: OUT_TILE_DIM,
 * FILTER_RADIUS and the filter array F_c (indexed as F_c[row][col]).
 * NOTE(review): assumes F_c holds (2 * FILTER_RADIUS + 1)^2 coefficients --
 * confirm against its declaration.
 *
 * @param N      input image, row-major, width * height floats (read only)
 * @param P      output image, row-major, width * height floats
 * @param width  number of columns in N and P
 * @param height number of rows in N and P
 */
__global__ void convolution_tiled_cached_2D_shared_mem_kernel(  // OUT_TILE_DIM^2 threads per block
    float *N, float *P, int width, int height) {
  // Signed copies of the thread indices: threadIdx is unsigned, so an
  // expression such as "threadIdx.x - FILTER_RADIUS + fCol >= 0" would be
  // evaluated in unsigned arithmetic and always be true.
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  const int col = blockIdx.x * OUT_TILE_DIM + tx;
  const int row = blockIdx.y * OUT_TILE_DIM + ty;

  // Loading input tile: one element per thread, zero outside the image.
  __shared__ float N_s[OUT_TILE_DIM][OUT_TILE_DIM];
  if (row < height && col < width) {
    N_s[ty][tx] = N[row * width + col];
  } else {
    N_s[ty][tx] = 0.0f;
  }
  // All tile elements must be in place before neighbours are read.
  __syncthreads();

  // Calculate output elements; threads that map outside the image are idle.
  if (col < width && row < height) {
    float Pvalue = 0.0f;
    for (int fRow = 0; fRow < 2 * FILTER_RADIUS + 1; fRow++) {
      for (int fCol = 0; fCol < 2 * FILTER_RADIUS + 1; fCol++) {
        // Position of this neighbour relative to the shared tile.
        const int tileRow = ty - FILTER_RADIUS + fRow;
        const int tileCol = tx - FILTER_RADIUS + fCol;
        if (tileRow >= 0 && tileRow < OUT_TILE_DIM &&
            tileCol >= 0 && tileCol < OUT_TILE_DIM) {
          // Neighbour lies inside this block's tile: use shared memory.
          Pvalue += F_c[fRow][fCol] * N_s[tileRow][tileCol];
        } else {
          // Halo cell: fetch from global memory if it exists in the image.
          const int inRow = row - FILTER_RADIUS + fRow;
          const int inCol = col - FILTER_RADIUS + fCol;
          if (inRow >= 0 && inRow < height && inCol >= 0 && inCol < width) {
            Pvalue += F_c[fRow][fCol] * N[inRow * width + inCol];
          }
        }
      }
    }
    // Write to the output image; N must stay intact because other threads
    // and blocks are still reading it for their halo cells.
    P[row * width + col] = Pvalue;
  }
}