Upload
roger-hopkins
View
230
Download
0
Tags:
Embed Size (px)
Citation preview
Debugging: PGI Compilers for Heterogeneous Supercomputing
Common OpenACC Errors
• acc parallel or loop independent errors (not a parallel loop)
• data bounds errors (not enough data moved to the device)
• stale data on device or host (missing update)
• present error (missing data clause somewhere)
• roundoff error (differences in float arithmetic host vs device)
• roundoff error for summation (parallel accumulation)
• async errors (missing wait)
• compiler error (ask for help)
• other runtime error (need debugger or other help)
DEBUGGING PGI CUDA FORTRAN AND OPENACC ON GPUS WITH ALLINEA DDT
Sebastien Deldon (PGI) - Beau Paisley (Allinea)
TALK HIGHLIGHTS
Brief CUDA Fortran overview
Brief OpenACC overview
CUDA Fortran/OpenACC debug info generation
Allinea DDT overview and features
CUDA Fortran & OpenACC live debugging demos
3 WAYS TO PROGRAM ACCELERATORS
“Drop-in” Acceleration
Maximum Power & Flexibility
Applications
Libraries
Programming Languages
Directives
Easy On-ramp to Acceleration
!> Tiled matrix multiply kernel: each thread computes one element of C = A * B.
!!
!! A is N x M, B is M x L, C is N x L. The kernel works on 16x16 tiles staged
!! in shared memory, so it expects 16x16 thread blocks. There are no bounds
!! checks, so N, M and L are assumed to be multiples of 16 and the grid is
!! assumed to cover C exactly (N/16 blocks in x, L/16 blocks in y).
attributes(global) subroutine mm_kernel ( A, B, C, N, M, L )
  integer, value :: N, M, L
  real :: A(N,M), B(M,L), C(N,L)
  real :: Cij
  integer :: i, j, kb, k, tx, ty
  real, shared :: Asub(16,16), Bsub(16,16)

  tx = threadidx%x
  ty = threadidx%y
  ! Row i and column j of C handled by this thread
  i = (blockidx%x-1) * 16 + tx
  j = (blockidx%y-1) * 16 + ty
  Cij = 0.0
  do kb = 1, M, 16
    ! Each thread loads one element of each tile. The inner loop reads
    ! Asub(tx,k) as A(i,kb+k-1) and Bsub(k,ty) as B(kb+k-1,j), so the tile
    ! column of A must follow ty and the tile row of B must follow tx.
    Asub(tx,ty) = A(i,kb+ty-1)
    Bsub(tx,ty) = B(kb+tx-1,j)
    ! Wait until both tiles are fully loaded before using them
    call syncthreads()
    do k = 1, 16
      Cij = Cij + Asub(tx,k) * Bsub(k,ty)
    enddo
    ! Wait until every thread is done with the tiles before overwriting them
    call syncthreads()
  enddo
  C(i,j) = Cij
end subroutine mm_kernel
! Host-side driver fragment: copy A and B to the device, launch mm_kernel,
! copy the result back into C. Elided surrounding code is marked ". . .".
real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
! . . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
! One 16x16 thread block per 16x16 tile of C, which is N x L: the grid is
! therefore N/16 by L/16 (not M/16, which is the shared inner dimension).
call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L)
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
! . . .
Host Code Device Code
CUDA FORTRAN
!$CUF KERNEL DIRECTIVES
module madd_device_module
  use cudafor
  implicit none
contains

  !> Compute a = b + c element-wise over a(1:n1, 1:n2) on the device and add
  !! every new a(i,j) into sum.
  !!
  !! sum is only accumulated into, never reset here, so the caller must
  !! initialise it. The loop nest is offloaded with a CUF kernel directive
  !! covering both loops, using 32x4 thread blocks and a compiler-chosen grid.
  subroutine madd_dev(a,b,c,sum,n1,n2)
    real, dimension(:,:), device :: a,b,c
    real :: sum
    integer :: n1,n2
    integer :: i,j
    !$cuf kernel do (2) <<<(*,*),(32,4)>>>
    do j = 1,n2
      do i = 1,n1
        a(i,j) = b(i,j) + c(i,j)
        sum = sum + a(i,j)
      enddo
    enddo
  end subroutine madd_dev

end module madd_device_module
module madd_device_module
  use cudafor
  implicit none
contains

  !> Compute a = b + c and write one partial sum of the new a values per
  !! thread block into blocksum.
  !!
  !! The shared buffer and the reduction below are hard-wired for exactly
  !! 256 threads per block.
  attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
    real, dimension(:,:) :: a,b,c
    real, dimension(:) :: blocksum
    integer, value :: n1,n2
    integer :: i,j,tindex,tneighbor,bindex
    real :: mysum
    real, shared :: bsum(256)

    ! Do this thread's work
    mysum = 0.0
    do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
      do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
        a(i,j) = b(i,j) + c(i,j)
        mysum = mysum + a(i,j) ! accumulates partial sum per thread
      enddo
    enddo

    ! Now add up all partial sums for the whole thread block
    ! Compute this thread's linear index in the thread block
    ! We assume 256 threads in the thread block
    tindex = threadidx%x + (threadidx%y-1)*blockdim%x
    ! Store this thread's partial sum in the shared memory block
    bsum(tindex) = mysum
    call syncthreads()

    ! Accumulate all the partial sums for this thread block to a single value
    tneighbor = 128
    do while( tneighbor >= 1 )
      if( tindex <= tneighbor ) &
        bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
      tneighbor = tneighbor / 2
      call syncthreads()
    enddo

    ! Store the partial sum for the thread block
    bindex = blockidx%x + (blockidx%y-1)*griddim%x
    if( tindex == 1 ) blocksum(bindex) = bsum(1)
  end subroutine madd_kernel

  !> Add up partial sums for all thread blocks to a single cumulative sum.
  !!
  !! Must be launched as a single block of 256 threads; the result is written
  !! to dsum by thread 1.
  attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
    real, dimension(:) :: blocksum
    real :: dsum
    integer, value :: nb
    real, shared :: bsum(256)
    integer :: tindex,tneighbor,i

    ! Again, we assume 256 threads in the thread block
    ! accumulate a partial sum for each thread
    tindex = threadidx%x
    bsum(tindex) = 0.0
    do i = tindex, nb, blockdim%x
      bsum(tindex) = bsum(tindex) + blocksum(i)
    enddo
    call syncthreads()

    ! This code is copied from the previous kernel
    ! Accumulate all the partial sums for this thread block to a single value
    ! Since there is only one thread block, this single value is the final result
    tneighbor = 128
    do while( tneighbor >= 1 )
      if( tindex <= tneighbor ) &
        bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
      tneighbor = tneighbor / 2
      call syncthreads()
    enddo
    if( tindex == 1 ) dsum = bsum(1)
  end subroutine madd_sum_kernel

  !> Host driver: launch madd_kernel over a(1:n1, 1:n2), then reduce the
  !! per-block partial sums into the device scalar dsum.
  subroutine madd_dev(a,b,c,dsum,n1,n2)
    real, dimension(:,:), device :: a,b,c
    real, device :: dsum
    real, dimension(:), allocatable, device :: blocksum
    integer :: n1,n2,nb
    type(dim3) :: grid, block
    integer :: r

    ! Compute grid/block size; block size must be 256 threads
    grid = dim3((n1+31)/32, (n2+7)/8, 1)
    block = dim3(32,8,1)
    nb = grid%x * grid%y
    allocate(blocksum(1:nb))
    call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
    call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
    ! don't deallocate too early: wait for both kernels to finish.
    ! cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    r = cudaDeviceSynchronize()
    deallocate(blocksum)
  end subroutine madd_dev

end module madd_device_module
Equivalent hand-written CUDA kernels
OPENACC MEMBERS
...#pragma acc data copy(b[0:n][0:m]) \ create(a[0:n][0:m]) {for (iter = 1; iter <= p; ++iter){ #pragma acc kernels { for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+ w1*(b[i-1][j]+b[i+1][j]+ b[i][j-1]+b[i][j+1])+ w2*(b[i-1][j-1]+b[i-1][j+1]+ b[i+1][j-1]+b[i+1][j+1]); } } for( i = 1; i < n-1; ++i ) for( j = 1; j < m-1; ++j ) b[i][j] = a[i][j]; }}}...
S2(B)S1(B)S1(B)S2(B)
OPENACC
Host Memory
AcceleratorMemory
AA
BB S1(B)
Sp(B)Sp(B)
Sp(B)
HOW DOES THEPGI ACCELERATOR COMPILER WORK?
Unified CPU/GPU binary
C/C++/Fortran OpenACC, CUDA Fortran code
compile
PGI Accelerator compiler
Device
X86 ASM
CUDA C
NVIDIA SDK nvcc
GPU ASM
PGI Accelerator linker
link
Host
NATIVE LLVM CODE GENERATION TO ENABLE DEBUGGING
Unified CPU/GPU binary
C/C++/Fortran OpenACC, CUDA Fortran code
compile
PGI Accelerator compiler
Device
X86 ASM
NVVM IR
NVIDIA SDK libnvvm
GPU ASM
PGI Accelerator linker
link
Host
ENABLING DEVICE-SIDE DEBUGGING
PGI Accelerator native NVVM IR/libnvvm code generator
Generate debug info using NVVM IR debug metadata
—Source line correlation
—Global/local variables
Debug info for CUDA predefined variables (threadIdx, …)
Debug info for Fortran-specific features
CUDA FORTRAN DEBUGGING STATUS
CUDA Fortran debugging features in PGI 14.1 and later
One-to-one mapping for source line correlation
Set and run to breakpoints in CUDA Fortran kernels
Step through kernel code
Examine kernel local variables, global variables in device/shared memories, predefined variables
CUDA FORTRAN DEBUGGING LIMITATIONS
Lower optimization level when invoking libnvvm – code generation may change
Array bounds debug information only for constant bounds
!$CUF directive support available with PGI 14.4
/* OpenACC debug challenges */
/*
 * Accumulate a matrix product into a:
 *     a[i*n+j] += sum over k of b[i*p+k] * c[k*n+j]
 * a is m x n, b is m x p and c is p x n, all stored as flat row-major arrays.
 * a is not cleared first, so its initial contents are added to.
 */
void MatrixMultiplication(float * restrict a, float * restrict b,
                          float * restrict c, int m, int n, int p)
{
    int i, j, k;

    /* a travels both ways; b and c are only read on the device */
    #pragma acc data copy(a[0:(m*n)]), copyin(b[0:(m*p)],c[0:(p*n)])
    {
        #pragma acc kernels loop independent, gang, vector(8)
        for (i=0; i<m; i++) {
            #pragma acc loop gang, vector (8)
            for (j=0; j<n; j++) {
                #pragma acc loop seq
                for (k=0; k<p; k++)
                    a[i*n+j] += b[i*p+k]*c[k*n+j];
            }
        }
    }
}
% pgcc -g -O0 -ta=nvidia -acc mmul.c -o mmul
% ddt …
/*
 * Simplified pseudo kernel for the OpenACC loop nest in MatrixMultiplication.
 * Each thread handles one (_i, _j) element of _a, with the block offsets
 * scaled by 8 in each direction; the launch bound of 64 matches 8 x 8.
 * The sequential k loop stays inside the thread. No bounds guards are shown.
 */
extern "C" __global__ __launch_bounds__(64) void
MatrixMultiplication_20_gpu(float* const __restrict _c,
                            float* const __restrict _b,
                            float* _a, int _n, int _p)
{
    int _i, _j, _k;

    _i = threadIdx.y + blockIdx.y*8;
    _j = threadIdx.x + blockIdx.x*8;
    for (_k=0; _k<_p; _k++)
        _a[_i*_n+_j] += _b[_i*_p+_k]*_c[_k*_n+_j];
}
Original Source Code
Simplified Pseudo Kernel Code
OPENACC DEBUG CHALLENGE
Source line correlation
Variable correlation
Variable not referenced anymore
Do we expose compiler-created variables ?
How to deal with significantly restructured loops ?
/* Simplified Pseudo Kernel Code */
/*
 * Each thread handles one (_i, _j) element of _a, with the block offsets
 * scaled by 8 in each direction; the launch bound of 64 matches 8 x 8.
 * The sequential k loop stays inside the thread. No bounds guards are shown.
 */
extern "C" __global__ __launch_bounds__(64) void
MatrixMultiplication_20_gpu(float* const __restrict _c,
                            float* const __restrict _b,
                            float* _a, int _n, int _p)
{
    int _i, _j, _k;

    _i = threadIdx.y + blockIdx.y*8;
    _j = threadIdx.x + blockIdx.x*8;
    for (_k=0; _k<_p; _k++)
        _a[_i*_n+_j] += _b[_i*_p+_k]*_c[_k*_n+_j];
}
OPENACC DEBUG STATUS
Available in PGI 14.4
Source line correlation
Debug support for variables turned into kernel parameters
!$CUF directives debug support
OPENACC DEBUG LIMITATIONS
Same as for CUDA Fortran debugging
No support for source variables that are not referenced by generated kernel
No support for compiler-generated common block variables passed as parameters
Limited support for acc routines
ABOUT ALLINEA DDT
Graphical debugger designed for:
— C/C++, Fortran, UPC, CUDA, CUDA Fortran
— Multithreaded code: single address space
— Multiprocess code: interdependent or independent processes
— Accelerated codes: GPUs, Intel Xeon Phi
— Any mix of the above
Slash your time to debug:
— Reproduces and triggers your bugs instantly
— Helps you quickly understand where issues come from
— Helps you to fix them as swiftly as possible
LET’S SEE DDT IN ACTION
idata
Global Memory
! CUDA FORTRAN DEBUG DEMO
!> Transpose idata(nx,ny) into odata(ny,nx) through a shared-memory tile.
!!
!! Each thread block stages one TILE_DIM x TILE_DIM tile of idata in shared
!! memory, then writes it out transposed. Each thread handles several
!! elements, stepping by BLOCK_ROWS in the second index. The tile has one
!! extra element in its first dimension (TILE_DIM+1), presumably the padding
!! the routine name refers to. nx, ny, TILE_DIM and BLOCK_ROWS are defined
!! outside this routine.
attributes(global) subroutine transposeNoBankConflicts(odata, idata)
  implicit none
  real, intent(out) :: odata(ny,nx)
  real, intent(in) :: idata(nx,ny)
  real, shared :: tile(TILE_DIM+1, TILE_DIM)
  integer :: x, y, j

  ! Position of this thread's first element in the input
  x = (blockIdx%x-1) * TILE_DIM + threadIdx%x
  y = (blockIdx%y-1) * TILE_DIM + threadIdx%y

  do j = 0, TILE_DIM-1, BLOCK_ROWS
    tile(threadIdx%x, threadIdx%y+j) = idata(x,y+j)
  end do

  ! The whole tile must be loaded before any thread reads it transposed
  call syncthreads()

  ! Block indices are swapped to address the transposed tile in the output
  x = (blockIdx%y-1) * TILE_DIM + threadIdx%x
  y = (blockIdx%x-1) * TILE_DIM + threadIdx%y

  do j = 0, TILE_DIM-1, BLOCK_ROWS
    odata(x,y+j) = tile(threadIdx%y+j, threadIdx%x)
  end do

end subroutine transposeNoBankConflicts
http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-fortran/
% pgfortran -g -O0 -Mcuda mtrans.cuf -o mtrans
% ddt …
tile
Shared Memory
odata
Global Memory
/* OpenACC debug demo */
/*
 * Accumulate a matrix product into a:
 *     a[i*n+j] += sum over k of b[i*p+k] * c[k*n+j]
 * a is m x n, b is m x p and c is p x n, all stored as flat row-major arrays.
 * a is not cleared first, so its initial contents are added to.
 */
void MatrixMultiplication(float * restrict a, float * restrict b,
                          float * restrict c, int m, int n, int p)
{
    int i, j, k;

    /* a travels both ways; b and c are only read on the device */
    #pragma acc data copy(a[0:(m*n)]), copyin(b[0:(m*p)],c[0:(p*n)])
    {
        #pragma acc kernels loop independent, gang, vector(8)
        for (i=0; i<m; i++) {
            #pragma acc loop gang, vector (8)
            for (j=0; j<n; j++) {
                #pragma acc loop seq
                for (k=0; k<p; k++)
                    a[i*n+j] += b[i*p+k]*c[k*n+j];
            }
        }
    }
}
% pgcc -g -O0 -ta=nvidia -acc mmul.c -o mm
% ddt …
/*
 * Simplified pseudo kernel for the OpenACC loop nest in MatrixMultiplication.
 * Each thread handles one (_i, _j) element of _a, with the block offsets
 * scaled by 8 in each direction; the launch bound of 64 matches 8 x 8.
 * The sequential k loop stays inside the thread. No bounds guards are shown.
 */
extern "C" __global__ __launch_bounds__(64) void
MatrixMultiplication_20_gpu(float* const __restrict _c,
                            float* const __restrict _b,
                            float* _a, int _n, int _p)
{
    int _i, _j, _k;

    _i = threadIdx.y + blockIdx.y*8;
    _j = threadIdx.x + blockIdx.x*8;
    for (_k=0; _k<_p; _k++)
        _a[_i*_n+_j] += _b[_i*_p+_k]*_c[_k*_n+_j];
}
Original Source Code
Simplified Pseudo Kernel Code
COPYRIGHT NOTICE
© Contents copyright 2014, NVIDIA Corporation. This material may not be reproduced in any manner
without the express written permission of NVIDIA.
PGFORTRAN, PGF95, PGI Accelerator and PGI Unified Binary are trademarks, and PGI, PGCC, PGC++, PGI Visual Fortran, PVF, PGI CDK, Cluster Development Kit, PGPROF, PGDBG, and The Portland Group are registered trademarks of NVIDIA Corporation. Other brands and names are the property of their respective owners.
BACKUP SLIDES
25
Debugging CUDA Fortran with Allinea DDT
Set and run to breakpoints in CUDA Fortran
kernels
View CUDA Fortran kernels
source code
Drill into CUDA thread-blocks to examine local
variables
Evaluate data in device
shared/global memories
Track execution stacks for CUDA threads/blocks
26
View arrays in device
shared/global memory
27
Inspect values in CUDA Fortran
multidimensional arrays
28
Visualize values in CUDA Fortran multidimensional
arrays
29
View C source code
Track CUDA thread/blocks
execution stack in OpenACC kernel
Set and run to breakpoints in
OpenACC parallel region
Examine OpenACC kernel local variables in device memory
30
Inspect values in OpenACC
multidimensional arrays
TRAP ERROR WHERE IT OCCURRED