structCSRGRAPH{intnumVertices;int*scrPtrs;// Strating outgoing edge index of each vertex
int*dstList;// Destination vertex index of each edge
};__global__voidbfs_kernel_csr(CSRGRAPHgraph,unsignedint*level,unsignedint*visited,unsignedintcurrLevel){unsignedvertexId=blockIdx.x*blockDim.x+threadIdx.x;if(vertexId<graph.numVertices){if(level[vertexId]==currLevel-1){for(inti=graph.scrPtrs[vertexId];i<graph.scrPtrs[vertexId+1];i++){unsignedintneighbor=graph.dstList[i];if(level[neighbor]==0xFFFFFFFF){// unvisited neighbor
level[neighbor]=currLevel;visited[neighbor]=1;*visited=1;// flag to indicate whether reached the end of the graph
}}}}}
structCSCGRAPH{intnumVertices;int*dstPtrs;// Starting incoming edge index of each vertex
int*scrList;// Source vertex index of each edge
};__global__voidbfs_kernel_csc(CSCGRAPHgraph,unsignedint*level,unsignedint*visited,unsignedintcurrLevel){unsignedvertexId=blockIdx.x*blockDim.x+threadIdx.x;if(vertexId<graph.numVertices){if(level[vertexId]==0xFFFFFFF){// loop through its incoming edges if not visited
for(inti=graph.dstPtrs[vertexId];i<graph.dstPtrs[vertexId+1];i++){unsignedintneighbor=graph.scrList[i];if(level[neighbor]==currLevel-1){level[vertexId]=currLevel;*visited=1;// flag to indicate whether reached the end of the graph
break;// Only need 1 neighbor in previous level to identify the vetex is currLevel
}}}}}
下图展示了这个内核如何执行从第 1 层到第 2 层的遍历。
Example of a Vertex-centric Pull (bottom-up) Traversal from Level 1 to Level 2
structCOOGRAPH{intnumVertices;intnumEdges;int*srcList;// Source vertex index of each edge
int*dstList;// Destination vertex index of each edge
};__global__voidbfs_kernel_coo(COOGRAPHgraph,unsignedint*level,unsignedint*visited,unsignedintcurrLevel){unsignededgeId=blockIdx.x*blockDim.x+threadIdx.x;if(edgeId<graph.numEdges){unsignedintsrc=graph.srcList[edgeId];if(level[src]==currLevel-1){unsignedintneighbor=graph.dstList[edgeId];if(level[neighbor]==0xFFFFFFFF){// unvisited neighbor
level[neighbor]=currLevel;visited[neighbor]=1;*visited=1;// flag to indicate whether reached the end of the graph
}}}}
下图展示了该内核如何执行从从第 1 层到第 2 层的遍历。
Example of an Edge-centric Traversal from Level 1 to Level 2
__global__voidfrontier_bfs_kernel(CSRGRAPHgraph,unsignedint*level,unsignedint*prevFroniter,unsignedint*currFroniter,unsignedintnumPrevFroniter,unsignedint*numCurrFroniter,unsignedint*currLevel){// Each thread processes a node in prevFroniter.
unsignedinti=blockIdx.x*blockDim.x+threadIdx.x;if(i<numPrevFroniter){unsignedintvertexId=prevFroniter[i];// All its neighbouring nodes are traversed.
for(unsignedintedge=graph.scrPtrs[vertexId];edge<graph.scrPtrs[vertexId+1];edge++){unsignedintneighbor=graph.dstList[edge];if(atomicCAS(level+neighbor,0xFFFFFFFF,currLevel)==0xFFFFFFFF){// check if neighbor is unvisited
unsignedintcurrFroniterIndex=atomicAdd(numCurrFroniter,1);currFroniter[currFroniterIndex]=neighbor;}}}}
#define LOCAL_FRONTIER_SIZE 4
__global__voidprivate_frontier_bfs_kernel(CSRGRAPHgraph,unsignedint*level,unsignedint*prevFroniter,unsignedint*currFroniter,unsignedintnumPrevFroniter,unsignedint*numCurrFroniter,unsignedint*currLevel){// Initialize privative frontier
__shared__unsignedintcurrFrontier_s[LOCAL_FRONTIER_SIZE];__shared__unsignedintnumCurrFrontier_s;if(threadIdx.x==0){numCurrFrontier_s=0;}__syncthreads();// Perform BFS on private frontier
unsignedinti=blockIdx.x*blockDim.x+threadIdx.x;if(i<numPrevFroniter){unsignedintvertexId=prevFroniter[i];for(unsignedintedge=graph.scrPtrs[vertexId];edge<graph.scrPtrs[vertexId+1];edge++){unsignedintneighbor=graph.dstList[edge];if(atomicCAS(level+neighbor,0xFFFFFFFF,currLevel)==0xFFFFFFFF){// Once a new frontier node is found,
unsignedcurrFroniterIndex=atomicAdd(&numCurrFrontier_s,1);if(currFroniterIndex<LOCAL_FRONTIER_SIZE){// Try to add it to the private frontier (currFrontier_s)
currFrontier_s[currFroniterIndex]=neighbor;}else{numCurrFrontier_s=LOCAL_FRONTIER_SIZE;// frontier is full, stop adding new elements
unsignedintcurrFrontierIdx=atomicAdd(numCurrFroniter,1);currFroniter[currFrontierIdx]=neighbor;}}}}// Copy private frontier to global frontier
__syncthreads();__shared__unsignedintcurrFrontierStartIdx;// Start index of private frontier in global frontier
if(threadIdx.x==0){currFrontierStartIdx=atomicAdd(numCurrFroniter,numCurrFrontier_s);}__syncthreads();// Commit private frontier to global frontier
for(unsignedintj=threadIdx.x;j<numCurrFrontier_s;j+=blockDim.x){unsignedintcurrFroniterIdx=currFrontierStartIdx+j;currFroniter[currFroniterIdx]=currFrontier_s[j];}}