| 
						
						
							
								
							
						
						
					 | 
					 | 
					@ -183,6 +183,7 @@ template <int SIZE>
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
 | 
					 | 
					 | 
					 | 
					__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                                       __half2 *__restrict__ out, size_t width,
 | 
					 | 
					 | 
					 | 
					                                       __half2 *__restrict__ out, size_t width,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                                       size_t height) {
 | 
					 | 
					 | 
					 | 
					                                       size_t height) {
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					  int idx = threadIdx.x + blockIdx.x * blockDim.x;
 | 
					 | 
					 | 
					 | 
					  int idx = threadIdx.x + blockIdx.x * blockDim.x;
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					  int by = blockIdx.y;
 | 
					 | 
					 | 
					 | 
					  int by = blockIdx.y;
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					  __half2 zero = __half2half2(static_cast<__half>(0));
 | 
					 | 
					 | 
					 | 
					  __half2 zero = __half2half2(static_cast<__half>(0));
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
					 | 
					@ -196,6 +197,7 @@ __global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					    atomicAdd(&(out[idx]), sum);
 | 
					 | 
					 | 
					 | 
					    atomicAdd(&(out[idx]), sum);
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					  }
 | 
					 | 
					 | 
					 | 
					  }
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					#endif
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					}
 | 
					 | 
					 | 
					 | 
					}
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					template <typename T>
 | 
					 | 
					 | 
					 | 
					template <typename T>
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -363,7 +365,6 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1);
 | 
					 | 
					 | 
					 | 
					      int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1);
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      int theory_block = (width + blocks.x - 1) / blocks.x;
 | 
					 | 
					 | 
					 | 
					      int theory_block = (width + blocks.x - 1) / blocks.x;
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      dim3 grids(std::min(theory_block, max_blocks));
 | 
					 | 
					 | 
					 | 
					      dim3 grids(std::min(theory_block, max_blocks));
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      if (std::is_same<T, paddle::platform::float16>::value && width < 2048 &&
 | 
					 | 
					 | 
					 | 
					      if (std::is_same<T, paddle::platform::float16>::value && width < 2048 &&
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					          width % 2 == 0 && height % 64 == 0) {
 | 
					 | 
					 | 
					 | 
					          width % 2 == 0 && height % 64 == 0) {
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        auto &dev_ctx =
 | 
					 | 
					 | 
					 | 
					        auto &dev_ctx =
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
					 | 
					@ -381,7 +382,6 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					                                                                 width, height);
 | 
					 | 
					 | 
					 | 
					                                                                 width, height);
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        return;
 | 
					 | 
					 | 
					 | 
					        return;
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      }
 | 
					 | 
					 | 
					 | 
					      }
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					#endif
 | 
					 | 
					 | 
					 | 
					 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					      if (width / height < 32) {
 | 
					 | 
					 | 
					 | 
					      if (width / height < 32) {
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					        MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(
 | 
					 | 
					 | 
					 | 
					        MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
						
					 | 
					 | 
					
 
 |