// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stack>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/tree2col.h"

namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
using Node = paddle::operators::math::TreeNode;
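// Shared kernel for the forward and backward passes. Each thread handles one
// (patch, feature) pair: it accumulates the eta-weighted sum of the feature
// vectors of the nodes listed for that patch (index gives the [begin, end)
// slice into node/eta) and writes the left/right/top results into result.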
template <typename T>
__global__ void tree2col(const T* eta, const int* node, const int* index,
                         const T* vectors, T* result, int feature_size, int n) {
  const int thread_id =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int patch_id = thread_id / feature_size;
  const int j = thread_id % feature_size;
  if (patch_id < n) {
    const int begin_o = patch_id * 3 * feature_size;
    const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1];
    T res_l = 0, res_r = 0, res_t = 0;
    for (int i = begin; i < end; i++) {
      const int id = node[i];
      const T vec = vectors[id * feature_size + j];
      res_l += eta[i * 3] * vec;
      res_r += eta[i * 3 + 1] * vec;
      res_t += eta[i * 3 + 2] * vec;
    }
    result[begin_o + j * 3] = res_l;
    result[begin_o + j * 3 + 1] = res_r;
    result[begin_o + j * 3 + 2] = res_t;
  }
}
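// Forward pass (tree2col): builds the tree on the CPU from EdgeSet, collects a
// patch of nodes within max_depth around every node, flattens the per-node eta
// coefficients and patch offsets into CPU tensors, copies them to the GPU, and
// launches the kernel above to fill `patch` ([num_nodes, 3 * feature_size]).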
template <typename T>
class Tree2ColFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const paddle::platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& node_features,
                  framework::Tensor* patch, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto feature_dims = node_features.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    // The tree structure is built on the CPU, so bring EdgeSet across first.
    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t feature_size = feature_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
    size_t node_count = 0, patch_count = 0, total_size = 0;
    size_t max_size = feature_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);

    // Collect, for every node, the patch of nodes within max_depth of it.
    std::vector<std::vector<Node>> processing_list;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
        total_size += tmp.size();
      }
    }

    // Flatten the patches into node ids, eta coefficients, and per-patch
    // [begin, end) offsets, then stage them on the GPU.
    size_t patch_size = processing_list.size();
    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(patch_size * 2)}, cpu_place);

    int idx = 0, index_idx = 0;
    for (auto& tmp : processing_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);

    // One thread per (patch, feature) element, 1024 threads per block.
    int elem_size = patch_size * feature_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    patch->mutable_data<T>(
        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
        gpu_place);
    constant(context, patch, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        node_features.data<T>(), patch->data<T>(), feature_size, patch_size);
  }
};
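// Backward pass (col2tree): inverts the forward mapping. For every node it
// collects the patches that contain it (grad_list), then reuses the kernel
// above to accumulate the eta-weighted sum of those patches' gradient rows
// into the node's row of embedding_grad.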
template <typename T>
class Col2TreeFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& patch_grad,
                  framework::Tensor* embedding_grad, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto output_dims = patch_grad.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    // Rebuild the tree and patches on the CPU, mirroring the forward pass.
    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t output_size = output_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(output_size);
    size_t node_count = 0, patch_count = 0;
    size_t max_size = output_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
    std::vector<std::vector<Node>> processing_list;
    std::vector<std::vector<Node>> grad_list;
    grad_list.resize(node_count);
    size_t total_size = 0, grad_size = node_count;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
      }
    }
    // Invert the mapping: for every node, record the patches that contain it.
    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
      for (auto v : processing_list[patch_id]) {
        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
      }
    }
    for (auto& tmp : grad_list) {
      total_size += tmp.size();
    }

    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(grad_size * 2)}, cpu_place);

    size_t idx = 0, index_idx = 0;
    for (auto& tmp : grad_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, &index_gpu);

    // One thread per (node, output) element, 1024 threads per block.
    int elem_size = output_size * grad_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    embedding_grad->mutable_data<T>(
        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
        gpu_place);

    constant(context, embedding_grad, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        patch_grad.data<T>(), embedding_grad->data<T>(), output_size,
        grad_size);
  }
};

template class Tree2ColFunctor<platform::CUDADeviceContext, float>;
template class Tree2ColFunctor<platform::CUDADeviceContext, double>;
template class Col2TreeFunctor<platform::CUDADeviceContext, float>;
template class Col2TreeFunctor<platform::CUDADeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle