Merge pull request #5084 from lcy-seso/crf

Add the LinearChainCrf operator.
8 years ago · 8401039feb
parent d3b07a6ede ebd992ec79
commit 8401039feb
12 changed files with 1020 additions and 35 deletions
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -37,32 +37,32 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
 std::string OperatorBase::Input(const std::string& name) const {
  auto& ins = Inputs(name);
  PADDLE_ENFORCE_LE(ins.size(), 1UL,
-                    "Op %s input %s should contain only one variable", type_,
-                    name);
+                    "Operator %s's input %s should contain only one variable.",
+                    type_, name);
  return ins.empty() ? kEmptyVarName : ins[0];
 }

 const std::vector<std::string>& OperatorBase::Inputs(
    const std::string& name) const {
  auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Op %s do not have input %s", type_,
-                 name);
+  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
+                 type_, name);
  return it->second;
 }

 std::string OperatorBase::Output(const std::string& name) const {
  auto& outs = Outputs(name);
  PADDLE_ENFORCE_LE(outs.size(), 1UL,
-                    "Op %s output %s should contain only one variable", type_,
-                    name);
+                    "Operator %s's output %s should contain only one variable.",
+                    type_, name);
  return outs.empty() ? kEmptyVarName : outs[0];
 }

 const std::vector<std::string>& OperatorBase::Outputs(
    const std::string& name) const {
  auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output called %s",
-                 type_, name);
+  PADDLE_ENFORCE(it != outputs_.end(),
+                 "Operator %s does not have an output called %s.", type_, name);
  return it->second;
 }

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -427,7 +427,8 @@ class OperatorWithKernel : public OperatorBase {
            int tmp = static_cast<int>(ToDataType(t->type()));
            VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                           "DataType of Paddle Op %s must be same.", Type());
+                           "DataType of Paddle Op %s must be the same.",
+                           Type());
            data_type = tmp;
          }
        }
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -118,10 +118,12 @@ class Tensor {
                             const platform::DeviceContext& ctx);

  /**
-   * @brief   Return the slice of the tensor.
+   * @brief  Return a sub-tensor of the given tensor.
   *
-   * @param[in] begin_idx   The begin index of the slice.
-   * @param[in] end_idx     The end index of the slice.
+   * @param[in] begin_idx   The index of the start row(inclusive) to slice.
+   *                        The index number begins from 0.
+   * @param[in] end_idx     The index of the end row(exclusive) to slice.
+   *                        The index number begins from 0.
   */
  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@ -112,9 +112,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
-  PADDLE_ENFORCE_GT(numel(), 0,
-                    "Tensor's numel must be larger than zero to call "
-                    "Tensor::mutable_data. Call Tensor::set_dim first.");
+  PADDLE_ENFORCE_GT(
+      numel(), 0,
+      "When calling this method, the Tensor's numel must be larger than zero. "
+      "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
@ -229,10 +230,12 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,

 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
-  PADDLE_ENFORCE_LT(begin_idx, end_idx,
-                    "Begin index must be less than end index.");
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");

  if (dims_[0] == 1) {
    return *this;
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@ -101,8 +101,10 @@ void CRFLayer::backward(const UpdateCallback& callback) {
                              : real(1.0f);
    instanceWeight *= coeff_;

-    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (output.grad) {
+      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+      grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    }
    if (needWGrad) {
      weight_->getWGrad()->add(
          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@ -102,7 +102,6 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
 }

 void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
-  MatrixPtr matX = Matrix::create(x, length, numClasses_);
  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
  Matrix::resizeOrCreate(beta_, length, numClasses_);
  real* b = b_->getData();
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@ -28,8 +28,9 @@ class CrossEntropyOp : public framework::OperatorWithKernel {

    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+                      "Input(Label)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
                      "The 1st dimension of Input(X) and Input(Label) should "
                      "be equal.");
@ -38,8 +39,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
                        "If Attr(soft_label) == true, the 2nd dimension of "
                        "Input(X) and Input(Label) should be equal.");
    } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "If Attr(soft_label) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+                        "If Attr(softLabel) == false, the 2nd dimension of "
                        "Input(Label) should be 1.");
    }

@ -48,7 +49,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  }

 protected:
-  // CrossEntropy's data type just determined by "X"
+  // Explicitly set that data type of the output of the cross_entropy operator
+  // is determined by its input "X".
  framework::DataType IndicateDataType(
      const framework::ExecutionContext& ctx) const override {
    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/linear_chain_crf_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    linear_chain_crf,
+    ops::LinearChainCRFOpKernel<paddle::platform::GPUPlace, float>,
+    ops::LinearChainCRFOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    linear_chain_crf_grad,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::GPUPlace, float>,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@ -32,9 +32,9 @@ class SoftmaxWithCrossEntropyOpMaker
    AddInput("Label",
             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
             "tensor. "
-             "If softLable is set to 0, Label is a Tensor<int> with shape [N x "
-             "1]. "
-             "If softLable is set to 1, Label is a Tensor<float/double> "
+             "If softLabel is set to false, Label is a Tensor<int> with shape "
+             "[N x 1]."
+             "If softLabel is set to true, Label is a Tensor<float/double> "
             "with shape [N x K].");
    AddOutput(
        "Softmax",
@ -60,19 +60,23 @@ Because this operators performs a softmax on logits internally, it expects
 unscaled logits. Please do not call this op with the output of softmax operator,
 which will produce incorrect results.

-This operators expects mutually exclusive hard labels, each sample in a batch
-is in exactly one class with probabilities 1. Each sample in the batch with one
-and only one label.
+When the attribute softLabel is set false, this operators expects mutually
+exclusive hard labels, each sample in a batch is in exactly one class with
+probabilities 1. Each sample in the batch with one and only one label.

 Equation:

 1) hard label (one-hot label)

-Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), j = 1, ..., K
+Loss_j = \f$ -\text{Logit}_{Label_j} +
+\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
+j = 1, ..., K $\f

 2) soft label (a distribution over all classes)

-Loss_j = -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), j = 1,...,K
+Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
+\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
+j = 1,...,K $\f

 )DOC");
  }
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
@ -0,0 +1,142 @@
+import unittest
+import random
+import numpy as np
+
+from op_test import OpTest
+
+
+class LinearChainCrfForward(object):
+    def __init__(self, seq_start_positions, emission_weights, emission_row_max,
+                 emission_exps, transition_weights, transition_exps, labels):
+        self.tag_num = emission_weights.shape[1]
+        self.seq_num = len(seq_start_positions) - 1
+
+        self.seq_start_positions = seq_start_positions
+        self.labels = labels
+        self.x = emission_weights
+
+        self.x_row_max = emission_row_max
+        self.x_exps = emission_exps
+
+        # unnormalized logits of the transition weights for the start mark.
+        self.a = transition_weights[0, :]
+        self.a_exps = transition_exps[0, :]
+        # unnormalized logits of the transition weights for the end mark.
+        self.b = transition_weights[1, :]
+        self.b_exps = transition_exps[1, :]
+        # unnormalized logits of the transition weights for all the other tags.
+        self.w = transition_weights[2:, :]
+        self.w_exps = transition_exps[2:, :]
+
+        # The output of linear chain crf operator.
+        # alpha is a memo table in dynamic programming to caculate
+        # nomalization factor.
+        self.alpha = np.zeros(
+            (seq_start_positions[-1], self.tag_num), dtype="float64")
+        self.log_likelihood = np.zeros((self.seq_num, 1))
+
+    def _l1_norm(self, x):
+        s = np.sum(x)
+        x /= s
+        return s
+
+    def _forward_a_sequence(self, x, x_row_max, x_exps, label, alpha):
+        seq_len = x_row_max.shape[0]
+        log_likelihood = 0.
+
+        for i in range(self.tag_num):
+            alpha[0, i] = self.a_exps[i] * x_exps[0, i]
+        log_likelihood = -x_row_max[0] - np.log(self._l1_norm(alpha[0, :]))
+
+        # calculate the unnormalized logits of the normalization factor.
+        for k in range(1, seq_len):
+            for i in range(self.tag_num):
+                s = 0.
+                for j in range(self.tag_num):
+                    s += alpha[k - 1, j] * self.w_exps[j, i]
+                alpha[k, i] = x_exps[k, i] * s
+            log_likelihood -= x_row_max[k] + np.log(self._l1_norm(alpha[k, :]))
+        s = 0.
+        for i in range(self.tag_num):
+            s += alpha[-1, i] * self.b_exps[i]
+        log_likelihood -= np.log(s)
+
+        # calculate the nominator part.
+        log_likelihood += (
+            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
+
+        for k in range(1, seq_len):
+            log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
+        return -log_likelihood
+
+    def crf_forward_compute(self):
+        for i in range(self.seq_num):
+            start = self.seq_start_positions[i]
+            end = self.seq_start_positions[i + 1]
+
+            self.log_likelihood[i] = self._forward_a_sequence(
+                self.x[start:end, :], self.x_row_max[start:end, :],
+                self.x_exps[start:end, :], self.labels[start:end, :],
+                self.alpha[start:end, :])
+        return self.alpha, self.log_likelihood
+
+
+class TestLinearChainCrfOp(OpTest):
+    def set_test_data(self):
+        # TODO(caoying) Fix the unittest by: add the boundary cases when
+        # sequence lengths are 1, 2, and 3.
+
+        SEQ_NUM = 3
+        TAG_NUM = 17
+        MAX_SEQ_LEN = 5
+
+        # the linear_chain_crf operator only supports sequence (LoD level = 1)
+        lod = [[0]]
+        for i in range(SEQ_NUM):
+            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
+        emission = np.random.uniform(-1, 1,
+                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+        emission_row_max = np.amax(emission, axis=1, keepdims=True)
+        emission_exps = np.exp(emission - emission_row_max)
+
+        transition = np.random.uniform(-0.5, 0.5,
+                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
+        transition_exps = np.exp(transition)
+
+        labels = np.random.randint(
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+
+        self.inputs = {
+            "Emission": (emission, lod),
+            "Transition": transition,
+            "Label": (labels, lod)
+        }
+        crf = LinearChainCrfForward(lod[0], emission, emission_row_max,
+                                    emission_exps, transition, transition_exps,
+                                    labels)
+        alpha, log_likelihood = crf.crf_forward_compute()
+
+        self.outputs = {
+            "Alpha": alpha,
+            "EmissionExps": emission_exps,
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        }
+
+    def setUp(self):
+        self.op_type = "linear_chain_crf"
+        self.set_test_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Emission", "Transition"], "LogLikelihood")
+
+    def test_check_grad_ignore_transition(self):
+        self.check_grad(
+            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
+
+
+if __name__ == "__main__":
+    unittest.main()