Paddle/paddle/math/SparseRowMatrix.cpp

/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */


#include "SparseRowMatrix.h"
#include "CpuSparseMatrix.h"

#include <cmath>
#include <algorithm>

#include "paddle/utils/Logging.h"

#include "SIMDFunctions.h"

#include "paddle/utils/Util.h"
#include "paddle/utils/Thread.h"

P_DEFINE_bool(allow_inefficient_sparse_update, false,
              "Whether to allow inefficient sparse update");

namespace paddle {

const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;

void SparseRowCpuMatrix::init(size_t height, size_t width) {
  // @TODO(yuyang18) Just remove this limit
  CHECK(simd::vec_check(width)) << width;
  height_ = height;
  if (!indexDictHandle_) {
    indexDictHandle_.reset(new IndexDict);
    indexDictHandle_->globalIndices.assign(height, kUnusedId_);
  }
  localIndices_ = &indexDictHandle_->localIndices;
  globalIndices_ = indexDictHandle_->globalIndices.data();
}

void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
                             real scaleT) {
  CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);
}

void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {
  LOG(FATAL) << "This should not be called";
}

void SparseRowCpuMatrix::zeroMem() {
  apply(
    [](real* buf, size_t len) {
      memset(buf, 0, sizeof(real) * len);
    });
  clearRows();
}

void SparseRowCpuMatrix::applyL1Decay(real learningRate, real decayRate) {
  apply([=](real* buf, size_t len) {
      CpuVector value(0, nullptr);
      value.subVecFrom(buf, 0, len);
      value.applyL1(learningRate, decayRate);
    });
}

void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0,
                                   real learningRate, int currentTime,
                                   real decayRate, bool useL1, bool fini) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;

  // t0 and value are vectors
  CHECK_EQ(t0.getSize(), this->height_);
  CHECK_EQ(value.width_, this->height_ * this->width_);

  if (decayRate == 0.0f) {
    if (fini) {
      return;
    }

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] -= learningRate * g[j];
      }
    }
    return;
  }  // else

  if (useL1) {  // L1 decay
    if (fini) {
      for (size_t i = 0; i < this->height_; ++i) {
        real* v = value.rowBuf(i);
        int* t = t0.getData() + i;
        if (t[0] < currentTime) {
          // W(t0) -> W(t+1)
          int tDiff = currentTime - t[0];
          real delta = tDiff * learningRate * decayRate;
          simd::decayL1(v, v, delta, this->width_);
        }
      }
      return;
    }  // else

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      int* t = t0.getData() + localIndices[i];
      if (t[0] < currentTime) {
        // W(t0) -> W(t)
        int tDiff = currentTime - t[0];
        real delta = tDiff * learningRate * decayRate;
        simd::decayL1(v, v, delta, this->width_);
      }

      // W(t) -> W(t+1)
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] -= learningRate * g[j];
      }
      simd::decayL1(v, v, learningRate*decayRate, this->width_);

      // state update to t+1
      t[0] = currentTime + 1;
    }

  } else {  // L2 decay
    if (fini) {
      for (size_t i = 0; i < this->height_; ++i) {
        real* v = value.rowBuf(i);
        int* t = t0.getData() + i;
        if (t[0] < currentTime) {
          // W(t0) -> W(t+1)
          int tDiff = currentTime - t[0];
          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
          for (size_t j = 0; j < this->width_; ++j) {
            v[j] *= recip;
          }
        }
      }
      return;
    }  // else

    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      int* t = t0.getData() + localIndices[i];
      if (t[0] < currentTime) {
        // W(t0) -> W(t)
        int tDiff = currentTime - t[0];
        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
        for (size_t j = 0; j < this->width_; ++j) {
          v[j] *= recip;
        }
      }

      // W(t) -> W(t+1)
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] = recipDecay * (v[j] - learningRate * g[j]);
      }

      // state update to t+1
      t[0] = currentTime + 1;
    }
  }
}

void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector<uint32_t>& ids,
                               size_t tid, size_t numThreads) {
  CHECK(!dest.useGpu_);
  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);

  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < localIndices.size(); ++i) {
    uint32_t id = localIndices[i];
    if (id % numThreads == tid) {
      simd::addTo(dest.rowBuf(id), getLocalRow(i),
                  this->width_);
      ids.push_back(id);
    }
  }
}

void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, size_t tid,
                               size_t numThreads) {
  CHECK(!dest.useGpu_);
  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);

  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < localIndices.size(); ++i) {
    uint32_t id = localIndices[i];
    if (id % numThreads == tid) {
      dest.checkIndex(id);
      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
    }
  }
}

void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < localIndices.size(); ++i) {
    uint32_t id = localIndices[i];
    if (id % numThreads == tid) {
      memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));
    }
  }
}

void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b,
                                     real scaleAB, real scaleT) {
  CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(a, b, this, scaleAB,
                                                        scaleT);
}

void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
                            real scaleT) {
  CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);
}

void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < len; i ++) {
    CHECK_LT(*(ids + i), this->getHeight())
      << "id:" << *(ids + i) << "Height:" << this->getHeight()
      << "sparse id value exceeds the max input dimension, "
      << "it could be caused invalid input data samples";
  }
  localIndices.insert(localIndices.end(), ids, ids + len);
}

void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
  CHECK(mat) << "only support sparse matrix";
  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
          mat->getElementCnt());
}

void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  size_t numSamples = ids->getSize();
  int* index = ids->getData();
  for (size_t i = 0; i < numSamples; ++i) {
    if (index[i] == -1) continue;

    unsigned int id = (unsigned int)index[i];
    CHECK_LT(id, this->getHeight())
      << "id:" << id << "Height:" << this->getHeight()
      << "sparse id value exceeds the max input dimension, "
      << "it could be caused invalid input data samples";
    localIndices.push_back(id);
  }
}

void SparsePrefetchRowCpuMatrix::setupIndices() {
  auto& localIndices = indexDictHandle_->localIndices;
  uniqueIds(localIndices);
  // for each sparse row
  for (size_t id = 0; id < localIndices.size(); ++id) {
    globalIndices_[localIndices[id]] = id;  // sparse row -> local id
  }
  checkStoreSize();
}

void SparseRowCpuMatrix::checkIndices() {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < localIndices.size(); ++i) {
    CHECK_EQ(globalIndices_[localIndices[i]], i);
  }
  checkStoreSize();
}

}  // namespace paddle
fix dash and space bug, ISSUE=4586495 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1408 1ad973e4-5ce8-4261-8a94-b56d1f490c56 9 years ago			`/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`


			`#include "SparseRowMatrix.h"`
			`#include "CpuSparseMatrix.h"`

			`#include <cmath>`
			`#include <algorithm>`

			`#include "paddle/utils/Logging.h"`

			`#include "SIMDFunctions.h"`

			`#include "paddle/utils/Util.h"`
			`#include "paddle/utils/Thread.h"`

			`P_DEFINE_bool(allow_inefficient_sparse_update, false,`
			`"Whether to allow inefficient sparse update");`

			`namespace paddle {`

			`const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;`

			`void SparseRowCpuMatrix::init(size_t height, size_t width) {`
			`// @TODO(yuyang18) Just remove this limit`
			`CHECK(simd::vec_check(width)) << width;`
			`height_ = height;`
			`if (!indexDictHandle_) {`
			`indexDictHandle_.reset(new IndexDict);`
			`indexDictHandle_->globalIndices.assign(height, kUnusedId_);`
			`}`
			`localIndices_ = &indexDictHandle_->localIndices;`
			`globalIndices_ = indexDictHandle_->globalIndices.data();`
			`}`

			`void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,`
			`real scaleT) {`
			`CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);`
			`}`

			`void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {`
			`LOG(FATAL) << "This should not be called";`
			`}`

			`void SparseRowCpuMatrix::zeroMem() {`
			`apply(`
			`[](real* buf, size_t len) {`
			`memset(buf, 0, sizeof(real) * len);`
			`});`
			`clearRows();`
			`}`

			`void SparseRowCpuMatrix::applyL1Decay(real learningRate, real decayRate) {`
			`apply([=](real* buf, size_t len) {`
			`CpuVector value(0, nullptr);`
			`value.subVecFrom(buf, 0, len);`
			`value.applyL1(learningRate, decayRate);`
			`});`
			`}`

			`void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0,`
			`real learningRate, int currentTime,`
			`real decayRate, bool useL1, bool fini) {`
			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`

			`// t0 and value are vectors`
			`CHECK_EQ(t0.getSize(), this->height_);`
			`CHECK_EQ(value.width_, this->height_ * this->width_);`

			`if (decayRate == 0.0f) {`
			`if (fini) {`
			`return;`
			`}`

			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`real* g = getLocalRow(i);`
			`real* v = value.rowBuf(localIndices[i]);`
			`for (size_t j = 0; j < this->width_; ++j) {`
			`v[j] -= learningRate * g[j];`
			`}`
			`}`
			`return;`
			`} // else`

			`if (useL1) { // L1 decay`
			`if (fini) {`
			`for (size_t i = 0; i < this->height_; ++i) {`
			`real* v = value.rowBuf(i);`
			`int* t = t0.getData() + i;`
			`if (t[0] < currentTime) {`
			`// W(t0) -> W(t+1)`
			`int tDiff = currentTime - t[0];`
			`real delta = tDiff * learningRate * decayRate;`
			`simd::decayL1(v, v, delta, this->width_);`
			`}`
			`}`
			`return;`
			`} // else`

			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`real* g = getLocalRow(i);`
			`real* v = value.rowBuf(localIndices[i]);`
			`int* t = t0.getData() + localIndices[i];`
			`if (t[0] < currentTime) {`
			`// W(t0) -> W(t)`
			`int tDiff = currentTime - t[0];`
			`real delta = tDiff * learningRate * decayRate;`
			`simd::decayL1(v, v, delta, this->width_);`
			`}`

			`// W(t) -> W(t+1)`
			`for (size_t j = 0; j < this->width_; ++j) {`
			`v[j] -= learningRate * g[j];`
			`}`
			`simd::decayL1(v, v, learningRate*decayRate, this->width_);`

			`// state update to t+1`
			`t[0] = currentTime + 1;`
			`}`

			`} else { // L2 decay`
			`if (fini) {`
			`for (size_t i = 0; i < this->height_; ++i) {`
			`real* v = value.rowBuf(i);`
			`int* t = t0.getData() + i;`
			`if (t[0] < currentTime) {`
			`// W(t0) -> W(t+1)`
			`int tDiff = currentTime - t[0];`
			`real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);`
			`for (size_t j = 0; j < this->width_; ++j) {`
			`v[j] *= recip;`
			`}`
			`}`
			`}`
			`return;`
			`} // else`

			`real recipDecay = 1.0f / (1.0f + learningRate * decayRate);`

			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`real* g = getLocalRow(i);`
			`real* v = value.rowBuf(localIndices[i]);`
			`int* t = t0.getData() + localIndices[i];`
			`if (t[0] < currentTime) {`
			`// W(t0) -> W(t)`
			`int tDiff = currentTime - t[0];`
			`real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);`
			`for (size_t j = 0; j < this->width_; ++j) {`
			`v[j] *= recip;`
			`}`
			`}`

			`// W(t) -> W(t+1)`
			`for (size_t j = 0; j < this->width_; ++j) {`
			`v[j] = recipDecay * (v[j] - learningRate * g[j]);`
			`}`

			`// state update to t+1`
			`t[0] = currentTime + 1;`
			`}`
			`}`
			`}`

			`void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector<uint32_t>& ids,`
			`size_t tid, size_t numThreads) {`
			`CHECK(!dest.useGpu_);`
			`CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);`

			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`uint32_t id = localIndices[i];`
			`if (id % numThreads == tid) {`
			`simd::addTo(dest.rowBuf(id), getLocalRow(i),`
			`this->width_);`
			`ids.push_back(id);`
			`}`
			`}`
			`}`

			`void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, size_t tid,`
			`size_t numThreads) {`
			`CHECK(!dest.useGpu_);`
			`CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);`

			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`uint32_t id = localIndices[i];`
			`if (id % numThreads == tid) {`
			`dest.checkIndex(id);`
			`simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);`
			`}`
			`}`
			`}`

			`void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {`
			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`uint32_t id = localIndices[i];`
			`if (id % numThreads == tid) {`
			`memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));`
			`}`
			`}`
			`}`

			`void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b,`
			`real scaleAB, real scaleT) {`
			`CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(a, b, this, scaleAB,`
			`scaleT);`
			`}`

			`void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,`
			`real scaleT) {`
			`CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);`
			`}`

			`void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {`
			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
add input sparse data check for sparse layer at runtime (#247) * add input sparse data check for sparse layer at runtime, to avoid invalid data access at pserver end while doing prefetch * remote sparse design support binary sparse and float saprse both 8 years ago			`for (size_t i = 0; i < len; i ++) {`
			`CHECK_LT(*(ids + i), this->getHeight())`
			`<< "id:" << *(ids + i) << "Height:" << this->getHeight()`
			`<< "sparse id value exceeds the max input dimension, "`
			`<< "it could be caused invalid input data samples";`
			`}`
fix dash and space bug, ISSUE=4586495 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1408 1ad973e4-5ce8-4261-8a94-b56d1f490c56 9 years ago			`localIndices.insert(localIndices.end(), ids, ids + len);`
			`}`

			`void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {`
			`CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());`
add input sparse data check for sparse layer at runtime (#247) * add input sparse data check for sparse layer at runtime, to avoid invalid data access at pserver end while doing prefetch * remote sparse design support binary sparse and float saprse both 8 years ago			`CHECK(mat) << "only support sparse matrix";`
fix dash and space bug, ISSUE=4586495 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1408 1ad973e4-5ce8-4261-8a94-b56d1f490c56 9 years ago			`addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),`
			`mat->getElementCnt());`
			`}`

			`void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {`
			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
			`size_t numSamples = ids->getSize();`
			`int* index = ids->getData();`
			`for (size_t i = 0; i < numSamples; ++i) {`
			`if (index[i] == -1) continue;`
add input sparse data check for sparse layer at runtime (#247) * add input sparse data check for sparse layer at runtime, to avoid invalid data access at pserver end while doing prefetch * remote sparse design support binary sparse and float saprse both 8 years ago
			`unsigned int id = (unsigned int)index[i];`
			`CHECK_LT(id, this->getHeight())`
			`<< "id:" << id << "Height:" << this->getHeight()`
			`<< "sparse id value exceeds the max input dimension, "`
			`<< "it could be caused invalid input data samples";`
			`localIndices.push_back(id);`
fix dash and space bug, ISSUE=4586495 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1408 1ad973e4-5ce8-4261-8a94-b56d1f490c56 9 years ago			`}`
			`}`

			`void SparsePrefetchRowCpuMatrix::setupIndices() {`
			`auto& localIndices = indexDictHandle_->localIndices;`
			`uniqueIds(localIndices);`
			`// for each sparse row`
			`for (size_t id = 0; id < localIndices.size(); ++id) {`
			`globalIndices_[localIndices[id]] = id; // sparse row -> local id`
			`}`
			`checkStoreSize();`
			`}`

			`void SparseRowCpuMatrix::checkIndices() {`
			`std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;`
			`for (size_t i = 0; i < localIndices.size(); ++i) {`
			`CHECK_EQ(globalIndices_[localIndices[i]], i);`
			`}`
			`checkStoreSize();`
			`}`

			`} // namespace paddle`