@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-import pytest
-import mindspore._c_dataengine as cde
 import numpy as np
+import mindspore.dataset as ds
+import mindspore._c_dataengine as cde
 import mindspore.common.dtype as mstype
-from mindspore.dataset.text import to_str
-import mindspore.dataset as ds
+from mindspore.dataset.text import to_str, to_bytes
 
 
 # pylint: disable=comparison-with-itself
 def test_basic():
     x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
     n = cde.Tensor(x)
@@ -28,8 +27,8 @@ def test_basic():
     np.testing.assert_array_equal(x, arr)
 
 
-def compare(strings):
-    arr = np.array(strings, dtype='S')
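+# compare() feeds `strings` through a one-column GeneratorDataset and checks that
+# each row comes back equal to the input cast to byte strings ('S')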
+def compare(strings, dtype='S'):
+    arr = np.array(strings, dtype=dtype)
 
     def gen():
         (yield arr,)
@@ -37,25 +36,51 @@ def compare(strings):
     data = ds.GeneratorDataset(gen, column_names=["col"])
 
     for d in data:
-        np.testing.assert_array_equal(d[0], arr)
+        np.testing.assert_array_equal(d[0], arr.astype('S'))
 
 
 def test_generator():
     compare(["ab"])
     compare(["", ""])
     compare([""])
     compare(["ab", ""])
     compare(["ab", "cde", "121"])
     compare([["ab", "cde", "121"], ["x", "km", "789"]])
     compare([["ab", "", "121"], ["", "km", "789"]])
+    compare(["ab"], dtype='U')
+    compare(["", ""], dtype='U')
+    compare([""], dtype='U')
+    compare(["ab", ""], dtype='U')
+    compare(["", ""], dtype='U')
+    compare(["", "ab"], dtype='U')
+    compare(["ab", "cde", "121"], dtype='U')
+    compare([["ab", "cde", "121"], ["x", "km", "789"]], dtype='U')
+    compare([["ab", "", "121"], ["", "km", "789"]], dtype='U')
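+
+
+# module-level test data shared by test_batching_strings and the TFRecord/MindRecord tests below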
+line = np.array(["This is a text file.",
+                 "Be happy every day.",
+                 "Good luck to everyone."])
+
+words = np.array([["This", "text", "file", "a"],
+                  ["Be", "happy", "day", "b"],
+                  ["女", "", "everyone", "c"]])
+
+chinese = np.array(["今天天气太好了我们一起去外面玩吧",
+                    "男默女泪",
+                    "江州市长江大桥参加了长江大桥的通车仪式"])
 
 
 def test_batching_strings():
     def gen():
-        yield (np.array(["ab", "cde", "121"], dtype='S'),)
+        for row in chinese:
+            yield (np.array(row),)
 
-    data = ds.GeneratorDataset(gen, column_names=["col"]).batch(10)
+    data = ds.GeneratorDataset(gen, column_names=["col"])
+    data = data.batch(2, drop_remainder=True)
 
-    with pytest.raises(RuntimeError) as info:
-        for _ in data:
-            pass
-    assert "[Batch ERROR] Batch does not support" in str(info.value)
+    for d in data:
+        np.testing.assert_array_equal(d[0], to_bytes(chinese[0:2]))
 
 
 def test_map():
@@ -67,7 +92,7 @@ def test_map():
     def split(b):
         s = to_str(b)
         splits = s.item().split()
-        return np.array(splits, dtype='S')
+        return np.array(splits)
 
     data = data.map(input_columns=["col"], operations=split)
     expected = np.array(["ab", "cde", "121"], dtype='S')
@@ -91,19 +116,6 @@ def test_map2():
         np.testing.assert_array_equal(d[0], expected)
 
 
-line = np.array(["This is a text file.",
-                 "Be happy every day.",
-                 "Good luck to everyone."])
-
-words = np.array([["This", "text", "file", "a"],
-                  ["Be", "happy", "day", "b"],
-                  ["女", "", "everyone", "c"]])
-
-chinese = np.array(["今天天气太好了我们一起去外面玩吧",
-                    "男默女泪",
-                    "江州市长江大桥参加了长江大桥的通车仪式"])
-
-
 def test_tfrecord1():
     s = ds.Schema()
     s.add_column("line", "string", [])
@@ -181,6 +193,94 @@ def test_mindrecord():
         np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
 
 
+# The following test cases were copied from test_pad_batch but changed to use strings instead
+
+
+# this generator function yields two columns
+# col1d: [0], [1], [2], [3]
+# col2d: [[100],[200]], [[101],[201]], [[102],[202]], [[103],[203]]
+def gen_2cols(num):
+    for i in range(num):
+        yield (np.array([str(i)]), np.array([[str(i + 100)], [str(i + 200)]]))
+
+
+# this generator function yields one column of variable shapes
+# col: [0], [0,1], [0,1,2], [0,1,2,3]
+def gen_var_col(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]),)
+
+
+# this generator function yields two columns of variable shapes
+# col1: [0], [0,1], [0,1,2], [0,1,2,3]
+# col2: [100], [100,101], [100,101,102], [100,101,102,103]
+def gen_var_cols(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]), np.array([str(100 + j) for j in range(i + 1)]))
+
+
+# this generator function yields two columns of variable shapes (2D)
+# col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
+# col2: [[100]], [[100,101]], [[100,101,102]], [[100,101,102,103]]
+def gen_var_cols_2d(num):
+    for i in range(num):
+        yield (np.array([[str(j) for j in range(i + 1)]]), np.array([[str(100 + j) for j in range(i + 1)]]))
+
+
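+# pad_info maps a column name to (pad_shape, pad_value); as the cases below show,
+# a None pad_shape (or an empty pad_info) pads each column to the longest sample in
+# the batch, and both bytes and str pad values are accepted for string columns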
+def test_batch_padding_01():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], b"-2"), "col1d": ([2], b"-1")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0", b"-1"], [b"1", b"-1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b"-2"], [b"200", b"-2"]], [[b"101", b"-2"], [b"201", b"-2"]]],
+                                      data["col2d"])
+
+
+def test_batch_padding_02():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], "")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0"], [b"1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b""]], [[b"101", b""]]], data["col2d"])
+
+
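+# with a None pad_shape the target shape is taken from the longest element in each
+# batch, so the first batch below is padded to width 2 and the second to width 4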
+def test_batch_padding_03():
+    data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, "PAD_VALUE")})  # pad automatically
+    data1 = data1.repeat(2)
+    res = dict()
+    for ind, data in enumerate(data1.create_dict_iterator()):
+        res[ind] = data["col"].copy()
+    np.testing.assert_array_equal(res[0], [[b"0", b"PAD_VALUE"], [b"0", b"1"]])
+    np.testing.assert_array_equal(res[1], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+    np.testing.assert_array_equal(res[2], [[b"0", b"PAD_VALUE"], [b"0", b"1"]])
+    np.testing.assert_array_equal(res[3], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+
+
+def test_batch_padding_04():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={})  # pad automatically
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"], [[b"0", b""], [b"0", b"1"]])
+        np.testing.assert_array_equal(data["col2"], [[b"100", b""], [b"100", b"101"]])
+
+
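+# a pad_shape of [2, None] pads dimension 0 to two rows and leaves dimension 1 at the
+# batch maximum (three columns here); rows created by the padding are filled entirely
+# with the pad value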
+def test_batch_padding_05():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=3, drop_remainder=False,
+                        pad_info={"col2": ([2, None], "-2"), "col1": (None, "-1")})  # pad automatically
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"],
+                                      [[[b"0", b"-1", b"-1"]], [[b"0", b"1", b"-1"]], [[b"0", b"1", b"2"]]])
+        np.testing.assert_array_equal(data["col2"],
+                                      [[[b"100", b"-2", b"-2"], [b"-2", b"-2", b"-2"]],
+                                       [[b"100", b"101", b"-2"], [b"-2", b"-2", b"-2"]],
+                                       [[b"100", b"101", b"102"], [b"-2", b"-2", b"-2"]]])
+
+
 if __name__ == '__main__':
     test_generator()
     test_basic()
@@ -191,3 +291,8 @@ if __name__ == '__main__':
     test_tfrecord2()
     test_tfrecord3()
     test_mindrecord()
+    test_batch_padding_01()
+    test_batch_padding_02()
+    test_batch_padding_03()
+    test_batch_padding_04()
+    test_batch_padding_05()