introducing new C++ API

pull/2773/head
ervinzhang 5 years ago
parent 2f565f4c20
commit bd5a777f81

@ -17,6 +17,10 @@ else()
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
endif()
if (ENABLE_PYTHON)
add_compile_definitions(ENABLE_PYTHON)
endif()
set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb -fno-inline-functions -fno-omit-frame-pointer -Wl,--allow-shlib-undefined -D_LIBCPP_INLINE_VISIBILITY='' -D'_LIBCPP_EXTERN_TEMPLATE(...)=' -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2 -Wno-cpp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I/usr/local/include -std=c++17 -Werror -Wall -Wno-deprecated-declarations -fPIC")
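The new ENABLE_PYTHON compile definition above is the switch the C++ sources later in this diff key off. As a minimal sketch (illustrative only, not part of the commit), the guard pattern looks like this:

#ifdef ENABLE_PYTHON
#include "pybind11/pybind11.h"  // pybind11-only headers stay behind the guard
#endif

void Example() {  // hypothetical function for illustration
#ifdef ENABLE_PYTHON
  // Python-aware build: pybind11 symbols may be used here.
#else
  // Pure C++ build: no Python interpreter or pybind11 dependency is linked in.
#endif
}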

@ -25,7 +25,7 @@ usage()
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E]"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
echo ""
echo "Options:"
echo " -d Debug mode"
@ -56,6 +56,7 @@ usage()
echo " -s Enable serving module, default off"
echo " -B Enable debugger, default off"
echo " -E Enable IBVERBS for parameter server, default off"
echo " -l Compile with python dependency, default on"
}
# check value of input is 'on' or 'off'
@ -98,9 +99,10 @@ checkopts()
ENABLE_SERVING="off"
ENABLE_DEBUGGER="off"
ENABLE_IBVERBS="off"
ENABLE_PYTHON="on"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:sB:E' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@ -151,6 +153,10 @@ checkopts()
check_on_off $OPTARG p
ENABLE_PROFILE="$OPTARG"
;;
l)
check_on_off $OPTARG l
ENABLE_PYTHON="$OPTARG"
;;
i)
INC_BUILD="on"
;;
@ -316,6 +322,7 @@ build_mindspore()
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
fi
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
if [[ "X$ENABLE_MPI" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_MPI=ON"
fi

@ -19,6 +19,7 @@ option(ENABLE_MPI "enable mpi" OFF)
option(ENABLE_AKG "enable akg" OFF)
option(ENABLE_DEBUGGER "enable debugger" OFF)
option(ENABLE_IBVERBS "enable IBVERBS for parameter server" OFF)
option(ENABLE_PYTHON "Enable python" ON)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if (WIN32)

@ -39,6 +39,7 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/device/ascend/platform)
include_directories(${CMAKE_BINARY_DIR}) # for protobuf generated .h
include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/mindrecord/include)
include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/dataset/include)
######################################################################
####################### Flags ########################################
@ -67,7 +68,10 @@ add_dependencies(engine-gnn core)
add_dependencies(engine core)
add_dependencies(text core)
add_dependencies(text-kernels core)
add_dependencies(APItoPython core)
add_dependencies(cpp-API core)
if (ENABLE_PYTHON)
add_dependencies(APItoPython core)
endif()
if (ENABLE_TDTQUE)
add_dependencies(engine-tdt core)
endif ()
@ -78,7 +82,7 @@ set(submodules
$<TARGET_OBJECTS:kernels>
$<TARGET_OBJECTS:kernels-image>
$<TARGET_OBJECTS:kernels-data>
$<TARGET_OBJECTS:APItoPython>
$<TARGET_OBJECTS:cpp-API>
$<TARGET_OBJECTS:engine-datasetops-source>
$<TARGET_OBJECTS:engine-datasetops-source-sampler>
$<TARGET_OBJECTS:engine-gnn>
@ -90,6 +94,12 @@ set(submodules
$<TARGET_OBJECTS:text-kernels>
)
if (ENABLE_PYTHON)
set(submodules
${submodules}
$<TARGET_OBJECTS:APItoPython>)
endif()
if (ENABLE_TDTQUE)
add_library(_c_dataengine SHARED ${submodules} $<TARGET_OBJECTS:engine-tdt>)
else ()

@ -1,7 +1,16 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(APItoPython OBJECT
de_pipeline.cc
python_bindings.cc
if (ENABLE_PYTHON)
add_library(APItoPython OBJECT
de_pipeline.cc
python_bindings.cc
)
target_include_directories(APItoPython PRIVATE ${pybind11_INCLUDE_DIRS})
endif()
add_library(cpp-API OBJECT
datasets.cc
iterator.cc
transforms.cc
samplers.cc
)
target_include_directories(APItoPython PRIVATE ${pybind11_INCLUDE_DIRS})

File diff suppressed because it is too large

@ -0,0 +1,101 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/include/iterator.h"
#include "dataset/core/client.h"
#include "dataset/include/datasets.h"
namespace mindspore {
namespace dataset {
namespace api {
// Get the next row from the data pipeline.
void Iterator::GetNextRow(TensorMap *row) {
Status rc = iterator_->GetNextAsMap(row);
if (rc.IsError()) {
MS_LOG(ERROR) << "GetNextRow: Failed to get next row.";
row->clear();
}
}
// Shut down the data pipeline.
void Iterator::Stop() {
// Release the iterator_ unique_ptr. This triggers the destructor of iterator_.
iterator_.reset();
// Release ownership of tree_ shared pointer. This will decrement the ref count.
tree_.reset();
}
// Function to build and launch the execution tree.
Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) {
// One time init
Status rc;
rc = GlobalInit();
RETURN_IF_NOT_OK(rc);
// Instantiate the execution tree
tree_ = std::make_shared<ExecutionTree>();
// Iterative BFS converting Dataset tree into runtime Execution tree.
std::queue<std::pair<std::shared_ptr<Dataset>, std::shared_ptr<DatasetOp>>> q;
if (ds != nullptr) {
// Convert the current root node.
auto root_op = ds->Build()->front();
RETURN_UNEXPECTED_IF_NULL(root_op);
RETURN_IF_NOT_OK(tree_->AssociateNode(root_op));
q.push(std::make_pair(ds, root_op));
// Traverse down to the children and convert them to the corresponding DatasetOps (i.e. execution tree nodes)
while (!q.empty()) {
auto node_pair = q.front();
q.pop();
// Iterate through all the direct children of the first element in our BFS queue
for (auto child : node_pair.first->children) {
auto child_ops = child->Build();
RETURN_UNEXPECTED_IF_NULL(child_ops);
auto node_op = node_pair.second;
// Iterate through all the DatasetOps returned by calling Build on the last Dataset object, associate them
// with the execution tree and add the child and parent relationship between the nodes
// Note that some Dataset objects might return more than one DatasetOps
// e.g. MapDataset will return MapOp and ProjectOp if project_columns is set for MapDataset
for (auto child_op : *child_ops) {
RETURN_IF_NOT_OK(tree_->AssociateNode(child_op));
RETURN_IF_NOT_OK(node_op->AddChild(child_op));
node_op = child_op;
}
// Add the child and the last element of the returned DatasetOps (which is now the leaf node in our current
// execution tree) to the BFS queue
q.push(std::make_pair(child, child_ops->back()));
}
}
RETURN_IF_NOT_OK(tree_->AssignRoot(root_op));
}
// Launch the execution tree.
RETURN_IF_NOT_OK(tree_->Prepare());
RETURN_IF_NOT_OK(tree_->Launch());
iterator_ = std::make_unique<DatasetIterator>(tree_);
RETURN_UNEXPECTED_IF_NULL(iterator_);
return rc;
}
} // namespace api
} // namespace dataset
} // namespace mindspore
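Taken together, GetNextRow() and Stop() give callers a simple pull loop. Below is a hedged sketch of a consumer, not part of this commit: `iter` is assumed to be a std::shared_ptr<Iterator> already bound to a Dataset (the Dataset factory functions live in the datasets.cc diff, which is suppressed below), TensorMap is assumed to map column names to std::shared_ptr<Tensor>, and an empty row is assumed to signal end of data:

TensorMap row;
iter->GetNextRow(&row);
while (!row.empty()) {        // assumption: an empty row marks the end of the pipeline
  auto image = row["image"];  // "image" is a hypothetical column name
  // ... consume the tensor ...
  iter->GetNextRow(&row);
}
iter->Stop();                 // releases the DatasetIterator, then the execution tree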

@ -297,7 +297,7 @@ void bindTensor(py::module *m) {
}))
.def_buffer([](Tensor &tensor) {
py::buffer_info info;
THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info));
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
return info;
})
.def("__str__", &Tensor::ToString)
@ -311,7 +311,7 @@ void bindTensor(py::module *m) {
return res;
}
py::buffer_info info;
THROW_IF_ERROR(Tensor::GetBufferInfo(tensor, &info));
THROW_IF_ERROR(Tensor::GetBufferInfo(&tensor, &info));
return py::array(pybind11::dtype(info), info.shape, info.strides, info.ptr, t);
});

@ -0,0 +1,224 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/include/samplers.h"
#include "dataset/engine/datasetops/source/sampler/sampler.h"
#include "dataset/engine/datasetops/source/sampler/distributed_sampler.h"
#include "dataset/engine/datasetops/source/sampler/random_sampler.h"
#include "dataset/engine/datasetops/source/sampler/sequential_sampler.h"
#include "dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
#include "dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
#include "dataset/engine/datasetops/source/sampler/pk_sampler.h"
namespace mindspore {
namespace dataset {
namespace api {
SamplerObj::SamplerObj() {}
/// Function to create a Distributed Sampler.
std::shared_ptr<DistributedSamplerObj> DistributedSampler(int64_t num_shards, int64_t shard_id, bool shuffle,
int64_t num_samples, uint32_t seed) {
auto sampler = std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/// Function to create a PK Sampler.
std::shared_ptr<PKSamplerObj> PKSampler(int64_t num_val, bool shuffle, int64_t num_samples) {
auto sampler = std::make_shared<PKSamplerObj>(num_val, shuffle, num_samples);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/// Function to create a Random Sampler.
std::shared_ptr<RandomSamplerObj> RandomSampler(bool replacement, int64_t num_samples) {
auto sampler = std::make_shared<RandomSamplerObj>(replacement, num_samples);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/// Function to create a Sequential Sampler.
std::shared_ptr<SequentialSamplerObj> SequentialSampler(int64_t start_index, int64_t num_samples) {
auto sampler = std::make_shared<SequentialSamplerObj>(start_index, num_samples);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/// Function to create a Subset Random Sampler.
std::shared_ptr<SubsetRandomSamplerObj> SubsetRandomSampler(const std::vector<int64_t> &indices, int64_t num_samples) {
auto sampler = std::make_shared<SubsetRandomSamplerObj>(indices, num_samples);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/// Function to create a Weighted Random Sampler.
std::shared_ptr<WeightedRandomSamplerObj> WeightedRandomSampler(const std::vector<double> &weights, int64_t num_samples,
bool replacement) {
auto sampler = std::make_shared<WeightedRandomSamplerObj>(weights, num_samples, replacement);
// Input validation
if (!sampler->ValidateParams()) {
return nullptr;
}
return sampler;
}
/* ####################################### Derived Sampler classes ################################# */
// DistributedSampler
DistributedSamplerObj::DistributedSamplerObj(int64_t num_shards, int64_t shard_id, bool shuffle, int64_t num_samples,
uint32_t seed)
: num_shards_(num_shards), shard_id_(shard_id), shuffle_(shuffle), num_samples_(num_samples), seed_(seed) {}
bool DistributedSamplerObj::ValidateParams() {
if (num_shards_ <= 0) {
MS_LOG(ERROR) << "DistributedSampler: invalid num_shards: " << num_shards_;
return false;
}
if (shard_id_ < 0 || shard_id_ >= num_shards_) {
MS_LOG(ERROR) << "DistributedSampler: invalid input, shard_id: " << shard_id_ << ", num_shards: " << num_shards_;
return false;
}
if (num_samples_ < 0) {
MS_LOG(ERROR) << "DistributedSampler: invalid num_samples: " << num_samples_;
return false;
}
return true;
}
std::shared_ptr<Sampler> DistributedSamplerObj::Build() {
return std::make_shared<dataset::DistributedSampler>(num_samples_, num_shards_, shard_id_, shuffle_, seed_);
}
// PKSampler
PKSamplerObj::PKSamplerObj(int64_t num_val, bool shuffle, int64_t num_samples)
: num_val_(num_val), shuffle_(shuffle), num_samples_(num_samples) {}
bool PKSamplerObj::ValidateParams() {
if (num_val_ <= 0) {
MS_LOG(ERROR) << "PKSampler: invalid num_val: " << num_val_;
return false;
}
if (num_samples_ < 0) {
MS_LOG(ERROR) << "PKSampler: invalid num_samples: " << num_samples_;
return false;
}
return true;
}
std::shared_ptr<Sampler> PKSamplerObj::Build() {
return std::make_shared<dataset::PKSampler>(num_samples_, num_val_, shuffle_);
}
// RandomSampler
RandomSamplerObj::RandomSamplerObj(bool replacement, int64_t num_samples)
: replacement_(replacement), num_samples_(num_samples) {}
bool RandomSamplerObj::ValidateParams() {
if (num_samples_ < 0) {
MS_LOG(ERROR) << "RandomSampler: invalid num_samples: " << num_samples_;
return false;
}
return true;
}
std::shared_ptr<Sampler> RandomSamplerObj::Build() {
bool reshuffle_each_epoch = true;
auto sampler = std::make_shared<dataset::RandomSampler>(num_samples_, replacement_, reshuffle_each_epoch);
return sampler;
}
// SequentialSampler
SequentialSamplerObj::SequentialSamplerObj(int64_t start_index, int64_t num_samples)
: start_index_(start_index), num_samples_(num_samples) {}
bool SequentialSamplerObj::ValidateParams() {
if (num_samples_ < 0) {
MS_LOG(ERROR) << "SequentialSampler: invalid num_samples: " << num_samples_;
return false;
}
if (start_index_ < 0) {
MS_LOG(ERROR) << "SequentialSampler: invalid start_index: " << start_index_;
return false;
}
return true;
}
std::shared_ptr<Sampler> SequentialSamplerObj::Build() {
auto sampler = std::make_shared<dataset::SequentialSampler>(num_samples_, start_index_);
return sampler;
}
// SubsetRandomSampler
SubsetRandomSamplerObj::SubsetRandomSamplerObj(const std::vector<int64_t> &indices, int64_t num_samples)
: indices_(indices), num_samples_(num_samples) {}
bool SubsetRandomSamplerObj::ValidateParams() {
if (num_samples_ < 0) {
MS_LOG(ERROR) << "SubsetRandomSampler: invalid num_samples: " << num_samples_;
return false;
}
return true;
}
std::shared_ptr<Sampler> SubsetRandomSamplerObj::Build() {
auto sampler = std::make_shared<dataset::SubsetRandomSampler>(num_samples_, indices_);
return sampler;
}
// WeightedRandomSampler
WeightedRandomSamplerObj::WeightedRandomSamplerObj(const std::vector<double> &weights, int64_t num_samples,
bool replacement)
: weights_(weights), num_samples_(num_samples), replacement_(replacement) {}
bool WeightedRandomSamplerObj::ValidateParams() {
if (num_samples_ < 0) {
MS_LOG(ERROR) << "WeightedRandomSampler: invalid num_samples: " << num_samples_;
return false;
}
return true;
}
std::shared_ptr<Sampler> WeightedRandomSamplerObj::Build() {
auto sampler = std::make_shared<dataset::WeightedRandomSampler>(num_samples_, weights_, replacement_);
return sampler;
}
} // namespace api
} // namespace dataset
} // namespace mindspore
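Every factory above funnels through ValidateParams() and returns nullptr on bad input, so a broken configuration surfaces at construction time rather than at pipeline launch. A small usage sketch (argument values are illustrative; any default arguments live in samplers.h, which this diff does not show):

auto sampler = api::DistributedSampler(/*num_shards=*/4, /*shard_id=*/1,
                                       /*shuffle=*/true, /*num_samples=*/0,
                                       /*seed=*/1);
if (sampler == nullptr) {
  // Rejected by ValidateParams(), e.g. shard_id outside [0, num_shards).
} else {
  std::shared_ptr<Sampler> runtime = sampler->Build();  // runtime-side sampler
}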

File diff suppressed because it is too large

@ -1,10 +1,6 @@
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(core OBJECT
${EXAMPLE_SRCS}
${FEATURE_SRCS}
set(DATASET_CORE_SRC_FILES
client.cc
config_manager.cc
cv_tensor.cc
@ -13,6 +9,13 @@ add_library(core OBJECT
tensor.cc
tensor_row.cc
tensor_shape.cc
)
)
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
add_library(core OBJECT ${DATASET_CORE_SRC_FILES} ${EXAMPLE_SRCS} ${FEATURE_SRCS})
add_dependencies(core mindspore::protobuf)
target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS})
if (ENABLE_PYTHON)
target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS})
endif()

@ -25,21 +25,25 @@
#include "dataset/core/tensor_shape.h"
#include "dataset/engine/data_schema.h"
#include "dataset/engine/dataset_iterator.h"
#include "dataset/engine/datasetops/source/mindrecord_op.h"
#include "dataset/engine/datasetops/source/tf_reader_op.h"
#ifdef ENABLE_PYTHON
#include "dataset/engine/datasetops/barrier_op.h"
#include "dataset/engine/datasetops/batch_op.h"
#include "dataset/engine/datasetops/filter_op.h"
#include "dataset/engine/datasetops/source/generator_op.h"
#include "dataset/engine/datasetops/build_vocab_op.h"
#endif
#include "dataset/engine/datasetops/batch_op.h"
#include "dataset/engine/datasetops/dataset_op.h"
#include "dataset/engine/datasetops/device_queue_op.h"
#include "dataset/engine/datasetops/map_op.h"
#include "dataset/engine/datasetops/project_op.h"
#include "dataset/engine/datasetops/rename_op.h"
#include "dataset/engine/datasetops/filter_op.h"
#include "dataset/engine/datasetops/repeat_op.h"
#include "dataset/engine/datasetops/skip_op.h"
#include "dataset/engine/datasetops/shuffle_op.h"
#include "dataset/engine/datasetops/source/generator_op.h"
#include "dataset/engine/datasetops/source/mindrecord_op.h"
#include "dataset/engine/datasetops/source/tf_reader_op.h"
#include "dataset/engine/datasetops/take_op.h"
#include "dataset/engine/datasetops/zip_op.h"
#include "dataset/engine/datasetops/concat_op.h"

@ -32,6 +32,12 @@ enum class DatasetType { kUnknown, kArrow, kTf };
// Possible flavours of Tensor implementations
enum class TensorImpl { kNone, kFlexible, kCv, kNP };
// Possible values for Border types
enum class BorderType { kConstant = 0, kEdge = 1, kReflect = 2, kSymmetric = 3 };
// Possible interpolation modes
enum class InterpolationMode { kLinear = 0, kNearestNeighbour = 1, kCubic = 2, kArea = 3 };
// convenience functions for 32bit int bitmask
inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; }
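Note that BitTest() only reports true when every bit of the mask is set, which is stricter than the usual non-zero (bits & mask) test:

uint32_t bits = 0x5;          // binary 0101
bool a = BitTest(bits, 0x1);  // true : bit 0 is set
bool b = BitTest(bits, 0x3);  // false: the mask needs bits 0 and 1, but bit 1 is clear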

@ -14,11 +14,12 @@
* limitations under the License.
*/
#include "dataset/core/data_type.h"
#ifdef ENABLE_PYTHON
#include "dataset/core/pybind_support.h"
#endif
#include "utils/log_adapter.h"
#include "dataset/core/pybind_support.h"
namespace mindspore {
namespace dataset {
@ -29,12 +30,14 @@ uint8_t DataType::SizeInBytes() const {
return 0;
}
#ifdef ENABLE_PYTHON
py::dtype DataType::AsNumpyType() const {
if (type_ < DataType::NUM_OF_TYPES)
return py::dtype(kTypeInfo[type_].pybindType_);
else
return py::dtype("unknown");
}
#endif
uint8_t DataType::AsCVType() const {
uint8_t res = kCVInvalidType;
@ -112,6 +115,7 @@ std::string DataType::ToString() const {
return "unknown";
}
#ifdef ENABLE_PYTHON
DataType DataType::FromNpArray(const py::array &arr) {
if (py::isinstance<py::array_t<bool>>(arr)) {
return DataType(DataType::DE_BOOL);
@ -156,6 +160,7 @@ std::string DataType::GetPybindFormat() const {
}
return res;
}
#endif
} // namespace dataset
} // namespace mindspore

@ -19,14 +19,16 @@
#include <opencv2/core/hal/interface.h>
#include <string>
#ifdef ENABLE_PYTHON
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "dataset/core/constants.h"
#include "dataset/core/pybind_support.h"
namespace py = pybind11;
#else
#include "Eigen/Core"
using float16 = Eigen::half;
#endif
#include "dataset/core/constants.h"
namespace mindspore {
namespace dataset {
@ -59,6 +61,7 @@ class DataType {
const uint8_t cvType_; // OpenCv matching type
};
#ifdef ENABLE_PYTHON
static inline const TypeInfo kTypeInfo[] = {
// name, sizeInBytes, pybindType, formatDescriptor, openCV
{"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN
@ -76,19 +79,38 @@ class DataType {
{"float64", 8, "double", py::format_descriptor<double>::format(), CV_64F}, // DE_FLOAT64
{"string", 0, "bytes", "S", kCVInvalidType} // DE_STRING
};
#else
static inline const TypeInfo kTypeInfo[] = {
// name, sizeInBytes, pybindType, formatDescriptor, openCV
{"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN
{"bool", 1, "bool", "", CV_8U}, // DE_BOOL
{"int8", 1, "int8", "", CV_8S}, // DE_INT8
{"uint8", 1, "uint8", "", CV_8U}, // DE_UINT8
{"int16", 2, "int16", "", CV_16S}, // DE_INT16
{"uint16", 2, "uint16", "", CV_16U}, // DE_UINT16
{"int32", 4, "int32", "", CV_32S}, // DE_INT32
{"uint32", 4, "uint32", "", kCVInvalidType}, // DE_UINT32
{"int64", 8, "int64", "", kCVInvalidType}, // DE_INT64
{"uint64", 8, "uint64", "", kCVInvalidType}, // DE_UINT64
{"float16", 2, "float16", "", CV_16F}, // DE_FLOAT16
{"float32", 4, "float32", "", CV_32F}, // DE_FLOAT32
{"float64", 8, "double", "", CV_64F}, // DE_FLOAT64
{"string", 0, "bytes", "", kCVInvalidType} // DE_STRING
};
#endif
// No arg constructor to create an unknown type
DataType() : type_(DE_UNKNOWN) {}
// Create a type from a given string
// @param type_str
/// \param type_str
explicit DataType(const std::string &type_str);
// Default destructor
~DataType() = default;
// Create a type from a given enum
// @param d
/// \param d
constexpr explicit DataType(Type d) : type_(d) {}
constexpr bool operator==(const DataType a) const { return type_ == a.type_; }
@ -100,49 +122,49 @@ class DataType {
constexpr bool operator!=(const Type a) const { return type_ != a; }
// Disable this usage `if(d)` where d is of type DataType
// @return
/// \return
operator bool() = delete;
// To be used in Switch/case
// @return
/// \return
operator Type() const { return type_; }
// The number of bytes needed to store one value of this type
// @return
/// \return
uint8_t SizeInBytes() const;
// Convert from DataType to OpenCV type
// @return
/// \return
uint8_t AsCVType() const;
// Convert from OpenCV type to DataType
// @param cv_type
// @return
/// \param cv_type
/// \return
static DataType FromCVType(int cv_type);
// Returns a string representation of the type
// @return
/// \return
std::string ToString() const;
// returns true if the template type is the same as the Tensor type_
// @tparam T
// @return true or false
/// \tparam T
/// \return true or false
template <typename T>
bool IsCompatible() const {
return type_ == FromCType<T>();
}
// returns true if the template type is the same as the Tensor type_
// @tparam T
// @return true or false
/// \tparam T
/// \return true or false
template <typename T>
bool IsLooselyCompatible() const;
// << Stream output operator overload
// @notes This allows you to print the info using stream operators
// @param out - reference to the output stream being overloaded
// @param rO - reference to the DataType to display
// @return - the output stream must be returned
/// \note This allows you to print the info using stream operators
/// \param out - reference to the output stream being overloaded
/// \param rO - reference to the DataType to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const DataType &so) {
out << so.ToString();
return out;
@ -151,22 +173,24 @@ class DataType {
template <typename T>
static DataType FromCType();
#ifdef ENABLE_PYTHON
// Convert from DataType to Pybind type
// @return
/// \return
py::dtype AsNumpyType() const;
// Convert from NP type to DataType
// @param type
// @return
/// \param type
/// \return
static DataType FromNpType(const py::dtype &type);
// Convert from NP array to DataType
// @param py array
// @return
/// \param py array
/// \return
static DataType FromNpArray(const py::array &arr);
#endif
// Get the buffer string format of the current type. Used in pybind buffer protocol.
// @return
/// \return
std::string GetPybindFormat() const;
bool IsSignedInt() const {

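With the #else table above, a pure C++ build keeps the size, OpenCV, and string queries and only loses the numpy conversions. A sketch of what remains callable without ENABLE_PYTHON:

DataType t(DataType::DE_FLOAT32);
uint8_t nbytes = t.SizeInBytes();  // 4
uint8_t cv = t.AsCVType();         // CV_32F
std::string name = t.ToString();   // "float32"
// AsNumpyType()/FromNpType()/FromNpArray() compile only under ENABLE_PYTHON.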
@ -28,10 +28,12 @@
#include "dataset/core/constants.h"
#include "dataset/core/cv_tensor.h"
#include "dataset/core/global_context.h"
#ifdef ENABLE_PYTHON
#include "dataset/core/pybind_support.h"
namespace py = pybind11;
#endif
#include "dataset/core/tensor_shape.h"
namespace py = pybind11;
namespace mindspore {
namespace dataset {
// Helper macros for printing tensor elements
@ -155,6 +157,7 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape
MS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape)
: Tensor(TensorShape({static_cast<dsize_t>(bytes_list.value_size())}), DataType(DataType::DE_STRING)) {
// total bytes needed = offset array + strings
@ -194,6 +197,7 @@ Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape
MS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape);
}
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape,
DataType type, const unsigned char *data) {
if (!shape.known()) {
@ -223,6 +227,7 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl
return Status::OK(); // returns base-class shared_ptr
}
#ifdef ENABLE_PYTHON
Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::vector<dsize_t> shape;
for (dsize_t i = 0; i < arr.ndim(); i++) {
@ -297,6 +302,7 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) {
return Status::OK(); // returns base-class shared_ptr
}
#endif
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings,
const TensorShape &shape) {
@ -698,21 +704,24 @@ std::vector<dsize_t> Tensor::Strides() {
return strides;
}
Status Tensor::GetBufferInfo(Tensor &t, py::buffer_info *out) {
CHECK_FAIL_RETURN_UNEXPECTED(t.type().IsNumeric(), "Cannot use GetBufferInfo on tensor of strings.");
#ifdef ENABLE_PYTHON
Status Tensor::GetBufferInfo(Tensor *t, py::buffer_info *out) {
RETURN_UNEXPECTED_IF_NULL(t);
CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(), "Cannot use GetBufferInfo on tensor of strings.");
std::string format_desc = t.type().GetPybindFormat();
std::string format_desc = t->type().GetPybindFormat();
if (format_desc.empty()) {
RETURN_STATUS_UNEXPECTED("Cannot convert DE type to pybind format");
}
*out = py::buffer_info(t.GetMutableBuffer(), /* Pointer to buffer */
t.type().SizeInBytes(), /* Size of one scalar */
format_desc, /* Python struct-style format descriptor */
t.Rank(), /* Number of dimensions */
t.shape().AsVector(), /* Buffer dimensions */
t.Strides());
*out = py::buffer_info(t->GetMutableBuffer(), /* Pointer to buffer */
t->type().SizeInBytes(), /* Size of one scalar */
format_desc, /* Python struct-style format descriptor */
t->Rank(), /* Number of dimensions */
t->shape().AsVector(), /* Buffer dimensions */
t->Strides());
return Status::OK();
}
#endif
template <typename T>
Status Tensor::GetItemAt(T *o, const std::vector<dsize_t> &index) const {
@ -752,6 +761,8 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index)
o->swap(sv);
return Status::OK();
}
#ifdef ENABLE_PYTHON
// Return data as a numpy array; returns Status
Status Tensor::GetDataAsNumpy(py::array *data) {
RETURN_UNEXPECTED_IF_NULL(data_);
@ -815,6 +826,7 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
data_allocator_->deallocate(reinterpret_cast<uchar *>(tmp_data));
return Status::OK();
}
#endif
void Tensor::Squeeze() { shape_ = shape_.Squeeze(); }

@ -26,20 +26,27 @@
#undef HAVE_STDDEF_H
#undef HAVE_STDLIB_H
#endif
#ifdef ENABLE_PYTHON
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#endif
#include "dataset/core/constants.h"
#include "dataset/core/data_type.h"
#include "dataset/core/tensor_shape.h"
#include "dataset/util/allocator.h"
#include "dataset/util/status.h"
#include "proto/example.pb.h"
#ifdef ENABLE_PYTHON
namespace py = pybind11;
#endif
namespace mindspore {
namespace dataset {
class Tensor;
template <typename T>
class Allocator;
using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>;
using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>; // An allocator shared_ptr for Tensors
@ -114,16 +121,17 @@ class Tensor {
static Status CreateTensor(std::shared_ptr<Tensor> *, TensorImpl tensor_impl, const TensorShape &shape, DataType type,
const unsigned char *data = nullptr);
/// Create a copy of the input tensor
/// \param out [out] output tensor to be generated
/// \param in [in] original tensor to be copied
/// \return Status
// Create a copy of the input tensor
// @param out [out] output tensor to be generated
// @param in [in] original tensor to be copied
// @return Status
static Status CreateTensor(std::shared_ptr<Tensor> *out, const std::shared_ptr<Tensor> &in) {
const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
*out = std::allocate_shared<Tensor>(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes());
return Status::OK();
}
#ifdef ENABLE_PYTHON
// A static factory method to create a Tensor from a given py::array.
// @param ptr output argument to hold the created Tensor
// @param arr py::array
@ -132,6 +140,7 @@ class Tensor {
// Helper function to create a tensor from Numpy of strings
static Status CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr);
#endif
// A static factory method to create a Tensor from a given list of strings.
// @param ptr output argument to hold the created Tensor
@ -170,6 +179,7 @@ class Tensor {
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const T &item) {
return CreateTensor<T>(ptr, {item}, TensorShape::CreateScalar());
}
// Create tensor from protobuf bytelist with uint8 or int8 types
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
const TensorShape &shape, const DataType &type, dsize_t pad_size);
@ -346,12 +356,12 @@ class Tensor {
virtual void Squeeze();
/// Calculates the strides of the Tensor
/// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
/// The strides will be {4,2,1}.
/// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
/// The strides will be {16,8,4}.
/// @return vector of integers
// Calculates the strides of the Tensor
// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
// The strides will be {4,2,1}.
// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
// The strides will be {16,8,4}.
// @return vector of integers
std::vector<dsize_t> Strides();
std::string ToString() {
@ -376,6 +386,7 @@ class Tensor {
// Slice string tensors
Status SliceString(std::shared_ptr<Tensor> *out, const std::vector<dsize_t> &indices);
#ifdef ENABLE_PYTHON
// Constructs numpy array from input tensor
// @param data this data is the location of python data
// @return Status code
@ -383,7 +394,8 @@ class Tensor {
Status GetDataAsNumpyStrings(py::array *data);
static Status GetBufferInfo(Tensor &t, py::buffer_info *out);
static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
#endif
// Concatenate based on given tensor, can fill in current tensor with a smaller one, unlike InsertTensor
Status Concatenate(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input);
@ -570,7 +582,7 @@ class Tensor {
// Return a TensorIterator that points to the start of the Tensor.
// It's the user's responsibility to use the correct type that matches the Tensor type
// @tparam T The type of values in the Tensor
// @param T The type of values in the Tensor
// @return TensorIterator
template <typename T>
TensorIterator<T> begin() {

@ -18,7 +18,6 @@
#include "dataset/core/tensor_row.h"
namespace py = pybind11;
namespace mindspore {
namespace dataset {

@ -77,6 +77,7 @@ TensorShape::TensorShape(const TensorShape &shape)
known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape.
}
#ifdef ENABLE_PYTHON
TensorShape::TensorShape(py::list l)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
std::vector<dsize_t> list_c;
@ -89,6 +90,7 @@ TensorShape::TensorShape(py::list l)
}
AddListToShape(list_c);
}
#endif
TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
@ -197,6 +199,7 @@ TensorShape TensorShape::AppendDim(dsize_t dim) const {
return TensorShape(vec);
}
#ifdef ENABLE_PYTHON
py::list TensorShape::AsPyList() {
py::list list;
for (auto i : raw_shape_) {
@ -204,6 +207,7 @@ py::list TensorShape::AsPyList() {
}
return list;
}
#endif
TensorShape TensorShape::Squeeze() const {
std::vector<dsize_t> new_shape;

@ -24,13 +24,16 @@
#include <opencv2/core/mat.hpp>
#ifdef ENABLE_PYTHON
#include "pybind11/pybind11.h"
namespace py = pybind11;
#endif
#include "dataset/core/constants.h"
#include "dataset/util/status.h"
#include "dataset/core/global_context.h"
#include "dataset/util/allocator.h"
namespace py = pybind11;
namespace mindspore {
namespace dataset {
// Class that represents a shape of a Tensor. A shape can be:
@ -43,7 +46,8 @@ namespace dataset {
// -# one or more dim is unknown --> not empty vector --> <d1, d2, d2, d3, ...> where di is unknown\n
// Example: <3,?> (the 1st dim is unknown)\n
// <2,?,?,?> (all dims but the 0th dim are unknown)
// TensorShape supports any dim > 0 and < 2^31-1
/// \brief TensorShape supports any dim > 0 and < 2^31-1
class TensorShape {
public:
static constexpr dsize_t kDimUnknown = -1; // constant for an unknown dimension
@ -51,57 +55,59 @@ class TensorShape {
// Force the compiler to not create a no-arg constructor
TensorShape() = delete;
// Create a Shape from an initialization list (e.g., TensorShape s = {2,2}).
// If one of the dims is set to DIM_UNKNOWN, the shape will be flagged as unknown
// @param list
/// \brief Create a Shape from an initialization list (e.g., TensorShape s = {2,2}).
/// If one of the dims is set to DIM_UNKNOWN, the shape will be flagged as unknown
/// \param[in] list
explicit TensorShape(const std::initializer_list<dsize_t> &list);
// Create a Shape from a vector (e.g., TensorShape s = std::vector<dsize_t>({2,2}) ).
// If one of the dims is set to DIM_UNKNOWN, the shape will be flagged as unknown
// @param list
/// \brief Create a Shape from a vector (e.g., TensorShape s = std::vector<dsize_t>({2,2}) ).
/// If one of the dims is set to DIM_UNKNOWN, the shape will be flagged as unknown
/// \param[in] list
explicit TensorShape(const std::vector<dsize_t> &list);
// Copy constructor
// @param shape
/// \brief Copy constructor
/// \param[in] shape
TensorShape(const TensorShape &shape);
// construct a TensorShape via a python list
// @param py::list l - a list object from python
#ifdef ENABLE_PYTHON
/// \brief construct a TensorShape via a python list
/// \param[in] py::list l - a list object from python
explicit TensorShape(py::list l);
#endif
~TensorShape() = default;
// Create a scalar Shape (i.e., empty shape with known_ = true)
// @return TensorShape
/// \brief Create a scalar Shape (i.e., empty shape with known_ = true)
/// \return TensorShape
static TensorShape CreateScalar() { return TensorShape({}); }
// Create a shape with an unknown rank.
// @return TensorShape
/// \brief Create a shape with an unknown rank.
/// \return TensorShape
static TensorShape CreateUnknownRankShape();
// Create a shape with a known rank.
// @return TensorShape
/// \brief Create a shape with a known rank.
/// \return TensorShape
static TensorShape CreateUnknownShapeWithRank(dsize_t rank);
// Insert a new dim into a copy of the current shape.
// @param dim to be added
// @param axis the index where dim should be added
// @return New modified shape
/// \brief Insert a new dim into a copy of the current shape.
/// \param[in] dim to be added
/// \param[in] axis the index where dim should be added
/// \return New modified shape
TensorShape InsertDim(dsize_t axis, dsize_t dim) const;
// Insert new dim at index 0. For example, <2,4> --> PrependDim(4) --> <4,2,4>
// @param dim
// @return
/// \brief Insert new dim at index 0. For example, <2,4> --> PrependDim(4) --> <4,2,4>
/// \param[in] dim
/// \return
TensorShape PrependDim(dsize_t dim) const;
// Insert a new dim at the end of the shape. For example, <2,4> --> AppendDim(4) --> <2,4,4>
// @param dim
// @return
/// \brief Insert a new dim at the end of the shape. For example, <2,4> --> AppendDim(4) --> <2,4,4>
/// \param[in] dim
/// \return
TensorShape AppendDim(dsize_t dim) const;
// Create a shape based on OpenCV shape and type
// @param cv_size
// @param type int that represents the type in OpenCV, e.g. CV_8U, CV_64S
/// \brief Create a shape based on OpenCV shape and type
/// \param[in] cv_size
/// \param[in] type int that represents the type in OpenCV, e.g. CV_8U, CV_64S
TensorShape(cv::MatSize cv_size, uint32_t type);
dsize_t Size() const { return raw_shape_.size(); }
@ -123,47 +129,50 @@ class TensorShape {
return raw_shape_[index];
}
// Return the Shape as a vector
// @return
/// \brief Return the Shape as a vector
/// \return
std::vector<dsize_t> AsVector() const;
// Returns the class info as a string
// @return
/// \brief Returns the class info as a string
/// \return
std::string ToString() const {
std::stringstream ss;
ss << *this;
return ss.str();
}
// Actual print function used by operator<<
// @param out output string stream
/// \brief Actual print function used by operator<<
/// \param out output string stream
void Print(std::ostream &out) const;
// << Stream output operator overload
// @notes This allows you to print the info using stream operators
// @param out - reference to the output stream being overloaded
// @param rO - reference to the TensorShape to display
// @return - the output stream must be returned
/// \brief << Stream output operator overload
/// This allows you to print the info using stream operators
/// \param[in] out - reference to the output stream being overloaded
/// \param[in] rO - reference to the TensorShape to display
/// \return - the output stream must be returned
friend std::ostream &operator<<(std::ostream &out, const TensorShape &so) {
so.Print(out);
return out;
}
#ifdef ENABLE_PYTHON
py::list AsPyList();
#endif
// Checks if the given index is a valid index for this tensor.
// For example: Tensor<3,4> Index<1,1> is valid. But Index<4,1> or <1> are not.
// @param index
// @return bool
/// \brief Checks if the given index is a valid index for this tensor.
/// For example: Tensor<3,4> Index<1,1> is valid. But Index<4,1> or <1> are not.
/// \param[in] index
/// \return bool
bool IsValidIndex(const std::vector<dsize_t> &index) const;
TensorShape Squeeze() const;
std::vector<dsize_t> Strides() const;
// Returns the location of the item assuming row major memory layout.
// @param index
// @return
/// \brief Returns the location of the item assuming row major memory layout.
/// \param[in] index
/// \param[out] flat_index
/// \return
Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
private:
@ -174,11 +183,11 @@ class TensorShape {
// Vector to keep the strides of the shape. The size is rank+1
std::vector<dsize_t, IntAlloc> strides_;
// Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape.
// @tparam T list
// @param list Iterable list
// @return true if the shape is valid and no overflow would be generated when counting the number of elements.
// False otherwise.
/// \brief Internal utility function to iterate over a list,
/// check if the dim is valid and then insert it into the shape.
/// \param[in] list Iterable list
/// \return true if the shape is valid and no overflow would be generated when counting the number of elements.
/// False otherwise.
template <typename T>
void AddListToShape(const T &list);
};
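A compact sketch tying the helpers documented above together (the values follow the examples in the comments):

TensorShape s({2, 4});                     // known shape <2,4>
TensorShape p = s.PrependDim(4);           // <4,2,4>
TensorShape a = s.AppendDim(4);            // <2,4,4>
bool ok = s.IsValidIndex({1, 3});          // true: inside <2,4>
dsize_t flat;
Status rc = s.ToFlatIndex({1, 3}, &flat);  // row-major, so flat == 1*4 + 3 == 7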

@ -2,13 +2,12 @@ add_subdirectory(source)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(engine-datasetops OBJECT
set(DATASET_ENGINE_DATASETOPS_SRC_FILES
dataset_op.cc
parallel_op.cc
pipeline_op.cc
barrier_op.cc
batch_op.cc
bucket_batch_by_length_op.cc
device_queue_op.cc
map_op.cc
project_op.cc
@ -18,8 +17,18 @@ add_library(engine-datasetops OBJECT
take_op.cc
shuffle_op.cc
zip_op.cc
concat_op.cc
filter_op.cc
build_vocab_op.cc
concat_op.cc
)
if (ENABLE_PYTHON)
set(DATASET_ENGINE_DATASETOPS_SRC_FILES
${DATASET_ENGINE_DATASETOPS_SRC_FILES}
bucket_batch_by_length_op.cc
barrier_op.cc
filter_op.cc
build_vocab_op.cc
)
endif()
add_library(engine-datasetops OBJECT ${DATASET_ENGINE_DATASETOPS_SRC_FILES})

@ -19,7 +19,9 @@
#include <iomanip>
#include "common/utils.h"
#ifdef ENABLE_PYTHON
#include "dataset/core/pybind_support.h"
#endif
#include "dataset/engine/data_buffer.h"
#include "dataset/engine/db_connector.h"
#include "dataset/engine/opt/pass.h"
@ -38,9 +40,14 @@ BatchOp::Builder::Builder(int32_t batch_size) : builder_drop_(false), builder_pa
Status BatchOp::Builder::Build(std::shared_ptr<BatchOp> *ptr) {
RETURN_IF_NOT_OK(SanityCheck());
#ifdef ENABLE_PYTHON
*ptr = std::make_shared<BatchOp>(builder_batch_size_, builder_drop_, builder_pad_, builder_op_connector_size_,
builder_num_workers_, builder_cols_to_map_, builder_batch_size_func_,
builder_batch_map_func_, builder_pad_map_);
#else
*ptr = std::make_shared<BatchOp>(builder_batch_size_, builder_drop_, builder_pad_, builder_op_connector_size_,
builder_num_workers_, builder_cols_to_map_, builder_pad_map_);
#endif
return Status::OK();
}
@ -52,6 +59,7 @@ Status BatchOp::Builder::SanityCheck() {
return err.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, common::SafeCStr(err));
}
#ifdef ENABLE_PYTHON
BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
const std::vector<std::string> &cols_to_map, py::function batch_size_func, py::function batch_map_func,
PadInfo pad_map)
@ -65,6 +73,18 @@ BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size,
pad_info_(pad_map) {
worker_queues_.Init(num_workers, op_queue_size);
}
#else
BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
const std::vector<std::string> &cols_to_map, PadInfo pad_map)
: ParallelOp(num_workers, op_queue_size),
start_batch_size_(batch_size),
drop_(drop),
pad_(pad),
pyfunc_column_names_(cols_to_map),
pad_info_(pad_map) {
worker_queues_.Init(num_workers, op_queue_size);
}
#endif
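For reference, a hedged sketch of constructing the non-Python BatchOp directly (values are illustrative, PadInfo is assumed default-constructible, and the Builder above is the normal entry point):

PadInfo pad_map;  // assumption: an empty PadInfo means no padding
auto op = std::make_shared<BatchOp>(/*batch_size=*/32, /*drop=*/true, /*pad=*/false,
                                    /*op_queue_size=*/16, /*num_workers=*/4,
                                    std::vector<std::string>{}, pad_map);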
Status BatchOp::operator()() {
Status rc = LaunchThreadsAndInitOp();
@ -206,7 +226,9 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
Status BatchOp::MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair,
std::unique_ptr<DataBuffer> *db) {
RETURN_UNEXPECTED_IF_NULL(table_pair.first);
if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc
#ifdef ENABLE_PYTHON
if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc
#endif
if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair.first, pad_info_, column_name_id_map_)); // do padding if needed
(*db) = std::make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone);
std::unique_ptr<TensorQTable> dest_table = std::make_unique<TensorQTable>();
@ -229,6 +251,7 @@ Status BatchOp::EoeReceived(int32_t) {
return Status::OK();
}
#ifdef ENABLE_PYTHON
Status BatchOp::MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
TensorBatchTable input_table;
input_table.reserve(pyfunc_column_names_.size());
@ -259,16 +282,22 @@ Status BatchOp::MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>
}
return Status::OK();
}
#endif
Status BatchOp::GetBatchSize(int32_t *batch_size, CBatchInfo info) {
#ifdef ENABLE_PYTHON
if (batch_size_func_ != nullptr) {
RETURN_IF_NOT_OK(InvokeBatchSizeFunc(batch_size, info));
} else {
(*batch_size) = start_batch_size_;
}
#else
(*batch_size) = start_batch_size_;
#endif
return Status::OK();
}
#ifdef ENABLE_PYTHON
Status BatchOp::InvokeBatchSizeFunc(int32_t *batch_size, CBatchInfo info) {
{
// Acquire Python GIL
@ -336,6 +365,7 @@ Status BatchOp::InvokeBatchMapFunc(TensorBatchTable *input, TensorBatchTable *ou
}
return Status(StatusCode::kOK);
}
#endif
Status BatchOp::PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
const std::unordered_map<std::string, int32_t> &column_name_id_map) {

@ -89,6 +89,7 @@ class BatchOp : public ParallelOp {
return *this;
}
#ifdef ENABLE_PYTHON
// set columns to perform map on
// @param const std::vector<std::string> & cols_to_map - name of columns to perform map on
// @return Builder & reference to builder class object
@ -104,6 +105,7 @@ class BatchOp : public ParallelOp {
builder_batch_size_func_ = batch_size_func;
return *this;
}
#endif
// @param std::shared_ptr<BatchOp> *ptr pointer to shared_ptr, actual return arg
// @return Status - The error code return
@ -121,8 +123,10 @@ class BatchOp : public ParallelOp {
int32_t builder_op_connector_size_;
std::vector<std::string> builder_cols_to_map_;
PadInfo builder_pad_map_;
#ifdef ENABLE_PYTHON
py::function builder_batch_size_func_;
py::function builder_batch_map_func_;
#endif
};
enum batchCtrl : int8_t { kNoCtrl = 0, kEOE = 1, kEOF = 2, kQuit = 3 };
@ -144,6 +148,7 @@ class BatchOp : public ParallelOp {
const int64_t get_epoch_num() const { return epoch_num_; }
};
#ifdef ENABLE_PYTHON
// BatchOp constructor
// @param int32_t batch_size
// @param bool drop
@ -152,6 +157,10 @@ class BatchOp : public ParallelOp {
// @param int32_t num_workers
BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
const std::vector<std::string> &, py::function batch_size_func, py::function batch_map_func, PadInfo pad_map);
#else
BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
const std::vector<std::string> &, PadInfo pad_map);
#endif
// BatchOp destructor
~BatchOp() {}
@ -219,10 +228,13 @@ class BatchOp : public ParallelOp {
// @return Status - The error code return
Status MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair,
std::unique_ptr<DataBuffer> *db);
#ifdef ENABLE_PYTHON
// Function that calls pyfunc to perform map on batch
// @param std::pair<std::unique_ptr<TensorQTable>, batch_stats> *table_pair - contains un-batched tensors
// @return Status - The error code return
Status MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair);
#endif
// @param const PadInfo &pad_info pad info to unpack
// @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
@ -247,6 +259,7 @@ class BatchOp : public ParallelOp {
// @return Status - The error code return
Status LaunchThreadsAndInitOp();
#ifdef ENABLE_PYTHON
// Invoke batch size function with current BatchInfo to generate batch size.
// @return Status - The error code return
Status InvokeBatchSizeFunc(int32_t *batch_size, CBatchInfo info);
@ -254,6 +267,7 @@ class BatchOp : public ParallelOp {
// Invoke batch map function with current BatchInfo to generate tensors to batch.
// @return Status - The error code return
Status InvokeBatchMapFunc(TensorTable *input, TensorTable *output, CBatchInfo info);
#endif
int32_t start_batch_size_;
bool drop_; // bool for whether to drop remainder or not
@ -262,8 +276,10 @@ class BatchOp : public ParallelOp {
PadInfo pad_info_; // column names to perform padding on
std::unique_ptr<ChildIterator> child_iterator_; // child iterator for fetching TensorRows 1 by 1
QueueList<std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>> worker_queues_; // internal queue for syncing worker
#ifdef ENABLE_PYTHON
py::function batch_size_func_; // Function pointer of batch size function
py::function batch_map_func_; // Function pointer of per batch map function
#endif
};
} // namespace dataset
} // namespace mindspore

@ -1,19 +1,32 @@
add_subdirectory(sampler)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(engine-datasetops-source OBJECT
generator_op.cc
set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
io_block.cc
mindrecord_op.cc
tf_reader_op.cc
image_folder_op.cc
mnist_op.cc
voc_op.cc
coco_op.cc
manifest_op.cc
cifar_op.cc
random_data_op.cc
celeba_op.cc
text_file_op.cc
clue_op.cc
)
)
set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
${DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES}
mindrecord_op.cc
tf_reader_op.cc
)
if (ENABLE_PYTHON)
set(DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES
${DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES}
generator_op.cc
voc_op.cc
manifest_op.cc
)
endif()
add_library(engine-datasetops-source OBJECT ${DATASET_ENGINE_DATASETOPS_SOURCE_SRC_FILES})

@ -1,12 +1,21 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(engine-datasetops-source-sampler OBJECT
set(DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SRC_FILES
distributed_sampler.cc
pk_sampler.cc
python_sampler.cc
random_sampler.cc
sampler.cc
sequential_sampler.cc
subset_random_sampler.cc
weighted_random_sampler.cc
)
if (ENABLE_PYTHON)
set(DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SRC_FILES
${DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SRC_FILES}
python_sampler.cc
)
endif()
add_library(engine-datasetops-source-sampler OBJECT ${DATASET_ENGINE_DATASETOPS_SOURCE_SAMPLER_SRC_FILES})

Some files were not shown because too many files have changed in this diff
