|
|
|
@ -43,10 +43,20 @@ double ListProduct(std::vector<T> vec) {
|
|
|
|
|
// entries multiplied by the length of each entry's data type
|
|
|
|
|
class OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
OperatorCost() {
|
|
|
|
|
explicit OperatorCost(bool is_inputs_related) : inputs_related_(is_inputs_related) {
|
|
|
|
|
// this is only for the case when set_is_parameter() and SetInputAndOutputTypeLength() are not invoked
|
|
|
|
|
for (size_t i = 0; i < MAXIMUM_INPUT_NUMBER; ++i) {
|
|
|
|
|
is_parameter_.push_back(false);
|
|
|
|
|
is_parameter_involve_.push_back(false);
|
|
|
|
|
inputs_type_lengths_.push_back(DEFAULT_DATA_TYPE_LENGTH);
|
|
|
|
|
outputs_type_lengths_.push_back(DEFAULT_DATA_TYPE_LENGTH);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
OperatorCost() : inputs_related_(false) {
|
|
|
|
|
// this is only for the case when set_is_parameter() and SetInputAndOutputTypeLength() are not invoked
|
|
|
|
|
for (size_t i = 0; i < MAXIMUM_INPUT_NUMBER; ++i) {
|
|
|
|
|
is_parameter_.push_back(false);
|
|
|
|
|
is_parameter_involve_.push_back(false);
|
|
|
|
|
inputs_type_lengths_.push_back(DEFAULT_DATA_TYPE_LENGTH);
|
|
|
|
|
outputs_type_lengths_.push_back(DEFAULT_DATA_TYPE_LENGTH);
|
|
|
|
|
}
|
|
|
|
@ -54,6 +64,8 @@ class OperatorCost {
|
|
|
|
|
virtual ~OperatorCost() = default;
|
|
|
|
|
|
|
|
|
|
void set_is_parameter(const std::vector<bool>& is_parameter);
|
|
|
|
|
void set_is_parameter_involve(const std::vector<bool>&);
|
|
|
|
|
void set_output_parameter_involve(int);
|
|
|
|
|
void SetInputAndOutputTypeLength(const std::vector<size_t>& input_lengths, const std::vector<size_t>& output_lengths);
|
|
|
|
|
std::vector<size_t> inputs_type_lengths() const { return inputs_type_lengths_; }
|
|
|
|
|
std::vector<size_t> outputs_type_lengths() const { return outputs_type_lengths_; }
|
|
|
|
@ -72,8 +84,19 @@ class OperatorCost {
|
|
|
|
|
const std::vector<TensorInfo>& outputs, const int32_t& stage_id) const = 0;
|
|
|
|
|
virtual double GetBackwardComputationCost(const std::vector<TensorInfo>& inputs,
|
|
|
|
|
const std::vector<TensorInfo>& outputs, const int32_t& stage_id) const = 0;
|
|
|
|
|
// per device PEAK memory cost in a training iteration
|
|
|
|
|
// Typically, the PEAK memory cost contributed by an operator is its output (if the output is parameter-involved),
|
|
|
|
|
// plus necessary inputs.
|
|
|
|
|
virtual double GetMemoryCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs) const;
|
|
|
|
|
|
|
|
|
|
protected:
|
|
|
|
|
// For each input in 'inputs_', a bool variable is true if the corresponding one is a parameter or an output of
|
|
|
|
|
// pre-operator that has parameters as input.
|
|
|
|
|
std::vector<bool> is_parameter_involve_;
|
|
|
|
|
int output_parameter_involve_ = -1; // -1: unset; 0: not parameter_involved; 1: parameter_involved
|
|
|
|
|
// Whether the inputs are related or not? For example, TensorAdd's two inputs are independent (not related), while
|
|
|
|
|
// Mul's two inputs are dependent (related).
|
|
|
|
|
bool inputs_related_;
|
|
|
|
|
// for each input in 'inputs_', there is a bool variable indicating whether the corresponding input is a parameter
|
|
|
|
|
std::vector<bool> is_parameter_;
|
|
|
|
|
// for each input and output, the following vectors record the number of bytes of each element
|
|
|
|
@ -85,7 +108,8 @@ using OperatorCostPtr = std::shared_ptr<OperatorCost>;
|
|
|
|
|
|
|
|
|
|
class MatMulCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
MatMulCost() = default;
|
|
|
|
|
explicit MatMulCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
MatMulCost() : OperatorCost(true) {}
|
|
|
|
|
~MatMulCost() override = default;
|
|
|
|
|
|
|
|
|
|
// per device communication cost
|
|
|
|
@ -108,12 +132,12 @@ class MatMulCost : public OperatorCost {
|
|
|
|
|
double GetBackwardComputationCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
|
const int32_t& stage_id) const override;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
using MatMulCostPtr = std::shared_ptr<MatMulCost>;
|
|
|
|
|
|
|
|
|
|
class ActivationCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
ActivationCost() = default;
|
|
|
|
|
explicit ActivationCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
ActivationCost() : OperatorCost(false) {}
|
|
|
|
|
~ActivationCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -133,14 +157,14 @@ class ActivationCost : public OperatorCost {
|
|
|
|
|
double GetBackwardComputationCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
|
const int32_t& stage_id) const override;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
using ActivationCostPtr = std::shared_ptr<ActivationCost>;
|
|
|
|
|
using TransposeCost = ActivationCost;
|
|
|
|
|
using TransposeCostPtr = std::shared_ptr<TransposeCost>;
|
|
|
|
|
|
|
|
|
|
class SoftmaxCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
SoftmaxCost() = default;
|
|
|
|
|
explicit SoftmaxCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
SoftmaxCost() : OperatorCost(false) {}
|
|
|
|
|
~SoftmaxCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -160,12 +184,12 @@ class SoftmaxCost : public OperatorCost {
|
|
|
|
|
double GetBackwardComputationCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
|
const int32_t&) const override;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
using SoftmaxCostPtr = std::shared_ptr<SoftmaxCost>;
|
|
|
|
|
|
|
|
|
|
class TmpIdentityCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
TmpIdentityCost() = default;
|
|
|
|
|
explicit TmpIdentityCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
TmpIdentityCost() : OperatorCost(false) {}
|
|
|
|
|
~TmpIdentityCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -184,12 +208,15 @@ class TmpIdentityCost : public OperatorCost {
|
|
|
|
|
const int32_t& stage_id) const override;
|
|
|
|
|
double GetBackwardComputationCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
|
const int32_t& stage_id) const override;
|
|
|
|
|
// per device PEAK memory cost in a training iteration
|
|
|
|
|
double GetMemoryCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs) const override;
|
|
|
|
|
};
|
|
|
|
|
using TmpIdentityCostPtr = std::shared_ptr<TmpIdentityCost>;
|
|
|
|
|
|
|
|
|
|
class BatchParallelCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
BatchParallelCost() = default;
|
|
|
|
|
explicit BatchParallelCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
BatchParallelCost() : OperatorCost(false) {}
|
|
|
|
|
~BatchParallelCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -217,7 +244,8 @@ using BatchParallelCostPtr = std::shared_ptr<BatchParallelCost>;
|
|
|
|
|
|
|
|
|
|
class VirtualDatasetCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
VirtualDatasetCost() = default;
|
|
|
|
|
explicit VirtualDatasetCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
VirtualDatasetCost() : OperatorCost(false) {}
|
|
|
|
|
~VirtualDatasetCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -244,12 +272,17 @@ class VirtualDatasetCost : public OperatorCost {
|
|
|
|
|
const int32_t&) const override {
|
|
|
|
|
return 0.0;
|
|
|
|
|
}
|
|
|
|
|
// per device PEAK memory cost in a training iteration
|
|
|
|
|
double GetMemoryCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs) const override {
|
|
|
|
|
return 0.0;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
using VirtualDatasetCostPtr = std::shared_ptr<VirtualDatasetCost>;
|
|
|
|
|
|
|
|
|
|
class GeneratorBaseCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
GeneratorBaseCost() = default;
|
|
|
|
|
explicit GeneratorBaseCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
GeneratorBaseCost() : OperatorCost(false) {}
|
|
|
|
|
~GeneratorBaseCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -283,7 +316,8 @@ using GeneratorBaseCostPtr = std::shared_ptr<GeneratorBaseCost>;
|
|
|
|
|
|
|
|
|
|
class PReLUCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
PReLUCost() = default;
|
|
|
|
|
explicit PReLUCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
PReLUCost() : OperatorCost(true) {}
|
|
|
|
|
~PReLUCost() override = default;
|
|
|
|
|
|
|
|
|
|
// per device communication cost
|
|
|
|
@ -310,7 +344,8 @@ using PReLUCostPtr = std::shared_ptr<PReLUCost>;
|
|
|
|
|
|
|
|
|
|
class OneHotCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
OneHotCost() = default;
|
|
|
|
|
explicit OneHotCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
OneHotCost() : OperatorCost(true) {}
|
|
|
|
|
~OneHotCost() override = default;
|
|
|
|
|
|
|
|
|
|
// per device communication cost
|
|
|
|
@ -337,7 +372,8 @@ using OneHotCostPtr = std::shared_ptr<OneHotCost>;
|
|
|
|
|
|
|
|
|
|
class SoftmaxCrossEntropyWithLogitsCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
SoftmaxCrossEntropyWithLogitsCost() = default;
|
|
|
|
|
explicit SoftmaxCrossEntropyWithLogitsCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
SoftmaxCrossEntropyWithLogitsCost() : OperatorCost(false) {}
|
|
|
|
|
~SoftmaxCrossEntropyWithLogitsCost() override = default;
|
|
|
|
|
|
|
|
|
|
// per device communication cost
|
|
|
|
@ -364,7 +400,8 @@ using SoftmaxCrossEntropyWithLogitsCostPtr = std::shared_ptr<SoftmaxCrossEntropy
|
|
|
|
|
|
|
|
|
|
class ReshapeCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
ReshapeCost() = default;
|
|
|
|
|
explicit ReshapeCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
ReshapeCost() : OperatorCost(true) {}
|
|
|
|
|
|
|
|
|
|
~ReshapeCost() override = default;
|
|
|
|
|
|
|
|
|
@ -396,7 +433,8 @@ using ReshapeCostPtr = std::shared_ptr<ReshapeCost>;
|
|
|
|
|
|
|
|
|
|
class ArithmeticCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
ArithmeticCost() = default;
|
|
|
|
|
explicit ArithmeticCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
ArithmeticCost() : OperatorCost(false) {}
|
|
|
|
|
~ArithmeticCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -425,7 +463,8 @@ using BiasAddCostPtr = std::shared_ptr<BiasAddCost>;
|
|
|
|
|
|
|
|
|
|
class ReduceMethodCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
ReduceMethodCost() = default;
|
|
|
|
|
explicit ReduceMethodCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
ReduceMethodCost() : OperatorCost(true) {}
|
|
|
|
|
~ReduceMethodCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -455,7 +494,8 @@ using ReduceMethodCostPtr = std::shared_ptr<ReduceMethodCost>;
|
|
|
|
|
|
|
|
|
|
class ReduceMeanCost : public ReduceMethodCost {
|
|
|
|
|
public:
|
|
|
|
|
ReduceMeanCost() = default;
|
|
|
|
|
explicit ReduceMeanCost(bool is_inputs_related) : ReduceMethodCost(is_inputs_related) {}
|
|
|
|
|
ReduceMeanCost() : ReduceMethodCost(true) {}
|
|
|
|
|
~ReduceMeanCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetForwardComputationCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -465,7 +505,8 @@ using ReduceMeanCostPtr = std::shared_ptr<ReduceMeanCost>;
|
|
|
|
|
|
|
|
|
|
class GetNextCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
GetNextCost() = default;
|
|
|
|
|
explicit GetNextCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
GetNextCost() : OperatorCost(false) {}
|
|
|
|
|
~GetNextCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -499,7 +540,8 @@ using GetNextCostPtr = std::shared_ptr<GetNextCost>;
|
|
|
|
|
|
|
|
|
|
class DropOutCost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
DropOutCost() = default;
|
|
|
|
|
explicit DropOutCost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
DropOutCost() : OperatorCost(true) {}
|
|
|
|
|
~DropOutCost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
@ -530,7 +572,8 @@ using DropOutCostPtr = std::shared_ptr<DropOutCost>;
|
|
|
|
|
|
|
|
|
|
class GatherV2Cost : public OperatorCost {
|
|
|
|
|
public:
|
|
|
|
|
GatherV2Cost() = default;
|
|
|
|
|
explicit GatherV2Cost(bool is_inputs_related) : OperatorCost(is_inputs_related) {}
|
|
|
|
|
GatherV2Cost() : OperatorCost(true) {}
|
|
|
|
|
~GatherV2Cost() override = default;
|
|
|
|
|
|
|
|
|
|
double GetCommCost(const std::vector<TensorInfo>& inputs, const std::vector<TensorInfo>& outputs,
|
|
|
|
|