You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
416 lines
17 KiB
416 lines
17 KiB
/**
|
|
* Copyright 2019-2020 Huawei Technologies Co., Ltd
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/*!
|
|
* \file candidate_sampling_ops.h
|
|
* \brief Operator prototype registrations for the candidate sampling operators.
|
|
*/
|
|
#ifndef OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_
|
|
#define OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_
|
|
|
|
#include "graph/operator_reg.h"
|
|
|
|
namespace ge {
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with
|
|
a learned unigram distribution. \n
|
|
|
|
*@par Inputs:
|
|
*Input "true_classes" is a 2D matrix.
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection,
|
|
so that all sampled candidates in a batch are unique.
|
|
*This requires some approximation to estimate the post-rejection
|
|
sampling probabilities.
|
|
*@li range_max: The sampler will sample integers from the interval
|
|
[0, range_max).
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled", in which each
|
|
element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing
|
|
the number of times each candidate is expected to occur in a batch of sampled
|
|
candidates. If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled",
|
|
for each sampled candidate.
|
|
*representing the number of times the candidate is expected to occur
|
|
in a batch of sampled candidates.
|
|
* If "unique" is true, then this is a probability.
|
|
|
|
*@attention Constraints:
|
|
*ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU,
|
|
which delivers poor performance. \n
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator ThreadUnsafeUnigramCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(ThreadUnsafeUnigramCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.REQUIRED_ATTR(num_sampled, Int)
|
|
.REQUIRED_ATTR(unique, Bool)
|
|
.REQUIRED_ATTR(range_max, Int)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler)
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with a learned
|
|
unigram distribution. \n
|
|
|
|
*@par Inputs:
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label.
|
|
*Input "true_classes" is a 2D matrix. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection,
|
|
so that all sampled candidates in a batch are unique.
|
|
*This requires some approximation to estimate the post-rejection
|
|
sampling probabilities.
|
|
*@li range_max: The sampler will sample integers from the interval
|
|
[0, range_max).
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled",
|
|
in which each element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
|
|
number of times each candidate is expected to occur
|
|
in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled", for each
|
|
sampled candidate representing the number of times.
|
|
* the candidate is expected to occur in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability. \n
|
|
|
|
*@attention Constraints:
|
|
*UniformCandidateSampler runs on the Ascend AI CPU,
|
|
which delivers poor performance. \n
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator UniformCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(UniformCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.REQUIRED_ATTR(num_sampled, Int)
|
|
.REQUIRED_ATTR(unique, Bool)
|
|
.REQUIRED_ATTR(range_max, Int)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(UniformCandidateSampler)
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with a learned
|
|
unigram distribution. \n
|
|
|
|
*@par Inputs:
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label.
|
|
* Input "true_classes" is a 2D matrix. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection,
|
|
so that all sampled candidates in a batch are unique. This requires
|
|
some approximation to estimate the post-rejection sampling probabilities.
|
|
*@li range_max: The sampler will sample integers from the interval [0, range_max).
|
|
*@li vocab_file: Each valid line in this file (which should have a
|
|
CSV-like format) corresponds to a valid word ID.
|
|
*IDs are in sequential order, starting from num_reserved_ids.
|
|
*@li distortion: The distortion is used to skew the unigram probability
|
|
distribution. Each weight is first raised to the distortion's power before
|
|
adding to the internal unigram distribution.
|
|
*@li num_reserved_ids: Optionally some reserved IDs can be added in the range
|
|
[0, ..., num_reserved_ids) by the users.
|
|
* One use case is that a special unknown word token is used as ID 0.
|
|
*@li num_shards: A sampler can be used to sample from a subset of the
|
|
original range. in order to speed up the whole computation through parallelism.
|
|
*@li shard: A sampler can be used to sample from a subset of the original
|
|
range in order to speed up the whole computation through parallelism.
|
|
*@li unigrams: A list of unigram counts or probabilities, one per ID in
|
|
sequential order.
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled", in which each
|
|
element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
|
|
number of times each candidate is expected to occur in a batch of sampled
|
|
candidates. If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled",
|
|
for each sampled candidate representing the number of times the candidate is
|
|
expected to occur in a batch of sampled candidates.
|
|
If "unique" is true, then this is a probability. \n
|
|
|
|
*@attention Constraints:
|
|
* FixedUnigramCandidateSampler runs on the Ascend AI CPU,
|
|
which delivers poor performance. \n
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator FixedUnigramCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(FixedUnigramCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.ATTR(num_true, Int, 0)
|
|
.ATTR(num_sampled, Int, 0)
|
|
.ATTR(unique, Bool, false)
|
|
.ATTR(range_max, Int, 0)
|
|
.ATTR(vocab_file, String, "")
|
|
.ATTR(distortion, Float, 1.0)
|
|
.ATTR(num_reserved_ids, Int, 0)
|
|
.ATTR(num_shards, Int, 1)
|
|
.ATTR(shard, Int, 0)
|
|
.REQUIRED_ATTR(unigrams, ListFloat)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(FixedUnigramCandidateSampler)
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with a learned
|
|
unigram distribution. \n
|
|
|
|
*@par Inputs:
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label.
|
|
* Input "true_classes" is a 2D matrix. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection,
|
|
so that all sampled candidates in a batch are unique.
|
|
*This requires some approximation to estimate the post-rejection
|
|
sampling probabilities.
|
|
*@li range_max: The sampler will sample integers from the interval
|
|
[0, range_max).
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled", in which each
|
|
element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing
|
|
the number of times each candidate is expected to occur in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled", for each
|
|
sampled candidate representing the number of times the candidate is expected
|
|
to occur in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability. \n
|
|
|
|
*@attention Constraints:
|
|
*LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers
|
|
poor performance. \n
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator LearnedUnigramCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(LearnedUnigramCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.REQUIRED_ATTR(num_sampled, Int)
|
|
.REQUIRED_ATTR(unique, Bool)
|
|
.REQUIRED_ATTR(range_max, Int)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(LearnedUnigramCandidateSampler)
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with a log-uniform
|
|
distribution. \n
|
|
|
|
*@par Inputs:
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label.
|
|
* Input "true_classes" is a 2D matrix. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection, so that all
|
|
sampled candidates in a batch are unique. This requires some approximation
|
|
to estimate the post-rejection sampling probabilities.
|
|
*@li range_max: The sampler will sample integers from the interval
|
|
[0, range_max).
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled", in which each
|
|
element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing
|
|
the number of times each candidate is expected to occur in a batch of sampled
|
|
candidates. If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled", for each
|
|
sampled candidate representing the number of times the candidate is expected
|
|
to occur in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability. \n
|
|
|
|
*@attention Constraints:
|
|
*LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers
|
|
poor performance. \n
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator LogUniformCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(LogUniformCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.REQUIRED_ATTR(num_sampled, Int)
|
|
.REQUIRED_ATTR(unique, Bool)
|
|
.REQUIRED_ATTR(range_max, Int)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(LogUniformCandidateSampler)
|
|
|
|
/**
|
|
*@brief Generates labels for candidate sampling with a learned
|
|
unigram distribution. \n
|
|
|
|
*@par Inputs:
|
|
*true_classes: A "batch_size * num_true" matrix, in which each row contains
|
|
the IDs of the "num_true" "target_classes" in the corresponding original label.
|
|
* Input "true_classes" is a 2D matrix. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li num_sampled: Number of candidates to randomly sample.
|
|
*@li unique: If "unique" is true, samples with rejection,
|
|
so that all sampled candidates in a batch are unique. This requires some
|
|
approximation to estimate the post-rejection sampling probabilities.
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
*@li sampled_candidates: A vector of length "num_sampled",
|
|
in which each element is the ID of a sampled candidate.
|
|
*@li true_expected_count: A "batch_size * num_true" matrix, representing the
|
|
number of times each candidate is expected to occur in a batch of sampled candidates.
|
|
*If "unique" is true, then this is a probability.
|
|
*@li sampled_expected_count: A vector of length "num_sampled", for each
|
|
sampled candidate representing the number of times the candidate is expected
|
|
to occur in a batch of sampled candidates. If "unique" is true, then this is a probability. \n
|
|
|
|
*@attention Constraints:
|
|
*AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance.
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator AllCandidateSampler. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(AllCandidateSampler)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.OUTPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(true_expected_count, TensorType({ DT_FLOAT }))
|
|
.OUTPUT(sampled_expected_count, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.REQUIRED_ATTR(num_sampled, Int)
|
|
.REQUIRED_ATTR(unique, Bool)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(AllCandidateSampler)
|
|
|
|
/**
|
|
*@brief Computes the "ids" of the positions in "sampled_candidates" that
|
|
match "true_labels". \n
|
|
|
|
*@par Inputs:
|
|
* @li Input "true_classes" is a 2D matrix.
|
|
* @li true_classes: The "true_classes" output of UnpackSparseLabels.
|
|
* @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n
|
|
|
|
*@par Attributes:
|
|
*@li num_true: Number of true labels per context.
|
|
*@li seed: If either "seed" or "seed2" are set to be non-zero.
|
|
*@li seed2: A second seed to avoid seed collision. \n
|
|
|
|
*@par Outputs:
|
|
* @li indices: A vector of indices corresponding to rows of "true_candidates".
|
|
* @li ids: A vector of IDs of positions in "sampled_candidates" that match a
|
|
"true_label" for the row with the corresponding index in indices.
|
|
* @li weights: A vector of the same length as "indices" and "ids", in which
|
|
each element is -FLOAT_MAX. \n
|
|
|
|
*@attention Constraints:
|
|
*ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance.
|
|
|
|
*@par Third-party framework compatibility
|
|
*Compatible with the TensorFlow operator ComputeAccidentalHits. \n
|
|
|
|
*@par Restrictions:
|
|
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
|
|
*/
|
|
REG_OP(ComputeAccidentalHits)
|
|
.INPUT(true_classes, TensorType({ DT_INT64 }))
|
|
.INPUT(sampled_candidates, TensorType({ DT_INT64 }))
|
|
.OUTPUT(indices, TensorType({ DT_INT32 }))
|
|
.OUTPUT(ids, TensorType({ DT_INT64 }))
|
|
.OUTPUT(weights, TensorType({ DT_FLOAT }))
|
|
.REQUIRED_ATTR(num_true, Int)
|
|
.ATTR(seed, Int, 0)
|
|
.ATTR(seed2, Int, 0)
|
|
.OP_END_FACTORY_REG(ComputeAccidentalHits)
|
|
|
|
} // namespace ge
|
|
|
|
#endif // OPS_BUILT_IN_OP_PROTO_INC_CANDIDATE_SAMPLING_OPS_H_
|