/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// This schema uses proto2 syntax (explicit field labels and default values).
syntax = "proto2";

package paddle;

// Options controlling how data files are loaded by file-loading threads.
message FileGroupConf {
  // Capacity of the queue; defaults to 1.
  // NOTE(review): what is queued and the unit of the capacity are not
  // visible in this file -- confirm against the consumer of this message.
  optional uint32 queue_capacity = 1 [ default = 1 ];

  // How many files each file-loading thread loads.
  optional int32 load_file_count = 2 [ default = 1 ];

  // How many threads are used to load files.
  // A value of 5~10 is appropriate when loading files through hadoop vfs.
  optional int32 load_thread_num = 3 [ default = 1 ];
}

// Configuration of one data source. Individual fields apply only to specific
// data providers (ProtoDataProvider, PyDataProvider, MultiDataProvider), as
// noted on the fields themselves.
message DataConfig {
  // NOTE(review): presumably selects the data provider implementation --
  // confirm against the code that reads this field.
  required string type = 1;

  // Name of a text file that contains a list of file names, one per line.
  optional string files = 3;

  // Feature dimension of one frame.
  optional int32 feat_dim = 4;

  // Feature slot dimensions.
  repeated int32 slot_dims = 5;

  // Maximum number of neighbouring frames.
  optional int32 context_len = 6;

  // The number of samples.
  optional uint64 buffer_capacity = 7;

  // Part of the data used in training: if not -1, only part of the training
  // data is used in training.
  optional int64 train_sample_num = 8 [ default = -1 ];

  // The number of documents processed at once.
  optional int32 file_load_num = 9 [ default = -1 ];

  // NOTE(review): presumably enables asynchronous data loading -- confirm.
  optional bool async_load_data = 12 [ default = false ];

  // Note: field numbers 10, 11 and 13 have been deprecated. Do not reuse
  // them for new fields.
  // NOTE(review): they could be declared `reserved` once the minimum
  // supported protoc version is confirmed to accept that statement.

  // Whether this data is for test.
  optional bool for_test = 14 [ default = false ];

  // File loading options, expressed as a FileGroupConf message.
  optional FileGroupConf file_group_conf = 15;

  // NOTE(review): no description in the original; presumably the dimensions
  // of float-valued slots -- confirm.
  repeated int32 float_slot_dims = 16;

  // Note: field numbers 17, 18 and 19 have been deprecated. Do not reuse
  // them for new fields.

  // A list of values which will be used to create additional one-dimensional
  // float value slots. These one-dimensional slots can be used as the weight
  // input for cost layers.
  // Currently this is only supported by ProtoDataProvider.
  repeated double constant_slots = 20;

  // For PyDataProvider: the module name of the load-data script.
  optional string load_data_module = 21;

  // For PyDataProvider: the object name within the load-data script module.
  optional string load_data_object = 22;

  // For PyDataProvider: the user args passed to the load-data script.
  optional string load_data_args = 23;

  // For MultiDataProvider: the sub dataproviders.
  repeated DataConfig sub_data_configs = 24;

  // The ratio of each sub dataprovider.
  // E.g. if sub dataprovider A's ratio is 1, B's ratio is 9 and batch_size
  // is 100, then each mini-batch is combined from 10 instances from A and
  // 90 instances from B.
  optional int32 data_ratio = 25;

  // If one of the sub dataproviders is running out of data, then
  //   (1) if it is "main data", finish the current pass;
  //   (2) if it is not "main data", reset it and try getNextBatch again.
  optional bool is_main_data = 26 [ default = true ];

  // The usage ratio of instances. Setting it to 1.0 means all instances are
  // used.
  optional double usage_ratio = 27 [ default = 1.0 ];
}