/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";

import "DataConfig.proto";
import "ModelConfig.proto";

package paddle;

message OptimizationConfig {
  required int32 batch_size = 3;
  required string algorithm = 4 [ default = "async_sgd" ];
  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];

  required double learning_rate = 7;
  optional double learning_rate_decay_a = 8 [ default = 0 ];
  optional double learning_rate_decay_b = 9 [ default = 0 ];
  optional string learning_rate_schedule = 27 [ default = "constant" ];
  // The learning rate will be scaled according to learning_rate_schedule:
  // 1), constant:
  // lr = learning_rate
  // 2), poly:
  // lr = learning_rate *
  //      pow(1 + learning_rate_decay_a * num_samples_processed,
  //          -learning_rate_decay_b)
  // 3), exp:
  // lr = learning_rate *
  //      pow(learning_rate_decay_a,
  //          num_samples_processed / learning_rate_decay_b)
  // 4), discexp:
  // lr = learning_rate *
  //      pow(learning_rate_decay_a,
  //          floor(num_samples_processed / learning_rate_decay_b))
  // 5), linear:
  // lr = max(learning_rate - learning_rate_decay_a * num_samples_processed,
  //          learning_rate_decay_b)
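  //
  // For example (illustrative values, not defaults): with
  // learning_rate_schedule = "discexp", learning_rate = 0.1,
  // learning_rate_decay_a = 0.5 and learning_rate_decay_b = 1000000,
  // the rate is 0.1 for the first 1000000 samples, 0.05 for the next
  // 1000000 samples, 0.025 for the next 1000000, and so on.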

  // owlqn related
  // L1-regularization
  optional double l1weight = 10 [ default = 0.1 ];
  // L2-regularization
  optional double l2weight = 11 [ default = 0 ];
  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
  // then accept the step
  optional double c1 = 12 [ default = 0.0001 ];
  // multiply the step by "backoff" when the wolfe condition is not satisfied
  optional double backoff = 13 [ default = 0.5 ];
  // how many "s" and "y" vectors are kept in owlqn
  optional int32 owlqn_steps = 14 [ default = 10 ];
  // accept the step once it has been reduced "max_backoff" times
  optional int32 max_backoff = 15 [ default = 5 ];
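  //
  // For example: with the defaults backoff = 0.5 and max_backoff = 5, a
  // rejected step of size 1.0 is retried at 0.5, 0.25, 0.125, 0.0625 and
  // finally 0.03125, at which point it is accepted even if the wolfe
  // condition is still not satisfied.
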
  // The L2-regularization coefficient is reduced linearly from iteration 0 to
  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
  // iterations. Set "l2weight_zero_iter" to 0 to disable this strategy.
  optional int32 l2weight_zero_iter = 17 [ default = 0 ];

  // averaged sgd
  // About average_window * numBatchProcessed parameters are used
  // for averaging. To be precise, between average_window * numBatchProcessed
  // and 2 * average_window * numBatchProcessed parameters are used for
  // averaging.
  optional double average_window = 18 [ default = 0 ];
  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];

  //////////////////////////
  // Options Adaptive SGD //
  //////////////////////////

  // Learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta"
  // or "rmsprop".
  // The default learning method ("momentum") uses the global decayed learning
  // rate with momentum.
  // "adagrad", "adadelta" and "rmsprop" can use momentum as well.
  optional string learning_method = 23 [ default = "momentum" ];
  optional double ada_epsilon = 24 [ default = 1e-6 ];
  optional double ada_rou = 26 [ default = 0.95 ];

  // Force averaging to be done on the CPU in order to save GPU memory.
  optional bool do_average_in_cpu = 25 [ default = false ];

  // Delta add rate in pserver, used when num_batches_per_send_parameter > 1;
  // it will be divided by the number of machines automatically.
  optional double delta_add_rate = 28 [ default = 1.0 ];

  // We split a large batch into smaller mini-batches, whose sizes are
  // determined by mini_batch_size. It only takes effect when there is
  // an ExternalMachine.
  optional int32 mini_batch_size = 29 [ default = 128 ];

  // Automatically set if any one of the parameters sets the sparse remote
  // update flag.
  optional bool use_sparse_remote_updater = 30 [ default = false ];

  // How to update the center parameter and feed it back to the local
  // parameters when using local sgd update in cluster training.
  // One option is elastic_average, proposed by the paper "Deep learning with
  // Elastic Averaging SGD".
  // If the elastic_average method is used, every trainer node should sample
  // from the whole data set.
  optional string center_parameter_update_method = 31 [ default = "average" ];

  // Shrink sparse parameter values.
  // Only works if the parameter uses sparse remote update and has an L1 decay
  // rate.
  optional double shrink_parameter_value = 32 [ default = 0 ];

  ////////////////////////////
  // Options Adam Optimizer //
  ////////////////////////////
  optional double adam_beta1 = 33 [ default = 0.9 ];
  optional double adam_beta2 = 34 [ default = 0.999 ];
  optional double adam_epsilon = 35 [ default = 1e-8 ];
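  //
  // For reference, these correspond to the beta1, beta2 and epsilon constants
  // in the standard Adam update (Kingma & Ba, 2014), roughly:
  //   m = beta1 * m + (1 - beta1) * grad
  //   v = beta2 * v + (1 - beta2) * grad * grad
  //   param -= lr * m / (sqrt(v) + epsilon)
  // (bias correction, if any, is an implementation detail of the optimizer).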

  // Arguments for the learning rate scheduler.
  // Format: num1:rate1,num2:rate2,...,numK:rateK
  // For learning_rate_schedule="manual", num is the number of samples;
  // for learning_rate_schedule="pass_manual",
  // num is the number of passes (starting from 0).
  optional string learning_rate_args = 36 [ default = "" ];
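  //
  // For example (illustrative, with learning_rate_schedule = "pass_manual"):
  //   learning_rate_args = "2:1.0,5:0.5,8:0.1"
  // might scale the base learning rate by 1.0 up to pass 2, by 0.5 up to
  // pass 5 and by 0.1 afterwards (exact boundary handling depends on the
  // scheduler implementation).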

  // For async sgd gradient commit control.
  // When async_lagged_grad_discard_ratio * num_gradient_servers commits have
  // passed, the current async gradient will be discarded silently.
  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];
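  //
  // For example: with 4 gradient servers and the default ratio of 1.5, an
  // async gradient is discarded once roughly 1.5 * 4 = 6 commits have passed
  // since it was computed.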

  // global threshold for gradient clipping
  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
};
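
// A minimal OptimizationConfig in protobuf text format might look like the
// following (illustrative values, not recommendations):
//
//   batch_size: 128
//   algorithm: "async_sgd"
//   learning_rate: 0.01
//   learning_method: "momentum"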

message TrainerConfig {
  optional ModelConfig model_config = 1;
  optional DataConfig data_config = 2;
  required OptimizationConfig opt_config = 3;
  optional DataConfig test_data_config = 4;
  repeated string config_files = 5;

  // the directory to save/load model files for each training pass
  optional string save_dir = 6 [ default = "./output/model" ];

  // Path of the initial model parameters.
  // If it is set, start_pass will be ignored.
  optional string init_model_path = 7;

  // Start training from this pass.
  // Parameters will be loaded from the previous pass.
  optional int32 start_pass = 8 [ default = 0 ];

  // file path to the trainer config file
  optional string config_file = 9;
}
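
// A complete TrainerConfig, again in protobuf text format, might be sketched
// as follows (illustrative values; in practice these messages are usually
// produced programmatically rather than written by hand):
//
//   opt_config {
//     batch_size: 128
//     algorithm: "async_sgd"
//     learning_rate: 0.01
//   }
//   save_dir: "./output/model"
//   start_pass: 0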