|
|
|
@ -303,7 +303,7 @@ def train_32k_8p(epoch_size=3, batch_size=32, num_classes=32768):
|
|
|
|
|
return allreduce_fusion_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_train_32k_8p_fusion1(epoch_size=3, batch_size=32, num_classes=32768): #1048576 #131072 #32768 #8192
|
|
|
|
|
def train_32k_8p_fusion1(epoch_size=3, batch_size=32, num_classes=32768): #1048576 #131072 #32768 #8192
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=400.0)
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
|
|
|
|
@ -475,7 +475,7 @@ def test_train_32k_8p_fusion1(epoch_size=3, batch_size=32, num_classes=32768): #
|
|
|
|
|
cost_model_context.reset_cost_model_context()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_train_32k_8p_fusion2(epoch_size=3, batch_size=32, num_classes=32768): #1048576 #131072 #32768 #8192
|
|
|
|
|
def train_32k_8p_fusion2(epoch_size=3, batch_size=32, num_classes=32768): #1048576 #131072 #32768 #8192
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=2)
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_time=0.1)
|
|
|
|
|
cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.05)
|
|
|
|
|