diff --git a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py index c512989906..0421de2dab 100644 --- a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py @@ -14,10 +14,11 @@ # ============================================================================ """batch_matmul_impl""" -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \ .fusion_type("OPAQUE") \ .async_flag(False) \ @@ -114,7 +115,8 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr ((1, 64, 64), (1, 64, 64), "float32", False, True), ((1, 128, 128), (1, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), - ((2, 128, 128), (2, 128, 128), "float32", False, True)] + ((2, 128, 128), (2, 128, 128), "float32", False, True), + ((32, 128, 128), (32, 128, 128), 'float32', False, True)] if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) @@ -232,7 +234,8 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr ((2, 128, 128), (2, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), ((8, 128, 128), (8, 128, 128), "float32", False, True), - ((16, 128, 128), (16, 128, 128), "float32", False, True) + ((16, 128, 128), (16, 128, 128), "float32", False, True), + ((32, 128, 128), (32, 128, 128), 'float32', False, True) ] if input_shape in input_shape_list: block_num = 32 diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py index f4b8d44063..23fdd1a2a7 100644 --- a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -13,10 +13,11 @@ # limitations under the License. # ============================================================================ """CusFusedAbsMax1""" -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + cus_fused_abs_max1_op_info = TBERegOp("CusFusedAbsMax1") \ .fusion_type("OPAQUE") \ .async_flag(False) \ @@ -36,152 +37,86 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m """CusFusedAbsMax1""" input_x_shape = input_x.get("shape") output_shape = output.get("shape") + dtype = input_x.get("dtype") if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - if len(input_x_shape) > 2: - if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val + support_shape = [((1, 128, 128), "float32"), + ((2, 128, 128), "float32"), + ((4, 128, 128), "float32"), + ((8, 128, 128), "float32"), + ((16, 128, 128), "float32"), + ((5, 128, 128), "float32"), + ((9, 128, 128), "float32"), + ((18, 128, 128), "float32"), + ((36, 128, 128), "float32"), + ((32, 128, 128), "float32"), + ((1, 64, 64), "float32"), + ((32, 64), "float32") + ] + ori_shape = tuple(origin_shape) + input_info = (tuple(input_x_shape), dtype) + if input_info not in support_shape: + raise RuntimeError("input_shape %s is not supported" % str(input_info)) + if input_info == ((1, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((2, 128, 128), "float32"): + if ori_shape == (147, 147): + phase_1 = 16384 + phase_2 = 1216 blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 16 and input_x_shape[1] == 8): - if origin_shape[0] == 147 and ( - input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128): - assert origin_shape[0] == 147 - assert origin_shape[1] == 147 - phase_1 = 16384 - phase_2 = 1216 - blocks = 32 - each_block_element = phase_1 // blocks + 64 - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[512 * block_index], 0, 1, 512 // 8, 0, 0) - line_id = block_index % 19 - tik_instance.data_move(input_x_ub[512], input_x[16384 + 128 * line_id], 0, 1, 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(19, input_x_ub, input_x_ub, input_x_ub[512], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, - 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - else: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, - 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): + each_block_element = phase_1 // blocks + 64 input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) + tik_instance.data_move(input_x_ub, input_x[512 * block_index], 0, 1, 512 // 8, 0, 0) + line_id = block_index % 19 + tik_instance.data_move(input_x_ub[512], input_x[16384 + 128 * line_id], 0, 1, 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(19, input_x_ub, input_x_ub, input_x_ub[512], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) @@ -189,169 +124,20 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 32 and input_x_shape[1] == 16) or ( - input_x_shape[0] == 16 and input_x_shape[1] == 32): - if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ - 0] == 1000: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - blocks = 32 - each_block_element = 7 * 128 * 128 // 32 + 4 * 128 - phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, - 0) - move_idx = block_index % 8 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, - 128 // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - vmask = 1000 - 7 * 128 - 64 - with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], - input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - - with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, - 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, - 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ - 0] == 1001: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - blocks = 32 - each_block_element = 7 * 128 * 128 // 32 + 4 * 128 - phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, - 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, - 0) - move_idx = block_index % 9 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, - 128 // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - vmask = 1001 - 7 * 128 - 64 - with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], - input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, - 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, - 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - else: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, - 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, - 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 16 and input_x_shape[1] == 64) or ( - input_x_shape[0] == 64 and input_x_shape[1] == 16): + elif ori_shape in ((256, 256), None, (-1, -1)): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -368,9 +154,6 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) @@ -379,108 +162,163 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 5 and input_x_shape[1] == 128 and input_x_shape[2] == 128 and origin_shape[0] == 576: + else: + raise RuntimeError("origin shape %s is not supported" % str(ori_shape)) + elif input_info == ((4, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((8, 128, 128), "float32"): + if ori_shape == (1000, 1000): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 69632 blocks = 32 - each_block_element = total_elements // blocks - phase_1 = 2048 - phase_2 = 128 + each_block_element = 7 * 128 * 128 // 32 + 4 * 128 + phase_1 = 7 * 128 * 128 // 32 with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[65536 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, - 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) + move_idx = block_index % 8 + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048 + 64], 1, 1, 1, 1, 8, 8, 8) + vmask = 1000 - 7 * 128 - 64 + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 1, 1, 1, 1, 8, 8, 8) + + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 72 and input_x_shape[1] == 8): + elif ori_shape == (1001, 1001): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val blocks = 32 - each_block_element = total_elements // blocks + each_block_element = 7 * 128 * 128 // 32 + 4 * 128 + phase_1 = 7 * 128 * 128 // 32 with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) + tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) + move_idx = block_index % 9 + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + vmask = 1001 - 7 * 128 - 64 + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 18 and input_x_shape[1] == 128 and input_x_shape[2] == 128: + elif ori_shape in ((1024, 1024), None, (-1, -1)): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -497,390 +335,6 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( - input_x_shape[0] == 144 and input_x_shape[1] == 16): - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time_1 = 255 - repeat_time_2 = each_block_element // 64 - 255 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, - 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 1024], 16, 1, 1, 1, 8, 8, - 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 512], 8, 1, 1, 1, 8, 8, - 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 256], 4, 1, 1, 1, 8, 8, - 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 128], 2, 1, 1, 1, 8, 8, - 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 128 and input_x_shape[1] == 63: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time_1 = 255 - repeat_time_2 = each_block_element // 64 - 255 * 3 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], - repeat_time_2, 1, 1, 8, 8) - loop_size = each_block_element // 16384 - with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, - 8) - tail_element = each_block_element - 16384 * loop_size - repeats = tail_element // 64 - with tik_instance.for_range(0, repeats) as i: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, - 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, input_x_ub[64 + cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[2048 + 64], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[1024 + 64], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[512 + 64], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[256 + 64], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[128 + 64], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[64 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(res[block_index, 0], input_x_ub[64], 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or ( - input_x_shape[0] == 128 and input_x_shape[1] == 32): - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time_1 = 255 - repeat_time_2 = each_block_element // 64 - 255 * 2 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_2, 1, 1, 8, 8) - loop_size = each_block_element // 16384 - with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], - input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, - 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 288 and input_x_shape[1] == 32: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) - zero = tik_instance.Scalar("float32") - zero.set_as(0) - tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) - input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - repeat_time_1 = 255 - repeat_time_2 = 32768 // 64 - 255 * 2 - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_2, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, - 0) - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_2, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, - 0) - tik_instance.vabs(64, input_x_ub, input_x_ub, 128, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(assist_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 64 and input_x_shape[1] == 128: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) - zero = tik_instance.Scalar("float32") - zero.set_as(0) - tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) - input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - repeat_time_1 = 255 - repeat_time_2 = 32768 // 64 - 255 * 2 - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_2, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, - 0) - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, - 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], - repeat_time_2, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(assist_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 64 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 64): - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time_1 = 255 - repeat_time_2 = each_block_element // 64 - 255 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, - 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -891,159 +345,131 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 36 and input_x_shape[1] == 4: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 4 and input_x_shape[1] == 4: + else: + raise RuntimeError("origin shape %s is not supported" % str(ori_shape)) + elif input_info == ((16, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((32, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, 255, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[255 * 64], input_x_ub[255 * 64], 1, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((5, 128, 128), "float32"): + if ori_shape == (576, 576): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val + total_elements = 69632 blocks = 32 each_block_element = total_elements // blocks + phase_1 = 2048 + phase_2 = 128 with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - + tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[65536 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, + 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 49 and input_x_shape[1] == 4: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, 24, 1, 1, 8, 8) - tik_instance.vabs(32, input_x_ub[1536], input_x_ub[1536], 1, 1, 1, 8, 8) - tik_instance.vmax(32, input_x_ub[1504], input_x_ub[1504], input_x_ub[1536], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0, 64) as cc0: - data_temp = tik_instance.Scalar("float32") - data_temp.set_as(input_x_ub[cc0]) - tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, - 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, - 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, - 8, 8, 8) - tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif input_x_shape[0] == 1 and input_x_shape[1] == 64 and input_x_shape[2] == 64: - input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) - res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - total_elements = 1 - for val in input_x_shape: - total_elements *= val - blocks = 32 - each_block_element = total_elements // blocks - with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", - scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", - scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, - each_block_element // 8, 0, 0) - repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) @@ -1061,10 +487,189 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - else: - raise RuntimeError("UnSupportedShape") - elif len(input_x_shape) == 2 and (input_x_shape[0] == 32 and input_x_shape[1] == 64): + raise RuntimeError("origin shape %s is not supported" % str(ori_shape)) + elif input_info == ((9, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((18, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((36, 128, 128), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time_1 = 255 + repeat_time_2 = each_block_element // 64 - 255 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, + 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 1024], 16, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 512], 8, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 256], 4, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 128], 2, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((1, 64, 64), "float32"): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_info == ((32, 64), "float32"): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) input_x_ub = tik_instance.Tensor("float32", (32 * 64,), name="input_x_ub", scope=tik.scope_ubuf) diff --git a/model_zoo/bert_thor/README.md b/model_zoo/bert_thor/README.md new file mode 100644 index 0000000000..189b9d9e4d --- /dev/null +++ b/model_zoo/bert_thor/README.md @@ -0,0 +1,93 @@ +# BERT Example +## Description +This is an example of training bert by second-order optimizer THOR. THOR is a novel approximate seond-order optimization method in MindSpore. + +## Requirements +- Install [MindSpore](https://www.mindspore.cn/install/en). +- Download the zhwiki dataset for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format and move the files to a specified path. +- Download dataset for fine-tuning and evaluation such as CLUENER, TNEWS, SQuAD v1.1, etc. +> Notes: + If you are running a fine-tuning or evaluation task, prepare a checkpoint from pre-train. + +## Running the Example +### Pre-Training +- Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file. + +- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model. + + ``` bash + sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR + ``` +- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model. + + ``` bash + sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH + ``` + +## Usage +### Pre-Training +``` +usage: run_pretrain.py [--distribute DISTRIBUTE] [--epoch_size N] [----device_num N] [--device_id N] + [--enable_save_ckpt ENABLE_SAVE_CKPT] + [--enable_lossscale ENABLE_LOSSSCALE] [--do_shuffle DO_SHUFFLE] + [--enable_data_sink ENABLE_DATA_SINK] [--data_sink_steps N] [--checkpoint_path CHECKPOINT_PATH] + [--save_checkpoint_steps N] [--save_checkpoint_num N] + [--data_dir DATA_DIR] [--schema_dir SCHEMA_DIR] + +options: + --distribute pre_training by serveral devices: "true"(training by more than 1 device) | "false", default is "false" + --epoch_size epoch size: N, default is 1 + --device_num number of used devices: N, default is 1 + --device_id device id: N, default is 0 + --enable_save_ckpt enable save checkpoint: "true" | "false", default is "true" + --enable_lossscale enable lossscale: "true" | "false", default is "true" + --do_shuffle enable shuffle: "true" | "false", default is "true" + --enable_data_sink enable data sink: "true" | "false", default is "true" + --data_sink_steps set data sink steps: N, default is 1 + --checkpoint_path path to save checkpoint files: PATH, default is "" + --save_checkpoint_steps steps for saving checkpoint files: N, default is 1000 + --save_checkpoint_num number for saving checkpoint files: N, default is 1 + --data_dir path to dataset directory: PATH, default is "" + --schema_dir path to schema.json file, PATH, default is "" +``` +## Options and Parameters +It contains of parameters of BERT model and options for training, which is set in file `config.py`, `bert_net_config.py` and `evaluation_config.py` respectively. +### Options: +``` +config.py: + bert_network version of BERT model: base | nezha, default is base + optimizer optimizer used in the network: AdamWerigtDecayDynamicLR | Lamb | Momentum | Thor, default is "Thor" + +``` + +### Parameters: +``` +Parameters for dataset and network (Pre-Training/Evaluation): + batch_size batch size of input dataset: N, default is 8 + seq_length length of input sequence: N, default is 128 + vocab_size size of each embedding vector: N, must be consistant with the dataset you use. Default is 21136 + hidden_size size of bert encoder layers: N, default is 768 + num_hidden_layers number of hidden layers: N, default is 12 + num_attention_heads number of attention heads: N, default is 12 + intermediate_size size of intermediate layer: N, default is 3072 + hidden_act activation function used: ACTIVATION, default is "gelu" + hidden_dropout_prob dropout probability for BertOutput: Q, default is 0.1 + attention_probs_dropout_prob dropout probability for BertAttention: Q, default is 0.1 + max_position_embeddings maximum length of sequences: N, default is 512 + type_vocab_size size of token type vocab: N, default is 16 + initializer_range initialization value of TruncatedNormal: Q, default is 0.02 + use_relative_positions use relative positions or not: True | False, default is False + input_mask_from_dataset use the input mask loaded form dataset or not: True | False, default is True + token_type_ids_from_dataset use the token type ids loaded from dataset or not: True | False, default is True + dtype data type of input: mstype.float16 | mstype.float32, default is mstype.float32 + compute_type compute type in BertTransformer: mstype.float16 | mstype.float32, default is mstype.float16 + +Parameters for optimizer: + Thor: + momentum momentum for the moving average: Q + weight_decay weight decay: Q + loss_scale loss scale: N + frequency the step interval to update second-order information matrix: N, default is 10 + batch_size batch size of input dataset: N, default is 8 +``` + diff --git a/model_zoo/bert_thor/pretrain_eval.py b/model_zoo/bert_thor/pretrain_eval.py new file mode 100644 index 0000000000..04ad30bb3d --- /dev/null +++ b/model_zoo/bert_thor/pretrain_eval.py @@ -0,0 +1,164 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Bert evaluation script. +""" + +import os + +from src import BertModel, GetMaskedLMOutput +from src.evaluation_config import cfg, bert_net_cfg + +import mindspore.common.dtype as mstype +import mindspore.dataset as de +import mindspore.dataset.transforms.c_transforms as C +import mindspore.nn as nn +from mindspore import context +from mindspore.common.parameter import Parameter +from mindspore.common.tensor import Tensor +from mindspore.nn.metrics import Metric +from mindspore.ops import operations as P +from mindspore.train.model import Model +from mindspore.train.serialization import load_checkpoint, load_param_into_net + + +class myMetric(Metric): + ''' + Self-defined Metric as a callback. + ''' + + def __init__(self): + super(myMetric, self).__init__() + self.clear() + + def clear(self): + self.total_num = 0 + self.acc_num = 0 + + def update(self, *inputs): + total_num = self._convert_data(inputs[0]) + acc_num = self._convert_data(inputs[1]) + self.total_num = total_num + self.acc_num = acc_num + + def eval(self): + return self.acc_num / self.total_num + + +class GetLogProbs(nn.Cell): + ''' + Get MaskedLM prediction scores + ''' + + def __init__(self, config): + super(GetLogProbs, self).__init__() + self.bert = BertModel(config, False) + self.cls1 = GetMaskedLMOutput(config) + + def construct(self, input_ids, input_mask, token_type_id, masked_pos): + sequence_output, _, embedding_table = self.bert(input_ids, token_type_id, input_mask) + prediction_scores = self.cls1(sequence_output, embedding_table, masked_pos) + return prediction_scores + + +class BertPretrainEva(nn.Cell): + ''' + Evaluate MaskedLM prediction scores + ''' + + def __init__(self, config): + super(BertPretrainEva, self).__init__() + self.bert = GetLogProbs(config) + self.argmax = P.Argmax(axis=-1, output_type=mstype.int32) + self.equal = P.Equal() + self.mean = P.ReduceMean() + self.sum = P.ReduceSum() + self.total = Parameter(Tensor([0], mstype.float32), name='total') + self.acc = Parameter(Tensor([0], mstype.float32), name='acc') + self.reshape = P.Reshape() + self.shape = P.Shape() + self.cast = P.Cast() + + def construct(self, input_ids, input_mask, token_type_id, masked_pos, masked_ids, masked_weights, nsp_label): + """construct of BertPretrainEva""" + bs, _ = self.shape(input_ids) + probs = self.bert(input_ids, input_mask, token_type_id, masked_pos) + index = self.argmax(probs) + index = self.reshape(index, (bs, -1)) + eval_acc = self.equal(index, masked_ids) + eval_acc1 = self.cast(eval_acc, mstype.float32) + real_acc = eval_acc1 * masked_weights + acc = self.sum(real_acc) + total = self.sum(masked_weights) + self.total += total + self.acc += acc + return acc, self.total, self.acc + + +def get_enwiki_512_dataset(batch_size=1, repeat_count=1, distribute_file=''): + ''' + Get enwiki seq_length=512 dataset + ''' + ds = de.TFRecordDataset([cfg.data_file], cfg.schema_file, columns_list=["input_ids", "input_mask", "segment_ids", + "masked_lm_positions", "masked_lm_ids", + "masked_lm_weights", + "next_sentence_labels"]) + type_cast_op = C.TypeCast(mstype.int32) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op) + ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op) + ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op) + ds = ds.repeat(repeat_count) + + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds + + +def bert_predict(): + ''' + Predict function + ''' + devid = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid) + dataset = get_enwiki_512_dataset(bert_net_cfg.batch_size, 1) + net_for_pretraining = BertPretrainEva(bert_net_cfg) + net_for_pretraining.set_train(False) + param_dict = load_checkpoint(cfg.finetune_ckpt) + load_param_into_net(net_for_pretraining, param_dict) + model = Model(net_for_pretraining) + return model, dataset, net_for_pretraining + + +def MLM_eval(): + ''' + Evaluate function + ''' + _, dataset, net_for_pretraining = bert_predict() + net = Model(net_for_pretraining, eval_network=net_for_pretraining, eval_indexes=[0, 1, 2], + metrics={'name': myMetric()}) + res = net.eval(dataset, dataset_sink_mode=False) + print("==============================================================") + for _, v in res.items(): + print("Accuracy is: ") + print(v) + print("==============================================================") + + +if __name__ == "__main__": + MLM_eval() diff --git a/model_zoo/bert_thor/run_pretrain.py b/model_zoo/bert_thor/run_pretrain.py new file mode 100644 index 0000000000..08161c7a13 --- /dev/null +++ b/model_zoo/bert_thor/run_pretrain.py @@ -0,0 +1,202 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +#################pre_train bert example on zh-wiki######################## +python run_pretrain.py +""" + +import argparse +import os + +import numpy +from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell +from src.bert_net_config import bert_net_cfg +from src.config import cfg +from src.dataset import create_bert_dataset +from src.lr_generator import get_bert_lr, get_bert_damping +from src.model_thor import Model +# from src.thor_for_bert import THOR +from src.thor_for_bert_arg import THOR +from src.utils import LossCallBack, BertLearningRate + +import mindspore.common.dtype as mstype +import mindspore.communication.management as D +from mindspore import context +from mindspore import log as logger +from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay +from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor +from mindspore.train.parallel_utils import ParallelMode +from mindspore.train.serialization import load_checkpoint, load_param_into_net + +_current_dir = os.path.dirname(os.path.realpath(__file__)) + + +def run_pretrain(): + """pre-train bert_clue""" + parser = argparse.ArgumentParser(description='bert pre_training') + parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], + help='device where the code will be implemented. (Default: Ascend)') + parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") + parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") + parser.add_argument("--device_id", type=int, default=4, help="Device id, default is 0.") + parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") + parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.") + parser.add_argument("--enable_lossscale", type=str, default="false", help="Use lossscale or not, default is not.") + parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is true.") + parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.") + parser.add_argument("--data_sink_steps", type=int, default="100", help="Sink steps for each epoch, default is 1.") + parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path") + parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") + parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, " + "default is 1000.") + parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, " + "meaning run all steps according to epoch number.") + parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.") + parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") + parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") + + args_opt = parser.parse_args() + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id, + save_graphs=True) + context.set_context(reserve_class_name_in_scope=False) + context.set_context(variable_memory_max_size="30GB") + ckpt_save_dir = args_opt.save_checkpoint_path + if args_opt.distribute == "true": + if args_opt.device_target == 'Ascend': + D.init('hccl') + device_num = args_opt.device_num + rank = args_opt.device_id % device_num + else: + D.init('nccl') + device_num = D.get_group_size() + rank = D.get_rank() + ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, + device_num=device_num) + from mindspore.parallel._auto_parallel_context import auto_parallel_context + if bert_net_cfg.num_hidden_layers == 12: + if bert_net_cfg.use_relative_positions: + auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217], + "hccl_world_groupsum1") + auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217], + "hccl_world_groupsum3") + else: + auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205], + "hccl_world_groupsum1") + auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205], + "hccl_world_groupsum3") + elif bert_net_cfg.num_hidden_layers == 24: + if bert_net_cfg.use_relative_positions: + auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421], + "hccl_world_groupsum1") + auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421], + "hccl_world_groupsum3") + else: + auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397], + "hccl_world_groupsum1") + auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397], + "hccl_world_groupsum3") + else: + rank = 0 + device_num = 1 + + if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: + logger.warning('Gpu only support fp32 temporarily, run with fp32.') + bert_net_cfg.compute_type = mstype.float32 + + ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir) + net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) + + new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps + if args_opt.train_steps > 0: + new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps) + else: + args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() + logger.info("train steps: {}".format(args_opt.train_steps)) + + if cfg.optimizer == 'Lamb': + lr_schedule = BertLearningRate(learning_rate=cfg.Lamb.learning_rate, + end_learning_rate=cfg.Lamb.end_learning_rate, + warmup_steps=cfg.Lamb.warmup_steps, + decay_steps=args_opt.train_steps, + power=cfg.Lamb.power) + params = net_with_loss.trainable_params() + decay_params = list(filter(cfg.Lamb.decay_filter, params)) + other_params = list(filter(lambda x: x not in decay_params, params)) + group_params = [{'params': decay_params, 'weight_decay': cfg.Lamb.weight_decay}, + {'params': other_params}, + {'order_params': params}] + optimizer = Lamb(group_params, learning_rate=lr_schedule, eps=cfg.Lamb.eps) + elif cfg.optimizer == 'Momentum': + optimizer = Momentum(net_with_loss.trainable_params(), learning_rate=cfg.Momentum.learning_rate, + momentum=cfg.Momentum.momentum) + elif cfg.optimizer == 'AdamWeightDecay': + lr_schedule = BertLearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate, + end_learning_rate=cfg.AdamWeightDecay.end_learning_rate, + warmup_steps=cfg.AdamWeightDecay.warmup_steps, + decay_steps=args_opt.train_steps, + power=cfg.AdamWeightDecay.power) + params = net_with_loss.trainable_params() + decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params)) + other_params = list(filter(lambda x: x not in decay_params, params)) + group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay}, + {'params': other_params, 'weight_decay': 0.0}, + {'order_params': params}] + + optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) + elif cfg.optimizer == "Thor": + lr = get_bert_lr() + damping = get_bert_damping() + optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, + filter(lambda x: 'matrix_A' in x.name, net_with_loss.get_parameters()), + filter(lambda x: 'matrix_G' in x.name, net_with_loss.get_parameters()), + filter(lambda x: 'A_inv_max' in x.name, net_with_loss.get_parameters()), + filter(lambda x: 'G_inv_max' in x.name, net_with_loss.get_parameters()), + cfg.Thor.weight_decay, cfg.Thor.loss_scale, bert_net_cfg.num_hidden_layers, + bert_net_cfg.batch_size, damping) + else: + raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]". + format(cfg.optimizer)) + callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()] + if args_opt.enable_save_ckpt == "true": + config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, + keep_checkpoint_max=args_opt.save_checkpoint_num) + ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) + callback.append(ckpoint_cb) + + if args_opt.load_checkpoint_path: + param_dict = load_checkpoint(args_opt.load_checkpoint_path) + load_param_into_net(net_with_loss, param_dict) + + if args_opt.enable_lossscale == "true": + update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, + scale_factor=cfg.scale_factor, + scale_window=cfg.scale_window) + net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, + scale_update_cell=update_cell) + else: + net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) + + model = Model(net_with_grads, frequency=cfg.Thor.frequency) + model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), + sink_size=args_opt.data_sink_steps) + + +if __name__ == '__main__': + numpy.random.seed(0) + run_pretrain() diff --git a/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh b/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh new file mode 100644 index 0000000000..217d757eb9 --- /dev/null +++ b/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH" +echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json" +echo "It is better to use absolute path." +echo "==============================================================================================================" + +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +ulimit -u unlimited +export MINDSPORE_HCCL_CONFIG_PATH=$5 +export RANK_TABLE_FILE=$5 +export RANK_SIZE=$1 +export HCCL_CONNECT_TIMEOUT=300 + +for((i=0;i env.log + python ../run_pretrain.py \ + --distribute="true" \ + --epoch_size=$EPOCH_SIZE \ + --device_id=$DEVICE_ID \ + --device_num=$RANK_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1000 \ + --load_checkpoint_path="" \ + --save_checkpoint_steps=5000 \ + --save_checkpoint_num=30 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + cd ../ +done \ No newline at end of file diff --git a/model_zoo/bert_thor/scripts/run_standalone_pretrain.sh b/model_zoo/bert_thor/scripts/run_standalone_pretrain.sh new file mode 100644 index 0000000000..f59eb69601 --- /dev/null +++ b/model_zoo/bert_thor/scripts/run_standalone_pretrain.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR" +echo "for example: bash run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json" +echo "==============================================================================================================" + +DEVICE_ID=$1 +EPOCH_SIZE=$2 +DATA_DIR=$3 +SCHEMA_DIR=$4 + +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 +python ${PROJECT_DIR}/../run_pretrain.py \ + --distribute="false" \ + --epoch_size=$EPOCH_SIZE \ + --device_id=$DEVICE_ID \ + --enable_save_ckpt="true" \ + --enable_lossscale="true" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=1 \ + --load_checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & diff --git a/model_zoo/bert_thor/src/__init__.py b/model_zoo/bert_thor/src/__init__.py new file mode 100644 index 0000000000..4f4584a4b4 --- /dev/null +++ b/model_zoo/bert_thor/src/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Bert Init.""" +from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \ + BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \ + BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell +from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \ + BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \ + EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \ + SaturateCast, CreateAttentionMaskFromInputMask + +__all__ = [ + "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss", + "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell", + "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput", + "BertSelfAttention", "BertTransformer", "EmbeddingLookup", + "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator", + "RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask" +] diff --git a/model_zoo/bert_thor/src/bert_for_pre_training.py b/model_zoo/bert_thor/src/bert_for_pre_training.py new file mode 100644 index 0000000000..fb2db14743 --- /dev/null +++ b/model_zoo/bert_thor/src/bert_for_pre_training.py @@ -0,0 +1,458 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Bert for pretraining.""" +import numpy as np + +import mindspore.nn as nn +from mindspore import context +from mindspore.common import dtype as mstype +from mindspore.common.initializer import initializer, TruncatedNormal +from mindspore.common.parameter import Parameter +from mindspore.common.tensor import Tensor +from mindspore.communication.management import get_group_size +from mindspore.nn.wrap.grad_reducer import DistributedGradReducer +from mindspore.ops import _selected_ops +from mindspore.ops import composite as C +from mindspore.ops import functional as F +from mindspore.ops import operations as P +from mindspore.train.parallel_utils import ParallelMode +from .bert_model import BertModel +from .config import cfg +from .lr_generator import get_bert_damping +from .thor_layer import Dense_Thor + +damping = get_bert_damping() +loss_scale = cfg.Thor.loss_scale +GRADIENT_CLIP_TYPE = 1 +GRADIENT_CLIP_VALUE = 1.0 + +clip_grad = C.MultitypeFuncGraph("clip_grad") + + +# pylint: disable=consider-using-in +@clip_grad.register("Number", "Number", "Tensor") +def _clip_grad(clip_type, clip_value, grad): + """ + Clip gradients. + + Inputs: + clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'. + clip_value (float): Specifies how much to clip. + grad (tuple[Tensor]): Gradients. + + Outputs: + tuple[Tensor], clipped gradients. + """ + if clip_type != 0 and clip_type != 1: + return grad + dt = F.dtype(grad) + if clip_type == 0: + new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), + F.cast(F.tuple_to_array((clip_value,)), dt)) + else: + new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) + return new_grad + + +class GetMaskedLMOutput(nn.Cell): + """ + Get masked lm output. + + Args: + config (BertConfig): The config of BertModel. + + Returns: + Tensor, masked lm output. + """ + + def __init__(self, config): + super(GetMaskedLMOutput, self).__init__() + self.width = config.hidden_size + self.reshape = P.Reshape() + self.gather = P.GatherV2() + + weight_init = TruncatedNormal(config.initializer_range) + self.dense = Dense_Thor(in_channels=self.width, + out_channels=config.hidden_size, + weight_init=weight_init, + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=config.hidden_act, + batch_size=config.batch_size).to_float(config.compute_type) + self.layernorm = nn.LayerNorm((config.hidden_size,)).to_float(config.compute_type) + self.output_bias = Parameter( + initializer( + 'zero', + config.vocab_size), + name='output_bias') + self.matmul = P.MatMul(transpose_b=True) + self.log_softmax = nn.LogSoftmax(axis=-1) + self.shape_flat_offsets = (-1, 1) + self.rng = Tensor(np.array(range(0, config.batch_size)).astype(np.int32)) + self.last_idx = (-1,) + self.shape_flat_sequence_tensor = (config.batch_size * config.seq_length, self.width) + self.seq_length_tensor = Tensor(np.array((config.seq_length,)).astype(np.int32)) + self.cast = P.Cast() + self.compute_type = config.compute_type + self.dtype = config.dtype + + def construct(self, + input_tensor, + output_weights, + positions): + """construct of GetMaskedLMOutput""" + flat_offsets = self.reshape( + self.rng * self.seq_length_tensor, self.shape_flat_offsets) + flat_position = self.reshape(positions + flat_offsets, self.last_idx) + flat_sequence_tensor = self.reshape(input_tensor, self.shape_flat_sequence_tensor) + input_tensor = self.gather(flat_sequence_tensor, flat_position, 0) + input_tensor = self.cast(input_tensor, self.compute_type) + output_weights = self.cast(output_weights, self.compute_type) + input_tensor = self.dense(input_tensor) + input_tensor = self.layernorm(input_tensor) + logits = self.matmul(input_tensor, output_weights) + logits = self.cast(logits, self.dtype) + logits = logits + self.output_bias + log_probs = self.log_softmax(logits) + return log_probs + + +class GetNextSentenceOutput(nn.Cell): + """ + Get next sentence output. + + Args: + config (BertConfig): The config of Bert. + + Returns: + Tensor, next sentence output. + """ + + def __init__(self, config): + super(GetNextSentenceOutput, self).__init__() + self.log_softmax = _selected_ops.LogSoftmax() + weight_init = TruncatedNormal(config.initializer_range) + self.dense = nn.Dense(config.hidden_size, 2, + weight_init=weight_init, has_bias=True).to_float(config.compute_type) + self.dtype = config.dtype + self.cast = P.Cast() + + def construct(self, input_tensor): + logits = self.dense(input_tensor) + logits = self.cast(logits, self.dtype) + log_prob = self.log_softmax(logits) + return log_prob + + +class BertPreTraining(nn.Cell): + """ + Bert pretraining network. + + Args: + config (BertConfig): The config of BertModel. + is_training (bool): Specifies whether to use the training mode. + use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. + + Returns: + Tensor, prediction_scores, seq_relationship_score. + """ + + def __init__(self, config, is_training, use_one_hot_embeddings): + super(BertPreTraining, self).__init__() + self.bert = BertModel(config, is_training, use_one_hot_embeddings) + self.cls1 = GetMaskedLMOutput(config) + self.cls2 = GetNextSentenceOutput(config) + + def construct(self, input_ids, input_mask, token_type_id, + masked_lm_positions): + sequence_output, pooled_output, embedding_table = \ + self.bert(input_ids, token_type_id, input_mask) + prediction_scores = self.cls1(sequence_output, + embedding_table, + masked_lm_positions) + seq_relationship_score = self.cls2(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPretrainingLoss(nn.Cell): + """ + Provide bert pre-training loss. + + Args: + config (BertConfig): The config of BertModel. + + Returns: + Tensor, total loss. + """ + + def __init__(self, config): + super(BertPretrainingLoss, self).__init__() + self.vocab_size = config.vocab_size + self.onehot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.reduce_sum = P.ReduceSum() + self.reduce_mean = P.ReduceMean() + self.reshape = P.Reshape() + self.last_idx = (-1,) + self.neg = P.Neg() + self.cast = P.Cast() + + def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids, + masked_lm_weights, next_sentence_labels): + """Defines the computation performed.""" + label_ids = self.reshape(masked_lm_ids, self.last_idx) + label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32) + one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value) + + per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx)) + numerator = self.reduce_sum(label_weights * per_example_loss, ()) + denominator = self.reduce_sum(label_weights, ()) + self.cast(F.tuple_to_array((1e-5,)), mstype.float32) + masked_lm_loss = numerator / denominator + + # next_sentence_loss + labels = self.reshape(next_sentence_labels, self.last_idx) + one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value) + per_example_loss = self.neg(self.reduce_sum( + one_hot_labels * seq_relationship_score, self.last_idx)) + next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx) + + # total_loss + total_loss = masked_lm_loss + next_sentence_loss + + return total_loss + + +class BertNetworkWithLoss(nn.Cell): + """ + Provide bert pre-training loss through network. + + Args: + config (BertConfig): The config of BertModel. + is_training (bool): Specifies whether to use the training mode. + use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False. + + Returns: + Tensor, the loss of the network. + """ + + def __init__(self, config, is_training, use_one_hot_embeddings=False): + super(BertNetworkWithLoss, self).__init__() + self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings) + self.loss = BertPretrainingLoss(config) + self.cast = P.Cast() + + def construct(self, + input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights): + """construct of BertNetworkWithLoss""" + prediction_scores, seq_relationship_score = \ + self.bert(input_ids, input_mask, token_type_id, masked_lm_positions) + total_loss = self.loss(prediction_scores, seq_relationship_score, + masked_lm_ids, masked_lm_weights, next_sentence_labels) + return self.cast(total_loss, mstype.float32) + + +class BertTrainOneStepCell(nn.Cell): + """ + Encapsulation class of bert network training. + + Append an optimizer to the training network after that the construct + function can be called to create the backward graph. + + Args: + network (Cell): The training network. Note that loss function should have been added. + optimizer (Optimizer): Optimizer for updating the weights. + sens (Number): The adjust parameter. Default: 1.0. + """ + + def __init__(self, network, optimizer, sens=1.0): + super(BertTrainOneStepCell, self).__init__(auto_prefix=False) + self.network = network + self.weights = optimizer.parameters + self.optimizer = optimizer + self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) + self.sens = sens + self.reducer_flag = False + self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: + self.reducer_flag = True + self.grad_reducer = None + if self.reducer_flag: + mean = context.get_auto_parallel_context("mirror_mean") + degree = get_group_size() + self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree) + + self.cast = P.Cast() + self.hyper_map = C.HyperMap() + + def set_sens(self, value): + self.sens = value + + def construct(self, + input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights): + """Defines the computation performed.""" + weights = self.weights + + loss = self.network(input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights) + grads = self.grad(self.network, weights)(input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights, + self.cast(F.tuple_to_array((self.sens,)), + mstype.float32)) + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + if self.reducer_flag: + # apply grad reducer on grads + grads = self.grad_reducer(grads) + succ = self.optimizer(grads) + return F.depend(loss, succ) + + +grad_scale = C.MultitypeFuncGraph("grad_scale") +reciprocal = P.Reciprocal() + + +@grad_scale.register("Tensor", "Tensor") +def tensor_grad_scale(scale, grad): + return grad * reciprocal(scale) + + +class BertTrainOneStepWithLossScaleCell(nn.Cell): + """ + Encapsulation class of bert network training. + + Append an optimizer to the training network after that the construct + function can be called to create the backward graph. + + Args: + network (Cell): The training network. Note that loss function should have been added. + optimizer (Optimizer): Optimizer for updating the weights. + scale_update_cell (Cell): Cell to do the loss scale. Default: None. + """ + + def __init__(self, network, optimizer, scale_update_cell=None): + super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False) + self.network = network + self.weights = optimizer.parameters + self.optimizer = optimizer + self.grad = C.GradOperation('grad', + get_by_list=True, + sens_param=True) + self.reducer_flag = False + self.allreduce = P.AllReduce() + self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: + self.reducer_flag = True + self.grad_reducer = F.identity + self.degree = 1 + if self.reducer_flag: + self.degree = get_group_size() + self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree) + self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) + self.cast = P.Cast() + self.alloc_status = P.NPUAllocFloatStatus() + self.get_status = P.NPUGetFloatStatus() + self.clear_before_grad = P.NPUClearFloatStatus() + self.reduce_sum = P.ReduceSum(keep_dims=False) + self.depend_parameter_use = P.ControlDepend(depend_mode=1) + self.base = Tensor(1, mstype.float32) + self.less_equal = P.LessEqual() + self.hyper_map = C.HyperMap() + self.loss_scale = None + self.loss_scaling_manager = scale_update_cell + if scale_update_cell: + self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), + name="loss_scale") + + @C.add_flags(has_effect=True) + def construct(self, + input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights, + sens=None): + """Defines the computation performed.""" + weights = self.weights + loss = self.network(input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights) + if sens is None: + scaling_sens = self.loss_scale + else: + scaling_sens = sens + # alloc status and clear should be right before gradoperation + init = self.alloc_status() + self.clear_before_grad(init) + grads = self.grad(self.network, weights)(input_ids, + input_mask, + token_type_id, + next_sentence_labels, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights, + self.cast(scaling_sens, + mstype.float32)) + # apply grad reducer on grads + grads = self.grad_reducer(grads) + grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads) + grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) + self.get_status(init) + flag_sum = self.reduce_sum(init, (0,)) + if self.is_distributed: + # sum overflow flag over devices + flag_reduce = self.allreduce(flag_sum) + cond = self.less_equal(self.base, flag_reduce) + else: + cond = self.less_equal(self.base, flag_sum) + overflow = cond + if sens is None: + overflow = self.loss_scaling_manager(self.loss_scale, cond) + if overflow: + succ = False + else: + succ = self.optimizer(grads) + ret = (loss, cond, scaling_sens) + return F.depend(ret, succ) diff --git a/model_zoo/bert_thor/src/bert_model.py b/model_zoo/bert_thor/src/bert_model.py new file mode 100644 index 0000000000..93b5a9169a --- /dev/null +++ b/model_zoo/bert_thor/src/bert_model.py @@ -0,0 +1,1027 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Bert model.""" + +import copy +import math + +import numpy as np + +import mindspore.common.dtype as mstype +import mindspore.nn as nn +import mindspore.ops.functional as F +from mindspore.common.initializer import TruncatedNormal, initializer +from mindspore.common.parameter import Parameter +from mindspore.common.tensor import Tensor +from mindspore.ops import composite as C +from mindspore.ops import operations as P +from .config import cfg +from .fused_layer_norm import FusedLayerNorm +from .lr_generator import get_bert_damping +from .thor_layer import Dense_Thor, Embedding_Thor + +damping = get_bert_damping() +loss_scale = cfg.Thor.loss_scale +batch_size = cfg.Thor.batch_size + + +class BertConfig: + """ + Configuration for `BertModel`. + + Args: + batch_size (int): Batch size of input dataset. + seq_length (int): Length of input sequence. Default: 128. + vocab_size (int): The shape of each embedding vector. Default: 32000. + hidden_size (int): Size of the bert encoder layers. Default: 768. + num_hidden_layers (int): Number of hidden layers in the BertTransformer encoder + cell. Default: 12. + num_attention_heads (int): Number of attention heads in the BertTransformer + encoder cell. Default: 12. + intermediate_size (int): Size of intermediate layer in the BertTransformer + encoder cell. Default: 3072. + hidden_act (str): Activation function used in the BertTransformer encoder + cell. Default: "gelu". + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. + type_vocab_size (int): Size of token type vocab. Default: 16. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + input_mask_from_dataset (bool): Specifies whether to use the input mask that loaded from + dataset. Default: True. + token_type_ids_from_dataset (bool): Specifies whether to use the token type ids that loaded + from dataset. Default: True. + dtype (:class:`mindspore.dtype`): Data type of the input. Default: mstype.float32. + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + """ + + def __init__(self, + batch_size, + seq_length=128, + vocab_size=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + use_relative_positions=False, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float32, + enable_fused_layernorm=False): + self.batch_size = batch_size + self.seq_length = seq_length + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.input_mask_from_dataset = input_mask_from_dataset + self.token_type_ids_from_dataset = token_type_ids_from_dataset + self.use_relative_positions = use_relative_positions + self.dtype = dtype + self.compute_type = compute_type + self.enable_fused_layernorm = enable_fused_layernorm + + +class EmbeddingLookup(nn.Cell): + """ + A embeddings lookup table with a fixed dictionary and size. + + Args: + vocab_size (int): Size of the dictionary of embeddings. + embedding_size (int): The size of each embedding vector. + embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + """ + + def __init__(self, + vocab_size, + embedding_size, + embedding_shape, + use_one_hot_embeddings=False, + initializer_range=0.02): + super(EmbeddingLookup, self).__init__() + self.vocab_size = vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embedding_table = Parameter(initializer + (TruncatedNormal(initializer_range), + [vocab_size, embedding_size]), + name='embedding_table') + self.expand = P.ExpandDims() + self.shape_flat = (-1,) + self.gather = P.GatherV2() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + + def construct(self, input_ids): + """construct of EmbeddingLookup""" + extended_ids = self.expand(input_ids, -1) + flat_ids = self.reshape(extended_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul( + one_hot_ids, self.embedding_table) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + output = self.reshape(output_for_reshape, self.shape) + return output, self.embedding_table + + +class EmbeddingPostprocessor(nn.Cell): + """ + Postprocessors apply positional and token type embeddings to word embeddings. + + Args: + embedding_size (int): The size of each embedding vector. + embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_token_type (bool): Specifies whether to use token type embeddings. Default: False. + token_type_vocab_size (int): Size of token type vocab. Default: 16. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + max_position_embeddings (int): Maximum length of sequences used in this + model. Default: 512. + dropout_prob (float): The dropout probability. Default: 0.1. + """ + + def __init__(self, + embedding_size, + embedding_shape, + use_relative_positions=False, + use_token_type=False, + token_type_vocab_size=16, + use_one_hot_embeddings=False, + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + super(EmbeddingPostprocessor, self).__init__() + self.use_token_type = use_token_type + self.token_type_vocab_size = token_type_vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.max_position_embeddings = max_position_embeddings + self.token_type_embedding = Embedding_Thor( + vocab_size=token_type_vocab_size, + embedding_size=embedding_size, + embedding_shape=embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + name='embedding_table', + is_expand=False, + batch_size=batch_size, + damping=damping, + loss_scale=loss_scale, + frequency=1) + self.shape_flat = (-1,) + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.1, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.shape = tuple(embedding_shape) + self.dropout = nn.Dropout(1 - dropout_prob) + self.gather = P.GatherV2() + self.use_relative_positions = use_relative_positions + self.slice = P.StridedSlice() + _, seq, width = self.shape + position_embedding_shape = [1, seq, width] + self.full_position_embedding = Embedding_Thor( + vocab_size=max_position_embeddings, + embedding_size=embedding_size, + embedding_shape=position_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + name='full_position_embeddings', + is_expand=False, + batch_size=batch_size, + damping=damping, + loss_scale=loss_scale, + frequency=1) + self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32)) + self.layernorm = nn.LayerNorm((embedding_size,)) + + def construct(self, token_type_ids, word_embeddings): + """construct of EmbeddingPostprocessor""" + output = word_embeddings + if self.use_token_type: + token_type_embeddings, _ = self.token_type_embedding(token_type_ids) + output += token_type_embeddings + if not self.use_relative_positions: + position_embeddings, _ = self.full_position_embedding(self.position_ids) + output += position_embeddings + output = self.layernorm(output) + output = self.dropout(output) + return output + + +class BertOutput(nn.Cell): + """ + Apply a linear computation to hidden status and a residual computation to input. + + Args: + in_channels (int): Input channels. + out_channels (int): Output channels. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + dropout_prob (float): The dropout probability. Default: 0.1. + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + """ + + def __init__(self, + in_channels, + out_channels, + initializer_range=0.02, + dropout_prob=0.1, + compute_type=mstype.float32, + enable_fused_layernorm=False): + super(BertOutput, self).__init__() + self.dense = Dense_Thor(in_channels=in_channels, + out_channels=out_channels, + weight_init=TruncatedNormal(initializer_range), + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=None, + batch_size=batch_size).to_float(compute_type) + self.dropout = nn.Dropout(1 - dropout_prob) + self.dropout_prob = dropout_prob + self.add = P.TensorAdd() + if compute_type == mstype.float16: + self.layernorm = FusedLayerNorm((out_channels,), + use_batch_norm=enable_fused_layernorm).to_float(compute_type) + else: + self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type) + self.cast = P.Cast() + + def construct(self, hidden_status, input_tensor): + """construct of BertOutput""" + output = self.dense(hidden_status) + output = self.dropout(output) + output = self.add(input_tensor, output) + output = self.layernorm(output) + return output + + +class RelaPosMatrixGenerator(nn.Cell): + """ + Generates matrix of relative positions between inputs. + + Args: + length (int): Length of one dim for the matrix to be generated. + max_relative_position (int): Max value of relative position. + """ + + def __init__(self, length, max_relative_position): + super(RelaPosMatrixGenerator, self).__init__() + self._length = length + self._max_relative_position = max_relative_position + self._min_relative_position = -max_relative_position + self.range_length = -length + 1 + + self.tile = P.Tile() + self.range_mat = P.Reshape() + self.sub = P.Sub() + self.expanddims = P.ExpandDims() + self.cast = P.Cast() + + def construct(self): + """construct of RelaPosMatrixGenerator""" + range_vec_row_out = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32) + range_vec_col_out = self.range_mat(range_vec_row_out, (self._length, -1)) + tile_row_out = self.tile(range_vec_row_out, (self._length,)) + tile_col_out = self.tile(range_vec_col_out, (1, self._length)) + range_mat_out = self.range_mat(tile_row_out, (self._length, self._length)) + transpose_out = self.range_mat(tile_col_out, (self._length, self._length)) + distance_mat = self.sub(range_mat_out, transpose_out) + + distance_mat_clipped = C.clip_by_value(distance_mat, + self._min_relative_position, + self._max_relative_position) + + # Shift values to be >=0. Each integer still uniquely identifies a + # relative position difference. + final_mat = distance_mat_clipped + self._max_relative_position + return final_mat + + +class RelaPosEmbeddingsGenerator(nn.Cell): + """ + Generates tensor of size [length, length, depth]. + + Args: + length (int): Length of one dim for the matrix to be generated. + depth (int): Size of each attention head. + max_relative_position (int): Maxmum value of relative position. + initializer_range (float): Initialization value of TruncatedNormal. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + """ + + def __init__(self, + length, + depth, + max_relative_position, + initializer_range, + use_one_hot_embeddings=False): + super(RelaPosEmbeddingsGenerator, self).__init__() + self.depth = depth + self.vocab_size = max_relative_position * 2 + 1 + self.use_one_hot_embeddings = use_one_hot_embeddings + + self.embeddings_table = Parameter( + initializer(TruncatedNormal(initializer_range), + [self.vocab_size, self.depth]), + name='embeddings_for_position') + + self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, + max_relative_position=max_relative_position) + self.reshape = P.Reshape() + self.one_hot = nn.OneHot(depth=self.vocab_size) + self.shape = P.Shape() + self.gather = P.GatherV2() # index_select + self.matmul = P.BatchMatMul() + + def construct(self): + """construct of RelaPosEmbeddingsGenerator""" + relative_positions_matrix_out = self.relative_positions_matrix() + + # Generate embedding for each relative position of dimension depth. + if self.use_one_hot_embeddings: + flat_relative_positions_matrix = self.reshape(relative_positions_matrix_out, (-1,)) + one_hot_relative_positions_matrix = self.one_hot( + flat_relative_positions_matrix) + embeddings = self.matmul(one_hot_relative_positions_matrix, self.embeddings_table) + my_shape = self.shape(relative_positions_matrix_out) + (self.depth,) + embeddings = self.reshape(embeddings, my_shape) + else: + embeddings = self.gather(self.embeddings_table, + relative_positions_matrix_out, 0) + return embeddings + + +class SaturateCast(nn.Cell): + """ + Performs a safe saturating cast. This operation applies proper clamping before casting to prevent + the danger that the value will overflow or underflow. + + Args: + src_type (:class:`mindspore.dtype`): The type of the elements of the input tensor. Default: mstype.float32. + dst_type (:class:`mindspore.dtype`): The type of the elements of the output tensor. Default: mstype.float32. + """ + + def __init__(self, src_type=mstype.float32, dst_type=mstype.float32): + super(SaturateCast, self).__init__() + np_type = mstype.dtype_to_nptype(dst_type) + + self.tensor_min_type = float(np.finfo(np_type).min) + self.tensor_max_type = float(np.finfo(np_type).max) + + self.min_op = P.Minimum() + self.max_op = P.Maximum() + self.cast = P.Cast() + self.dst_type = dst_type + + def construct(self, x): + """construct of SaturateCast""" + out = self.max_op(x, self.tensor_min_type) + out = self.min_op(out, self.tensor_max_type) + return self.cast(out, self.dst_type) + + +class BertAttention(nn.Cell): + """ + Apply multi-headed attention from "from_tensor" to "to_tensor". + + Args: + batch_size (int): Batch size of input datasets. + from_tensor_width (int): Size of last dim of from_tensor. + to_tensor_width (int): Size of last dim of to_tensor. + from_seq_length (int): Length of from_tensor sequence. + to_seq_length (int): Length of to_tensor sequence. + num_attention_heads (int): Number of attention heads. Default: 1. + size_per_head (int): Size of each attention head. Default: 512. + query_act (str): Activation function for the query transform. Default: None. + key_act (str): Activation function for the key transform. Default: None. + value_act (str): Activation function for the value transform. Default: None. + has_attention_mask (bool): Specifies whether to use attention mask. Default: False. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.0. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + do_return_2d_tensor (bool): True for return 2d tensor. False for return 3d + tensor. Default: False. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertAttention. Default: mstype.float32. + """ + + def __init__(self, + batch_size, + from_tensor_width, + to_tensor_width, + from_seq_length, + to_seq_length, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + has_attention_mask=False, + attention_probs_dropout_prob=0.0, + use_one_hot_embeddings=False, + initializer_range=0.02, + do_return_2d_tensor=False, + use_relative_positions=False, + compute_type=mstype.float32): + + super(BertAttention, self).__init__() + self.batch_size = batch_size + self.from_seq_length = from_seq_length + self.to_seq_length = to_seq_length + self.num_attention_heads = num_attention_heads + self.size_per_head = size_per_head + self.has_attention_mask = has_attention_mask + self.use_relative_positions = use_relative_positions + + self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head)) + self.reshape = P.Reshape() + self.shape_from_2d = (-1, from_tensor_width) + self.shape_to_2d = (-1, to_tensor_width) + weight = TruncatedNormal(initializer_range) + units = num_attention_heads * size_per_head + self.query_layer = Dense_Thor(in_channels=from_tensor_width, + out_channels=units, + weight_init=weight, + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=query_act, + batch_size=batch_size).to_float(compute_type) + self.key_layer = Dense_Thor(in_channels=to_tensor_width, + out_channels=units, + weight_init=weight, + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=key_act, + batch_size=batch_size).to_float(compute_type) + self.value_layer = Dense_Thor(in_channels=to_tensor_width, + out_channels=units, + weight_init=weight, + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=value_act, + batch_size=batch_size).to_float(compute_type) + self.shape_from = (batch_size, from_seq_length, num_attention_heads, size_per_head) + self.shape_to = ( + batch_size, to_seq_length, num_attention_heads, size_per_head) + + self.matmul_trans_b = P.BatchMatMul(transpose_b=True) + self.multiply = P.Mul() + self.transpose = P.Transpose() + self.trans_shape = (0, 2, 1, 3) + self.trans_shape_relative = (2, 0, 1, 3) + self.trans_shape_position = (1, 2, 0, 3) + self.multiply_data = -10000.0 + self.batch_num = batch_size * num_attention_heads + self.matmul = P.BatchMatMul() + + self.softmax = nn.Softmax() + self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) + + if self.has_attention_mask: + self.expand_dims = P.ExpandDims() + self.sub = P.Sub() + self.add = P.TensorAdd() + self.cast = P.Cast() + self.get_dtype = P.DType() + if do_return_2d_tensor: + self.shape_return = (batch_size * from_seq_length, num_attention_heads * size_per_head) + else: + self.shape_return = (batch_size, from_seq_length, num_attention_heads * size_per_head) + + self.cast_compute_type = SaturateCast(dst_type=compute_type) + if self.use_relative_positions: + self._generate_relative_positions_embeddings = \ + RelaPosEmbeddingsGenerator(length=to_seq_length, + depth=size_per_head, + max_relative_position=16, + initializer_range=initializer_range, + use_one_hot_embeddings=use_one_hot_embeddings) + + def construct(self, from_tensor, to_tensor, attention_mask): + """construct of BertAttention""" + # reshape 2d/3d input tensors to 2d + from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d) + to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d) + query_out = self.query_layer(from_tensor_2d) + key_out = self.key_layer(to_tensor_2d) + value_out = self.value_layer(to_tensor_2d) + + query_layer = self.reshape(query_out, self.shape_from) + query_layer = self.transpose(query_layer, self.trans_shape) + key_layer = self.reshape(key_out, self.shape_to) + key_layer = self.transpose(key_layer, self.trans_shape) + + attention_scores = self.matmul_trans_b(query_layer, key_layer) + + # use_relative_position, supplementary logic + if self.use_relative_positions: + # 'relations_keys' = [F|T, F|T, H] + relations_keys = self._generate_relative_positions_embeddings() + relations_keys = self.cast_compute_type(relations_keys) + # query_layer_t is [F, B, N, H] + query_layer_t = self.transpose(query_layer, self.trans_shape_relative) + # query_layer_r is [F, B * N, H] + query_layer_r = self.reshape(query_layer_t, + (self.from_seq_length, + self.batch_num, + self.size_per_head)) + # key_position_scores is [F, B * N, F|T] + key_position_scores = self.matmul_trans_b(query_layer_r, + relations_keys) + # key_position_scores_r is [F, B, N, F|T] + key_position_scores_r = self.reshape(key_position_scores, + (self.from_seq_length, + self.batch_size, + self.num_attention_heads, + self.from_seq_length)) + # key_position_scores_r_t is [B, N, F, F|T] + key_position_scores_r_t = self.transpose(key_position_scores_r, + self.trans_shape_position) + attention_scores = attention_scores + key_position_scores_r_t + + attention_scores = self.multiply(self.scores_mul, attention_scores) + + if self.has_attention_mask: + attention_mask = self.expand_dims(attention_mask, 1) + multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), self.get_dtype(attention_scores)), + self.cast(attention_mask, self.get_dtype(attention_scores))) + + adder = self.multiply(multiply_out, self.multiply_data) + attention_scores = self.add(adder, attention_scores) + + attention_probs = self.softmax(attention_scores) + attention_probs = self.dropout(attention_probs) + + value_layer = self.reshape(value_out, self.shape_to) + value_layer = self.transpose(value_layer, self.trans_shape) + context_layer = self.matmul(attention_probs, value_layer) + + # use_relative_position, supplementary logic + if self.use_relative_positions: + # 'relations_values' = [F|T, F|T, H] + relations_values = self._generate_relative_positions_embeddings() + relations_values = self.cast_compute_type(relations_values) + # attention_probs_t is [F, B, N, T] + attention_probs_t = self.transpose(attention_probs, self.trans_shape_relative) + # attention_probs_r is [F, B * N, T] + attention_probs_r = self.reshape( + attention_probs_t, + (self.from_seq_length, + self.batch_num, + self.to_seq_length)) + # value_position_scores is [F, B * N, H] + value_position_scores = self.matmul(attention_probs_r, + relations_values) + # value_position_scores_r is [F, B, N, H] + value_position_scores_r = self.reshape(value_position_scores, + (self.from_seq_length, + self.batch_size, + self.num_attention_heads, + self.size_per_head)) + # value_position_scores_r_t is [B, N, F, H] + value_position_scores_r_t = self.transpose(value_position_scores_r, + self.trans_shape_position) + context_layer = context_layer + value_position_scores_r_t + + context_layer = self.transpose(context_layer, self.trans_shape) + context_layer = self.reshape(context_layer, self.shape_return) + + return context_layer + + +class BertSelfAttention(nn.Cell): + """ + Apply self-attention. + + Args: + batch_size (int): Batch size of input dataset. + seq_length (int): Length of input sequence. + hidden_size (int): Size of the bert encoder layers. + num_attention_heads (int): Number of attention heads. Default: 12. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one_hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + compute_type (:class:`mindspore.dtype`): Compute type in BertSelfAttention. Default: mstype.float32. + """ + + def __init__(self, + batch_size, + seq_length, + hidden_size, + num_attention_heads=12, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + compute_type=mstype.float32, + enable_fused_layernorm=False): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError("The hidden size (%d) is not a multiple of the number " + "of attention heads (%d)" % (hidden_size, num_attention_heads)) + + self.size_per_head = int(hidden_size / num_attention_heads) + + self.attention = BertAttention( + batch_size=batch_size, + from_tensor_width=hidden_size, + to_tensor_width=hidden_size, + from_seq_length=seq_length, + to_seq_length=seq_length, + num_attention_heads=num_attention_heads, + size_per_head=self.size_per_head, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + use_relative_positions=use_relative_positions, + has_attention_mask=True, + do_return_2d_tensor=True, + compute_type=compute_type) + + self.output = BertOutput(in_channels=hidden_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type, + enable_fused_layernorm=enable_fused_layernorm) + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + + def construct(self, input_tensor, attention_mask): + """construct of BertSelfAttention""" + input_tensor = self.reshape(input_tensor, self.shape) + attention_output = self.attention(input_tensor, input_tensor, attention_mask) + output = self.output(attention_output, input_tensor) + return output + + +class BertEncoderCell(nn.Cell): + """ + Encoder cells used in BertTransformer. + + Args: + batch_size (int): Batch size of input dataset. + hidden_size (int): Size of the bert encoder layers. Default: 768. + seq_length (int): Length of input sequence. Default: 512. + num_attention_heads (int): Number of attention heads. Default: 12. + intermediate_size (int): Size of intermediate layer. Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.02. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in attention. Default: mstype.float32. + """ + + def __init__(self, + batch_size, + hidden_size=768, + seq_length=512, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.02, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32, + enable_fused_layernorm=False): + super(BertEncoderCell, self).__init__() + self.attention = BertSelfAttention( + batch_size=batch_size, + hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + compute_type=compute_type, + enable_fused_layernorm=enable_fused_layernorm) + self.intermediate = Dense_Thor(in_channels=hidden_size, + out_channels=intermediate_size, + weight_init=TruncatedNormal(initializer_range), + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation=hidden_act, + batch_size=batch_size).to_float(compute_type) + self.output = BertOutput(in_channels=intermediate_size, + out_channels=hidden_size, + initializer_range=initializer_range, + dropout_prob=hidden_dropout_prob, + compute_type=compute_type, + enable_fused_layernorm=enable_fused_layernorm) + + def construct(self, hidden_states, attention_mask): + """construct of BertEncoderCell""" + # self-attention + attention_output = self.attention(hidden_states, attention_mask) + # feed construct + intermediate_output = self.intermediate(attention_output) + # add and normalize + output = self.output(intermediate_output, attention_output) + return output + + +class BertTransformer(nn.Cell): + """ + Multi-layer bert transformer. + + Args: + batch_size (int): Batch size of input dataset. + hidden_size (int): Size of the encoder layers. + seq_length (int): Length of input sequence. + num_hidden_layers (int): Number of hidden layers in encoder cells. + num_attention_heads (int): Number of attention heads in encoder cells. Default: 12. + intermediate_size (int): Size of intermediate layer in encoder cells. Default: 3072. + attention_probs_dropout_prob (float): The dropout probability for + BertAttention. Default: 0.1. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + hidden_dropout_prob (float): The dropout probability for BertOutput. Default: 0.1. + use_relative_positions (bool): Specifies whether to use relative positions. Default: False. + hidden_act (str): Activation function used in the encoder cells. Default: "gelu". + compute_type (:class:`mindspore.dtype`): Compute type in BertTransformer. Default: mstype.float32. + return_all_encoders (bool): Specifies whether to return all encoders. Default: False. + """ + + def __init__(self, + batch_size, + hidden_size, + seq_length, + num_hidden_layers, + num_attention_heads=12, + intermediate_size=3072, + attention_probs_dropout_prob=0.1, + use_one_hot_embeddings=False, + initializer_range=0.02, + hidden_dropout_prob=0.1, + use_relative_positions=False, + hidden_act="gelu", + compute_type=mstype.float32, + return_all_encoders=False, + enable_fused_layernorm=False): + super(BertTransformer, self).__init__() + self.return_all_encoders = return_all_encoders + + layers = [] + for _ in range(num_hidden_layers): + layer = BertEncoderCell(batch_size=batch_size, + hidden_size=hidden_size, + seq_length=seq_length, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=initializer_range, + hidden_dropout_prob=hidden_dropout_prob, + use_relative_positions=use_relative_positions, + hidden_act=hidden_act, + compute_type=compute_type, + enable_fused_layernorm=enable_fused_layernorm) + layers.append(layer) + + self.layers = nn.CellList(layers) + + self.reshape = P.Reshape() + self.shape = (-1, hidden_size) + self.out_shape = (batch_size, seq_length, hidden_size) + + def construct(self, input_tensor, attention_mask): + """construct of BertTransformer""" + prev_output = self.reshape(input_tensor, self.shape) + + all_encoder_layers = () + for layer_module in self.layers: + layer_output = layer_module(prev_output, attention_mask) + prev_output = layer_output + + if self.return_all_encoders: + layer_output = self.reshape(layer_output, self.out_shape) + all_encoder_layers = all_encoder_layers + (layer_output,) + + if not self.return_all_encoders: + prev_output = self.reshape(prev_output, self.out_shape) + all_encoder_layers = all_encoder_layers + (prev_output,) + return all_encoder_layers + + +class CreateAttentionMaskFromInputMask(nn.Cell): + """ + Create attention mask according to input mask. + + Args: + config (Class): Configuration for BertModel. + """ + + def __init__(self, config): + super(CreateAttentionMaskFromInputMask, self).__init__() + self.input_mask_from_dataset = config.input_mask_from_dataset + self.input_mask = None + + if not self.input_mask_from_dataset: + self.input_mask = initializer( + "ones", [config.batch_size, config.seq_length], mstype.int32).to_tensor() + + self.cast = P.Cast() + self.reshape = P.Reshape() + self.shape = (config.batch_size, 1, config.seq_length) + self.broadcast_ones = initializer( + "ones", [config.batch_size, config.seq_length, 1], mstype.float32).to_tensor() + self.batch_matmul = P.BatchMatMul() + + def construct(self, input_mask): + """construct of CreateAttentionMaskFromInputMask""" + if not self.input_mask_from_dataset: + input_mask = self.input_mask + + attention_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32) + return attention_mask + + +class BertModel(nn.Cell): + """ + Bidirectional Encoder Representations from Transformers. + + Args: + config (Class): Configuration for BertModel. + is_training (bool): True for training mode. False for eval mode. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + """ + + def __init__(self, + config, + is_training, + use_one_hot_embeddings=False): + super(BertModel, self).__init__() + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + self.input_mask_from_dataset = config.input_mask_from_dataset + self.token_type_ids_from_dataset = config.token_type_ids_from_dataset + self.batch_size = config.batch_size + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + self.embedding_size = config.hidden_size + self.token_type_ids = None + + self.last_idx = self.num_hidden_layers - 1 + output_embedding_shape = [self.batch_size, self.seq_length, + self.embedding_size] + + if not self.token_type_ids_from_dataset: + self.token_type_ids = initializer( + "zeros", [self.batch_size, self.seq_length], mstype.int32).to_tensor() + + self.bert_embedding_lookup = Embedding_Thor( + vocab_size=config.vocab_size, + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + name='embedding_table', + is_expand=True, + batch_size=batch_size, + damping=damping, + loss_scale=loss_scale, + frequency=1) + self.bert_embedding_postprocessor = EmbeddingPostprocessor( + embedding_size=self.embedding_size, + embedding_shape=output_embedding_shape, + use_relative_positions=config.use_relative_positions, + use_token_type=True, + token_type_vocab_size=config.type_vocab_size, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=0.02, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + self.bert_encoder = BertTransformer( + batch_size=self.batch_size, + hidden_size=self.hidden_size, + seq_length=self.seq_length, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=config.intermediate_size, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + use_one_hot_embeddings=use_one_hot_embeddings, + initializer_range=config.initializer_range, + hidden_dropout_prob=config.hidden_dropout_prob, + use_relative_positions=config.use_relative_positions, + hidden_act=config.hidden_act, + compute_type=config.compute_type, + return_all_encoders=True, + enable_fused_layernorm=config.enable_fused_layernorm) + + self.cast = P.Cast() + self.dtype = config.dtype + self.cast_compute_type = SaturateCast(dst_type=config.compute_type) + self.slice = P.StridedSlice() + + self.squeeze_1 = P.Squeeze(axis=1) + self.dense = Dense_Thor(in_channels=self.hidden_size, + out_channels=self.hidden_size, + weight_init=TruncatedNormal(config.initializer_range), + has_bias=True, + bias_init='zeros', + damping=damping, + loss_scale=loss_scale, + frequency=1, + activation="tanh", + batch_size=batch_size).to_float(config.compute_type) + self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config) + + def construct(self, input_ids, token_type_ids, input_mask): + """construct of BertModel""" + + # embedding + if not self.token_type_ids_from_dataset: + token_type_ids = self.token_type_ids + word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids) + embedding_output = self.bert_embedding_postprocessor(token_type_ids, + word_embeddings) + + # attention mask [batch_size, seq_length, seq_length] + attention_mask = self._create_attention_mask_from_input_mask(input_mask) + + # bert encoder + encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), + attention_mask) + + sequence_output = self.cast(encoder_output[self.last_idx], self.dtype) + + # pooler + sequence_slice = self.slice(sequence_output, + (0, 0, 0), + (self.batch_size, 1, self.hidden_size), + (1, 1, 1)) + first_token = self.squeeze_1(sequence_slice) + pooled_output = self.dense(first_token) + pooled_output = self.cast(pooled_output, self.dtype) + + return sequence_output, pooled_output, embedding_tables diff --git a/model_zoo/bert_thor/src/bert_net_config.py b/model_zoo/bert_thor/src/bert_net_config.py new file mode 100644 index 0000000000..043705f387 --- /dev/null +++ b/model_zoo/bert_thor/src/bert_net_config.py @@ -0,0 +1,89 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in dataset.py, run_pretrain.py +Including two kinds of network: \ +base: Goole BERT-base(the base version of BERT model). +large: BERT-NEZHA(a Chinese pretrained language model developed by Huawei, which introduced a improvement of \ + Functional Relative Posetional Encoding as an effective positional encoding scheme). +""" +import mindspore.common.dtype as mstype +from .bert_model import BertConfig +from .config import cfg + +if cfg.bert_network == 'base': + bert_net_cfg = BertConfig( + batch_size=cfg.Thor.batch_size, + seq_length=128, + vocab_size=21128, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + use_relative_positions=False, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float16 + ) +if cfg.bert_network == 'nezha': + bert_net_cfg = BertConfig( + batch_size=cfg.Thor.batch_size, + seq_length=128, + vocab_size=21128, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + use_relative_positions=True, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float16 + ) +if cfg.bert_network == 'large': + bert_net_cfg = BertConfig( + batch_size=cfg.Thor.batch_size, + seq_length=512, + vocab_size=30522, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + use_relative_positions=False, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float16, + enable_fused_layernorm=True + ) diff --git a/model_zoo/bert_thor/src/config.py b/model_zoo/bert_thor/src/config.py new file mode 100644 index 0000000000..9c1d5bf725 --- /dev/null +++ b/model_zoo/bert_thor/src/config.py @@ -0,0 +1,55 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in dataset.py, run_pretrain.py +""" +from easydict import EasyDict as edict + +cfg = edict({ + 'bert_network': 'large', + 'loss_scale_value': 65536, + 'scale_factor': 2, + 'scale_window': 1000, + 'optimizer': 'Thor', + 'AdamWeightDecay': edict({ + 'learning_rate': 3e-5, + 'end_learning_rate': 1e-10, + 'power': 5.0, + 'weight_decay': 1e-5, + 'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(), + 'eps': 1e-6, + 'warmup_steps': 10000, + }), + 'Lamb': edict({ + 'learning_rate': 3e-5, + 'end_learning_rate': 1e-10, + 'power': 10.0, + 'warmup_steps': 10000, + 'weight_decay': 0.01, + 'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(), + 'eps': 1e-6, + }), + 'Momentum': edict({ + 'learning_rate': 2e-5, + 'momentum': 0.9, + }), + 'Thor': edict({ + 'momentum': 0.9, + 'weight_decay': 5e-4, + 'loss_scale': 1, + 'frequency': 10, + 'batch_size': 8, + }), +}) diff --git a/model_zoo/bert_thor/src/dataset.py b/model_zoo/bert_thor/src/dataset.py new file mode 100644 index 0000000000..889e27694a --- /dev/null +++ b/model_zoo/bert_thor/src/dataset.py @@ -0,0 +1,128 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Data operations, will be used in run_pretrain.py +""" +import os + +import mindspore.common.dtype as mstype +import mindspore.dataset.engine.datasets as de +import mindspore.dataset.transforms.c_transforms as C +from mindspore import log as logger +from .bert_net_config import bert_net_cfg + + +def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): + """create train dataset""" + # apply repeat operations + files = os.listdir(data_dir) + data_files = [] + for file_name in files: + if "tfrecord" in file_name: + data_files.append(os.path.join(data_dir, file_name)) + data_files = sorted(data_files) + ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + ori_dataset_size = ds.get_dataset_size() + print('origin dataset size: ', ori_dataset_size) + type_cast_op = C.TypeCast(mstype.int32) + ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op) + ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op) + ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + # apply batch operations + ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) + logger.info("data size: {}".format(ds.get_dataset_size())) + logger.info("repeat count: {}".format(ds.get_repeat_count())) + return ds + + +def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", + data_file_path=None, schema_file_path=None): + """create finetune or evaluation dataset""" + type_cast_op = C.TypeCast(mstype.int32) + ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + if assessment_method == "Spearman_correlation": + type_cast_op_float = C.TypeCast(mstype.float32) + ds = ds.map(input_columns="label_ids", operations=type_cast_op_float) + else: + ds = ds.map(input_columns="label_ids", operations=type_cast_op) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.repeat(repeat_count) + # apply shuffle operation + buffer_size = 960 + ds = ds.shuffle(buffer_size=buffer_size) + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds + + +def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", + data_file_path=None, schema_file_path=None): + """create finetune or evaluation dataset""" + type_cast_op = C.TypeCast(mstype.int32) + ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + if assessment_method == "Spearman_correlation": + type_cast_op_float = C.TypeCast(mstype.float32) + ds = ds.map(input_columns="label_ids", operations=type_cast_op_float) + else: + ds = ds.map(input_columns="label_ids", operations=type_cast_op) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.repeat(repeat_count) + # apply shuffle operation + buffer_size = 960 + ds = ds.shuffle(buffer_size=buffer_size) + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds + + +def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): + """create finetune or evaluation dataset""" + type_cast_op = C.TypeCast(mstype.int32) + if is_training: + ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", + "start_positions", "end_positions", + "unique_ids", "is_impossible"]) + ds = ds.map(input_columns="start_positions", operations=type_cast_op) + ds = ds.map(input_columns="end_positions", operations=type_cast_op) + else: + ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="segment_ids", operations=type_cast_op) + ds = ds.map(input_columns="input_mask", operations=type_cast_op) + ds = ds.map(input_columns="input_ids", operations=type_cast_op) + ds = ds.repeat(repeat_count) + # apply shuffle operation + buffer_size = 960 + ds = ds.shuffle(buffer_size=buffer_size) + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + return ds diff --git a/model_zoo/bert_thor/src/dataset_helper.py b/model_zoo/bert_thor/src/dataset_helper.py new file mode 100644 index 0000000000..ff90d27e85 --- /dev/null +++ b/model_zoo/bert_thor/src/dataset_helper.py @@ -0,0 +1,177 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Dataset help for minddata dataset""" +import os + +from mindspore import context +from mindspore._checkparam import check_bool, check_int +from mindspore.parallel._utils import _get_device_num, _need_to_full +from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, _to_full_shapes + + +def _send_data(dataset, epoch_num): + """Engine dataset to write data to tdt queue.""" + if not hasattr(dataset, '__has_sent__'): + exec_dataset = dataset.__TRANSFER_DATASET__ + exec_dataset.send(epoch_num) + dataset.__has_sent__ = True + + +def _send_data_no_flag(dataset, epoch_num): + """Engine dataset to write data to tdt queue directly.""" + exec_dataset = dataset.__TRANSFER_DATASET__ + exec_dataset.send(epoch_num) + + +class DatasetHelper: + """ + Help function to use the Minddata dataset. + + According to different context, change the iter of dataset, to use the same for loop in different context. + + Note: + The iter of DatasetHelper will give one epoch data. + + Args: + dataset (DataSet): The training dataset iterator. + dataset_sink_mode (bool): If true use GetNext to fetch the data, or else feed the data from host. Default: True. + sink_size (int): Control the amount of data each sink. + If sink_size=-1, sink the complete dataset each epoch. + If sink_size>0, sink sink_size data each epoch. Default: -1. + + Examples: + >>> dataset_helper = DatasetHelper(dataset) + >>> for inputs in dataset_helper: + >>> outputs = network(*inputs) + """ + + def __init__(self, dataset, dataset_sink_mode=True, sink_size=-1, epoch_num=1, iter_first_order=0): + check_bool(dataset_sink_mode) + check_int(sink_size) + if sink_size < -1 or sink_size == 0: + raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size)) + + if dataset_sink_mode: + if context.get_context("enable_ge"): + iterclass = _DatasetIterGE + else: + if context.get_context("device_target") == "Ascend": + iterclass = _DatasetIterMSLoopSink + elif context.get_context("device_target") == "GPU": + ms_role = os.getenv("MS_ROLE") + if ms_role in ("MS_PSERVER", "MS_SCHED"): + iterclass = _DatasetIterPSLite + else: + iterclass = _DatasetIterMS + elif context.get_context("device_target") == "CPU": + raise RuntimeError("Currently dataset sink mode is not supported when the device target is CPU.") + self.iter = iterclass(dataset, sink_size, epoch_num, iter_first_order) + else: + iterclass = _DatasetIterNormal + self.iter = iterclass(dataset) + + def __iter__(self): + return self.iter.__iter__() + + # A temp solution for loop sink. Delete later + def types_shapes(self): + """Get the types and shapes from dataset on current config.""" + return self.iter.types_shapes() + + def sink_size(self): + """Get sink_size for every iteration.""" + return self.iter.get_sink_size() + + def stop_send(self): + """Free up resources about data sink.""" + self.iter.stop_send() + + +class _DatasetIter: + """Base iter for dataset helper""" + + def __init__(self, dataset, sink_size, epoch_num): + self.dataset = dataset + self.sink_size = sink_size + self.sink_count = 1 + + if not hasattr(dataset, '__TRANSFER_DATASET__'): + if hasattr(dataset, '__loop_size__'): + self.sink_size = dataset.__loop_size__ + dataset.__TRANSFER_DATASET__ = _exec_datagraph(dataset, self.sink_size) + dataset.__ME_INITED__ = dataset.__TRANSFER_DATASET__.queue_name + + if not hasattr(dataset, '__no_send__'): + _send_data(dataset, epoch_num) + else: + _send_data_no_flag(dataset, epoch_num) + + self.stop_send = dataset.__TRANSFER_DATASET__.stop_send + self.dataset_types, self.dataset_shapes = _get_types_and_shapes(dataset) + + def __iter__(self): + self.index = 0 + return self + + def __next__(self): + if self.index >= self.sink_count: + raise StopIteration() + self.index += 1 + return self.op() + + def types_shapes(self): + return self.dataset_types, self.dataset_shapes + + def get_sink_count(self, dataset, sink_size, iter_first_order): + sink_count = 1 + if hasattr(dataset, '__loop_size__'): + loop_size = dataset.__loop_size__ + iter_first_order + sink_count = int(sink_size / loop_size) * 2 + return sink_count + + def get_sink_size(self): + """get sink_size to device""" + sink_size = 1 + if hasattr(self.dataset, '__loop_size__'): + sink_size = self.dataset.__loop_size__ + else: + if context.get_context("enable_ge") or context.get_context("device_target") == "Ascend": + if self.sink_size > 0: + sink_size = self.sink_size + else: + sink_size = self.dataset.get_dataset_size() + return sink_size + + +class _DatasetIterMSLoopSink(_DatasetIter): + """Iter for context (device_target=Ascend)""" + + def __init__(self, dataset, sink_size, epoch_num, iter_first_order): + super().__init__(dataset, sink_size, epoch_num) + self.sink_count = self.get_sink_count(dataset, sink_size, iter_first_order) + ms_role = os.getenv("MS_ROLE") + if ms_role in ("MS_PSERVER", "MS_SCHED"): + self.sink_count = 1 + # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch, + # use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for + # compile is device_number times the batch dimension of tensors for run. Now only support LoopSink. + if _need_to_full(): + device_num = _get_device_num() + self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num) + + def op(): + return tuple() + + self.op = op diff --git a/model_zoo/bert_thor/src/evaluation_config.py b/model_zoo/bert_thor/src/evaluation_config.py new file mode 100644 index 0000000000..8aab4acf40 --- /dev/null +++ b/model_zoo/bert_thor/src/evaluation_config.py @@ -0,0 +1,54 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +config settings, will be used in finetune.py +""" + +from easydict import EasyDict as edict + +import mindspore.common.dtype as mstype +from .bert_model import BertConfig + +cfg = edict({ + 'task': 'NER', + 'num_labels': 41, + 'data_file': '', + 'schema_file': None, + 'finetune_ckpt': '', + 'use_crf': False, + 'clue_benchmark': False, +}) + +bert_net_cfg = BertConfig( + batch_size=8 if not cfg.clue_benchmark else 1, + seq_length=512, + vocab_size=30522, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + use_relative_positions=False, + input_mask_from_dataset=True, + token_type_ids_from_dataset=True, + dtype=mstype.float32, + compute_type=mstype.float16, +) diff --git a/model_zoo/bert_thor/src/fused_layer_norm.py b/model_zoo/bert_thor/src/fused_layer_norm.py new file mode 100644 index 0000000000..882c4c6978 --- /dev/null +++ b/model_zoo/bert_thor/src/fused_layer_norm.py @@ -0,0 +1,124 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""fused layernorm""" +import numpy as np + +import mindspore.common.dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter +from mindspore.nn.cell import Cell +from mindspore.ops import functional as F +from mindspore.ops import operations as P +from mindspore.ops.primitive import constexpr + +__all__ = ['FusedLayerNorm'] + + +@constexpr +def get_shape_for_norm(x_shape, begin_norm_axis): + print("input_shape: ", x_shape) + norm_shape = x_shape[begin_norm_axis:] + output_shape = (1, -1, 1, int(np.prod(norm_shape))) + print("output_shape: ", output_shape) + return output_shape + + +class FusedLayerNorm(Cell): + r""" + Applies Layer Normalization over a mini-batch of inputs. + + Layer normalization is widely used in recurrent neural networks. It applies + normalization over a mini-batch of inputs for each single training case as described + in the paper `Layer Normalization `_. Unlike batch + normalization, layer normalization performs exactly the same computation at training and + testing times. It can be described using the following formula. It is applied across all channels + and pixel but only one batch size. + + .. math:: + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + Args: + normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis + `begin_norm_axis ... R - 1`. + begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions + `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1. + begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters + will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with + the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1. + gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'ones'. + beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight. + The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', + 'he_uniform', etc. Default: 'zeros'. + use_batch_nrom (bool): Whether use batchnorm to preocess. + + Inputs: + - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`, + and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`. + + Outputs: + Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`. + + Examples: + >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) + >>> shape1 = x.shape[1:] + >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) + >>> m(x) + """ + + def __init__(self, + normalized_shape, + begin_norm_axis=-1, + begin_params_axis=-1, + gamma_init='ones', + beta_init='zeros', + use_batch_norm=False): + super(FusedLayerNorm, self).__init__() + if not isinstance(normalized_shape, (tuple, list)): + raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}." + .format(normalized_shape, type(normalized_shape))) + self.normalized_shape = normalized_shape + self.begin_norm_axis = begin_norm_axis + self.begin_params_axis = begin_params_axis + self.gamma = Parameter(initializer( + gamma_init, normalized_shape), name="gamma") + self.beta = Parameter(initializer( + beta_init, normalized_shape), name="beta") + self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis) + + self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5) + self.use_batch_norm = use_batch_norm + + def construct(self, input_x): + """construct of FusedLayerNorm""" + if self.use_batch_norm and self.training: + ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0) + zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0) + shape_x = F.shape(input_x) + norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis) + input_x = F.reshape(input_x, norm_shape) + output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None) + output = F.reshape(output, shape_x) + y = output * self.gamma + self.beta + else: + y, _, _ = self.layer_norm(input_x, self.gamma, self.beta) + return y + + def extend_repr(self): + """Display instance object as string.""" + s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma{}, beta={}'.format( + self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta) + return s diff --git a/model_zoo/bert_thor/src/grad_reducer_thor1.py b/model_zoo/bert_thor/src/grad_reducer_thor1.py new file mode 100644 index 0000000000..709b0b73df --- /dev/null +++ b/model_zoo/bert_thor/src/grad_reducer_thor1.py @@ -0,0 +1,184 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""grad_reducer_thor""" +import mindspore.common.dtype as mstype +from mindspore.communication.management import GlobalComm, get_group_size +from mindspore.nn.cell import Cell +from mindspore.ops import functional as F, composite as C, operations as P +from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp + +reduce_opt = C.MultitypeFuncGraph("reduce_opt") + +_all_reduce_G = AllReduce() + + +def _init_optimizer_allreduce(group): + global _all_reduce_G + _all_reduce_G = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) + _all_reduce_G.add_prim_attr('fusion', group) + + +@reduce_opt.register("Function", "Number", "Tensor") +def _tensors_allreduce_mean(mul, degree, grad): + degree = F.scalar_cast(degree, F.dtype(grad)) + grad = _all_reduce_G(grad) + cast_op = P.Cast() + return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad))) + + +@reduce_opt.register("Bool", "Tensor") +def _tensors_allreduce(allreduce_filter, grad): + if allreduce_filter: + return _all_reduce_G(grad) + return grad + + +_get_datatype = C.MultitypeFuncGraph("_get_datatype") + + +@_get_datatype.register("Tensor") +def _tensors_get_datatype(grad): + """ + Acquire gradient datatype. + + Args: + grad (Tensor): The gradient tensor before operation. + + Returns: + mstype, the datatype of gradient. + """ + return F.dtype(grad) + + +_cast_datatype = C.MultitypeFuncGraph("_cast_datatype") + + +@_cast_datatype.register("TypeType", "Tensor") +def _tensors_cast_datatype(datatype, grad): + """ + Cast gradient to datatype. + + Args: + datatype (mstype): the destination datatype of gradient. + grad (Tensor): The gradient tensor before operation. + + Returns: + Tensor, the gradient tensor after operation. + """ + return F.cast(grad, datatype) + + +class DistributedGradReducerThor1(Cell): + """ + A distributed optimizer. + + Constructs a gradient reducer Cell, which applies communication and average operations on + single-process gradient values. + + Args: + parameters (list): the parameters to be updated. + mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients. Default: False. + degree (int): The mean coefficient. Usually it equals to device number. Default: None. + + Raises: + ValueError: If degree is not a int or less than 0. + + Examples: + >>> from mindspore.communication import init, get_group_size + >>> from mindspore.ops import composite as C + >>> from mindspore.ops import operations as P + >>> from mindspore.ops import functional as F + >>> from mindspore import context + >>> from mindspore import nn + >>> from mindspore import ParallelMode, ParameterTuple + >>> + >>> device_id = int(os.environ["DEVICE_ID"]) + >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True, + >>> device_id=int(device_id), enable_hccl=True) + >>> init() + >>> context.reset_auto_parallel_context() + >>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL) + >>> + >>> + >>> class TrainingWrapper(nn.Cell): + >>> def __init__(self, network, optimizer, sens=1.0): + >>> super(TrainingWrapper, self).__init__(auto_prefix=False) + >>> self.network = network + >>> self.network.add_flags(defer_inline=True) + >>> self.weights = ParameterTuple(network.trainable_params()) + >>> self.optimizer = optimizer + >>> self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True) + >>> self.sens = sens + >>> self.reducer_flag = False + >>> self.grad_reducer = None + >>> self.parallel_mode = context.get_auto_parallel_context("parallel_mode") + >>> if self.parallel_mode in [ParallelMode.DATA_PARALLEL, + >>> ParallelMode.HYBRID_PARALLEL]: + >>> self.reducer_flag = True + >>> if self.reducer_flag: + >>> mean = context.get_auto_parallel_context("mirror_mean") + >>> if mean.get_device_num_is_set(): + >>> degree = context.get_auto_parallel_context("device_num") + >>> else: + >>> degree = get_group_size() + >>> self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) + >>> + >>> def construct(self, *args): + >>> weights = self.weights + >>> loss = self.network(*args) + >>> sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) + >>> grads = self.grad(self.network, weights)(*args, sens) + >>> if self.reducer_flag: + >>> # apply grad reducer on grads + >>> grads = self.grad_reducer(grads) + >>> return F.depend(loss, self.optimizer(grads)) + >>> + >>> network = Net() + >>> optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> train_cell = TrainingWrapper(network, optimizer) + >>> inputs = Tensor(np.ones([16, 16]).astype(np.float32)) + >>> label = Tensor(np.zeros([16, 16]).astype(np.float32)) + >>> grads = train_cell(inputs, label) + """ + + def __init__(self, parameters, group, mean=True, degree=None): + super(DistributedGradReducerThor1, self).__init__(auto_prefix=False) + self.hyper_map = C.HyperMap() + self.mul = P.Mul() + if degree is None: + self.degree = get_group_size() + else: + if not isinstance(degree, int) or degree <= 0: + raise ValueError("Parameter 'degree' in DistributedGradReducer should large than 0 and be int") + self.degree = degree + self.mean = mean + self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters) + _init_optimizer_allreduce(group) + + def construct(self, grads): + """construct of DistributedGradReducerThor1""" + # In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the + # result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce, + # and cast back after the operation. + datatypes = self.hyper_map(F.partial(_get_datatype), grads) + grads = self.hyper_map(F.partial(_cast_datatype, mstype.float32), grads) + + if self.mean: + new_grad = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), grads) + else: + new_grad = self.hyper_map(F.partial(reduce_opt), self.allreduce_filter, grads) + + new_grad = self.hyper_map(F.partial(_cast_datatype), datatypes, new_grad) + return new_grad diff --git a/model_zoo/bert_thor/src/lr_generator.py b/model_zoo/bert_thor/src/lr_generator.py new file mode 100644 index 0000000000..c2416e9b81 --- /dev/null +++ b/model_zoo/bert_thor/src/lr_generator.py @@ -0,0 +1,70 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""learning rate generator""" +import numpy as np + +from mindspore.common.tensor import Tensor + + +def get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps, poly_power): + """ + generate learning rate array + + Args: + lr_init(float): init learning rate + lr_end(float): end learning rate + lr_max(float): max learning rate + warmup_steps(int): number of warmup epochs + total_steps(int): total epoch of training + poly_power(int): poly learning rate power + + Returns: + np.array, learning rate array + """ + lr_each_step = [] + if warmup_steps != 0: + inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps) + else: + inc_each_step = 0 + for i in range(total_steps): + if i < warmup_steps: + lr = float(lr_init) + inc_each_step * float(i) + else: + base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps))) + lr = float(lr_max - lr_end) * (base ** poly_power) + lr = lr + lr_end + if lr < 0.0: + lr = 0.0 + lr_each_step.append(lr) + + learning_rate = np.array(lr_each_step).astype(np.float32) + current_step = global_step + learning_rate = learning_rate[current_step:] + return learning_rate + + +# bert kfac hyperparam setting +def get_bert_lr(): + learning_rate = Tensor( + get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=4e-4, warmup_steps=0, total_steps=30000, + poly_power=1)) + return learning_rate + + +def get_bert_damping(): + damping = Tensor( + get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=5e-2, warmup_steps=0, total_steps=30000, + poly_power=1)) + return damping diff --git a/model_zoo/bert_thor/src/model_thor.py b/model_zoo/bert_thor/src/model_thor.py new file mode 100644 index 0000000000..6a687f2791 --- /dev/null +++ b/model_zoo/bert_thor/src/model_thor.py @@ -0,0 +1,784 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Model.""" +import math +import os +from collections.abc import Iterable + +import numpy as np +from mindspore._c_expression import init_exec_dataset + +from mindspore import context +from mindspore import log as logger +from mindspore import nn +from mindspore._checkparam import check_input_data, check_output_data, check_int_positive, check_bool, check_int +from mindspore.common import dtype as mstype +from mindspore.common.dtype import pytype_to_dtype +from mindspore.common.tensor import Tensor +from mindspore.nn.metrics import Loss +from mindspore.nn.metrics import get_metrics +from mindspore.nn.wrap.cell_wrapper import _VirtualDatasetCell +from mindspore.parallel._utils import _get_parallel_mode, _get_device_num, _get_global_rank, \ + _get_parameter_broadcast, _device_number_check, _parameter_broadcast_check +from mindspore.parallel._utils import _need_to_full +from mindspore.train import amp +from mindspore.train._utils import _to_full_tensor +from mindspore.train.callback import _InternalCallbackParam, RunContext, _CallbackManager +from mindspore.train.parallel_utils import ParallelMode +from .dataset_helper import DatasetHelper + + +def _convert_type(types): + """ + Convert from numpy type to tensor type. + + Args: + types (list): Numpy type list of element in dataset. + + Returns: + list, list of element in dataset. + """ + ms_types = [] + for np_type in types: + ms_type = pytype_to_dtype(np_type) + ms_types.append(ms_type) + return ms_types + + +def _get_types_and_shapes(dataset): + """Get dataset types and shapes.""" + dataset_types = _convert_type(dataset.output_types()) + dataset_shapes = dataset.output_shapes() + return dataset_types, dataset_shapes + + +def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'): + """Initialize and execute the dataset graph.""" + batch_size = exec_dataset.get_batch_size() + input_indexs = exec_dataset.input_indexs + + # transform data format + dataset_types, dataset_shapes = _get_types_and_shapes(exec_dataset) + init_exec_dataset(exec_dataset.__ME_INITED__, + dataset_size, + batch_size, + dataset_types, + dataset_shapes, + input_indexs, + phase=phase, + need_run=False) + + +class Model: + """ + High-Level API for Training or Testing. + + `Model` groups layers into an object with training and inference features. + + Args: + network (Cell): The training or testing network. + loss_fn (Cell): Objective function, if loss_fn is None, the + network should contain the logic of loss and grads calculation, and the logic + of parallel if needed. Default: None. + optimizer (Cell): Optimizer for updating the weights. Default: None. + metrics (Union[dict, set]): Dict or set of metrics to be evaluated by the model during + training and testing. eg: {'accuracy', 'recall'}. Default: None. + eval_network (Cell): Network for evaluation. If not defined, `network` and `loss_fn` would be wrapped as + `eval_network`. Default: None. + eval_indexes (list): In case of defining the `eval_network`, if `eval_indexes` is None, all outputs of + `eval_network` would be passed to metrics, otherwise `eval_indexes` must contain three + elements, representing the positions of loss value, predict value and label, the loss + value would be passed to `Loss` metric, predict value and label would be passed to other + metric. Default: None. + amp_level (str): Option for argument `level` in `mindspore.amp.build_train_network`, level for mixed + precision training. Supports [O0, O2, O3]. Default: "O0". + + - O0: Do not change. + - O2: Cast network to float16, keep batchnorm run in float32, using dynamic loss scale. + - O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'. + + O2 is recommended on GPU, O3 is recommended on Ascend. + + loss_scale_manager (Union[None, LossScaleManager]): If None, not scale the loss, or else + scale the loss by LossScaleManager. If it is set, overwrite the level setting. It's a eyword argument. + e.g. Use `loss_scale_manager=None` to set the value. + keep_batchnorm_fp32 (bool): Keep Batchnorm run in `float32`. If set, overwrite the level setting. Default: True. + + Examples: + >>> class Net(nn.Cell): + >>> def __init__(self): + >>> super(Net, self).__init__() + >>> self.conv = nn.Conv2d(3, 64, 3, has_bias=False, weight_init='normal') + >>> self.bn = nn.BatchNorm2d(64) + >>> self.relu = nn.ReLU() + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Dense(64*224*224, 12) # padding=0 + >>> + >>> def construct(self, x): + >>> x = self.conv(x) + >>> x = self.bn(x) + >>> x = self.relu(x) + >>> x = self.flatten(x) + >>> out = self.fc(x) + >>> return out + >>> + >>> net = Net() + >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) + >>> dataset = get_dataset() + >>> model.train(2, dataset) + """ + + def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None, + eval_indexes=None, amp_level="O0", frequency=278, stop_epoch=100, **kwargs): + self._network = network + self._loss_fn = loss_fn + self._optimizer = optimizer + self._loss_scale_manager = None + self._loss_scale_manager_set = False + self._keep_bn_fp32 = True + self._check_kwargs(kwargs) + self._amp_level = amp_level + self._process_amp_args(kwargs) + self._parallel_mode = _get_parallel_mode() + self._device_number = _get_device_num() + self._global_rank = _get_global_rank() + self._parameter_broadcast = _get_parameter_broadcast() + self._frequency = frequency + self._stop_epoch = stop_epoch + + self._train_network = self._build_train_network() + self._build_eval_network(metrics, eval_network, eval_indexes) + self._build_predict_network() + + def _process_amp_args(self, kwargs): + if self._amp_level in ["O0", "O3"]: + self._keep_bn_fp32 = False + if 'keep_batchnorm_fp32' in kwargs: + self._keep_bn_fp32 = kwargs['keep_batchnorm_fp32'] + if 'loss_scale_manager' in kwargs: + self._loss_scale_manager = kwargs['loss_scale_manager'] + self._loss_scale_manager_set = True + + def _check_kwargs(self, kwargs): + for arg in kwargs: + if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32']: + raise ValueError(f"Unsupport arg '{arg}'") + + def _build_train_network(self): + """Build train network""" + network = self._network + if self._optimizer: + if self._loss_scale_manager_set: + network = amp.build_train_network(network, + self._optimizer, + self._loss_fn, + level=self._amp_level, + loss_scale_manager=self._loss_scale_manager, + keep_batchnorm_fp32=self._keep_bn_fp32) + else: + network = amp.build_train_network(network, + self._optimizer, + self._loss_fn, + level=self._amp_level, + keep_batchnorm_fp32=self._keep_bn_fp32) + elif self._loss_fn: + network = nn.WithLossCell(network, self._loss_fn) + # If need to check if loss_fn is not None, but optimizer is None + + if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + network.set_auto_parallel() + return network + + def _build_eval_network(self, metrics, eval_network, eval_indexes): + """Build the network for evaluation.""" + self._metric_fns = get_metrics(metrics) + if not self._metric_fns: + return + + if eval_network is not None: + if eval_indexes is not None and not (isinstance(eval_indexes, list) and len(eval_indexes) == 3): + raise ValueError("Eval_indexes must be a list or None. If eval_indexes is a list, length of it \ + must be three. But got {}".format(eval_indexes)) + + self._eval_network = eval_network + self._eval_indexes = eval_indexes + else: + if self._loss_fn is None: + raise ValueError("loss_fn can not be None.") + self._eval_network = nn.WithEvalCell(self._network, self._loss_fn, self._amp_level == "O2") + self._eval_indexes = [0, 1, 2] + + if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + if self._optimizer: + self._eval_network = _VirtualDatasetCell(self._eval_network) + self._eval_network.set_auto_parallel() + + def _build_predict_network(self): + """Build the network for prediction.""" + self._predict_network = self._network + if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + self._predict_network = _VirtualDatasetCell(self._network) + self._predict_network.set_auto_parallel() + + def _clear_metrics(self): + """Clear metrics local values.""" + for metric in self._metric_fns.values(): + metric.clear() + + def _update_metrics(self, outputs): + """Update metrics local values.""" + if not isinstance(outputs, tuple): + raise ValueError("The `outputs` is not tuple.") + + if self._eval_indexes is not None and len(outputs) < 3: + raise ValueError("The length of `outputs` must be greater than or equal to 3, \ + but got {}".format(len(outputs))) + + for metric in self._metric_fns.values(): + if self._eval_indexes is None: + metric.update(*outputs) + else: + if isinstance(metric, Loss): + metric.update(outputs[self._eval_indexes[0]]) + else: + metric.update(outputs[self._eval_indexes[1]], outputs[self._eval_indexes[2]]) + + def _get_metrics(self): + """Get metrics local values.""" + metrics = dict() + for key, value in self._metric_fns.items(): + metrics[key] = value.eval() + return metrics + + def _get_scaling_sens(self): + """get the scaling sens""" + scaling_sens = 1 + if self._loss_scale_manager is not None: + scaling_sens = self._loss_scale_manager.get_loss_scale() + if self._parallel_mode == ParallelMode.DATA_PARALLEL: + scaling_sens /= self._device_number + return scaling_sens + + def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1, epoch_num=1, + iter_first_order=9): + """Initializes dataset.""" + need_wrap = False + if dataset_sink_mode: + # remove later to deal with loop sink + if not hasattr(dataset, '__ME_INITED__') and context.get_context("device_target") == "Ascend" \ + and not context.get_context("enable_ge"): + need_wrap = True + + if not is_train: + dataset.__loop_size__ = 1 + + dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num, iter_first_order) + + # remove later to deal with loop sink + if need_wrap: + network = nn.DataWrapper(network, *(dataset_helper.types_shapes()), dataset.__ME_INITED__) + network.set_train(is_train) + network.phase = phase + + if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): + network.set_auto_parallel() + + return dataset_helper, network + + def init(self, train_dataset=None, valid_dataset=None): + """ + Initializes compute graphs and data graphs with sink mode. + + Note: + Pre-init process only supports `GRAPH_MODE` and `Ascend` target currently. + + Args: + train_dataset (Dataset): A training dataset iterator. If define `train_dataset`, training graphs will be + initialized. Default: None. + valid_dataset (Dataset): A evaluating dataset iterator. If define `valid_dataset`, evaluation graphs will + be initialized, and `metrics` in `Model` can not be None. Default: None. + + Examples: + >>> train_dataset = get_train_dataset() + >>> valid_dataset = get_valid_dataset() + >>> net = Net() + >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics={'acc'}) + >>> model.init(train_dataset, valid_dataset) + >>> model.train(2, train_dataset) + >>> model.eval(valid_dataset) + """ + if context.get_context("mode") != context.GRAPH_MODE or context.get_context("device_target") != "Ascend": + raise RuntimeError('Pre-init process only supports GRAPH MODE and Ascend target currently.') + + if not train_dataset and not valid_dataset: + raise ValueError('Both train_dataset and valid_dataset can not be None or empty.') + + _device_number_check(self._parallel_mode, self._device_number) + + if train_dataset: + _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast) + self._train_network.set_train() + self._train_network.phase = 'train' + + if self._parameter_broadcast: + self._train_network.set_broadcast_flag() + train_dataset.__no_send__ = True + train_dataset_helper, train_network = self._exec_preprocess(self._train_network, + is_train=True, + phase='train', + dataset=train_dataset, + dataset_sink_mode=True) + self._train_network = train_network + for inputs in train_dataset_helper: + self._train_network.compile(*inputs) + break + + if valid_dataset: + if not self._metric_fns: + raise RuntimeError('If define `valid_dataset`, metric fn can not be None or empty.') + + self._eval_network.set_train(False) + self._eval_network.phase = 'eval' + valid_dataset.__no_send__ = True + valid_dataset_helper, eval_network = self._exec_preprocess(self._eval_network, + is_train=False, + phase='eval', + dataset=valid_dataset, + dataset_sink_mode=True) + self._eval_network = eval_network + for inputs in valid_dataset_helper: + self._eval_network.compile(*inputs) + break + + def _train(self, epoch, train_dataset, callbacks=None, dataset_sink_mode=True, sink_size=-1): + """ + Training. + + Args: + epoch (int): Total number of iterations on the data. + train_dataset (Dataset): A training dataset iterator. If there is no + loss_fn, a tuple with multiply data (data1, data2, data3, ...) will be + returned and passed to the network. Otherwise, a tuple (data, label) will + be returned, and the data and label are passed to the network and loss + function respectively. + callbacks (list): List of callback object. Callbacks which should be executed while training. Default: None. + dataset_sink_mode (bool): Determines whether to pass the data through dataset channel. Default: True. + Configure pynative mode, the training process will be performed with + dataset not sink. + sink_size (int): Control the amount of data each sink. Default: -1. + """ + epoch = check_int_positive(epoch) + self._train_network.set_train() + + if self._parameter_broadcast: + self._train_network.set_broadcast_flag() + + cb_params = _InternalCallbackParam() + cb_params.train_network = self._train_network + cb_params.epoch_num = epoch + if dataset_sink_mode and sink_size > 0: + cb_params.batch_num = sink_size + else: + cb_params.batch_num = train_dataset.get_dataset_size() + cb_params.mode = "train" + cb_params.loss_fn = self._loss_fn + cb_params.optimizer = self._optimizer + cb_params.parallel_mode = self._parallel_mode + cb_params.device_number = self._device_number + cb_params.train_dataset = train_dataset + cb_params.list_callback = self._transform_callbacks(callbacks) + cb_params.train_dataset_element = None + cb_params.network = self._network + ms_role = os.getenv("MS_ROLE") + if ms_role in ("MS_PSERVER", "MS_SCHED"): + epoch = 1 + + # build callback list + with _CallbackManager(callbacks) as list_callback: + if not dataset_sink_mode: + self._train_process(epoch, train_dataset, list_callback, cb_params) + elif context.get_context("mode") == context.PYNATIVE_MODE: + logger.warning("The pynative mode cannot support dataset sink mode currently." + "So the training process will be performed with dataset not sink.") + self._train_process(epoch, train_dataset, list_callback, cb_params) + else: + self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size) + + @staticmethod + def _transform_callbacks(callbacks): + """Transform callback to a list.""" + if callbacks is None: + return [] + + if isinstance(callbacks, Iterable): + return list(callbacks) + + return [callbacks] + + def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1): + """ + Training process. The data would be passed to network through dataset channel. + + Args: + epoch (int): Total number of iterations on the data. + train_dataset (Dataset): A training dataset iterator. If there is no + loss_fn, a tuple with multiply data (data1, data2, data3, ...) should be + returned and passed to the network. Otherwise, a tuple (data, label) should + be returned, and the data and label are passed to the network and loss + function respectively. + list_callback (Callback): Executor of callback list. Default: None. + cb_params (_InternalCallbackParam): Callback parameters. Default: None. + sink_size (int): Control the amount of data each sink. Default: -1. + """ + if sink_size == -1: + epoch_num = epoch + else: + epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size()) + + iter_first_order = self._frequency - 1 + iter_second_order = 1 + train_dataset.__loop_size__ = iter_second_order + dataset_helper, train_network = self._exec_preprocess(self._train_network, + is_train=True, + phase='train', + dataset=train_dataset, + dataset_sink_mode=True, + sink_size=sink_size, + epoch_num=epoch_num, + iter_first_order=iter_first_order) + self._train_network = train_network + cb_params.train_network = self._train_network + cb_params.cur_step_num = 0 + + run_context = RunContext(cb_params) + list_callback.begin(run_context) + + # used to stop training for early stop, such as stopAtTIme or stopATStep + should_stop = False + has_do_dataset_init = False + switch_branch_one = True + train_network_init_flag = True + for i in range(epoch): + cb_params.cur_epoch_num = i + 1 + list_callback.epoch_begin(run_context) + + # for data sink dataset_helper only iter once, other wise iter epoch_size times. + for inputs in dataset_helper: + if _need_to_full(): + inputs = _to_full_tensor(inputs, self._device_number, self._global_rank) + list_callback.step_begin(run_context) + if switch_branch_one: + cb_params.cur_step_num += dataset_helper.sink_size() + if train_network_init_flag: + self._train_network.add_flags_recursive(thor=True) + self._train_network.phase = 'train0' + else: + cb_params.cur_step_num += iter_first_order + if train_network_init_flag: + self._train_network.add_flags_recursive(thor=False) + train_network_init_flag = False + self._train_network.phase = 'train1' + if not has_do_dataset_init: + _exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset') + has_do_dataset_init = True + switch_branch_one = not switch_branch_one + outputs = self._train_network(*inputs) + cb_params.net_outputs = outputs + list_callback.step_end(run_context) + + list_callback.epoch_end(run_context) + should_stop = should_stop or run_context.get_stop_requested() + if should_stop: + break + dataset_helper.stop_send() + + list_callback.end(run_context) + + def _train_process(self, epoch, train_dataset, list_callback=None, cb_params=None): + """ + Training process. The data would be passed to network directly. + + Args: + epoch (int): Total number of iterations on the data. + train_dataset (Dataset): A training dataset iterator. If there is no + loss_fn, a tuple with multiply data (data1, data2, data3, ...) should be + returned and passed to the network. Otherwise, a tuple (data, label) should + be returned, and the data and label are passed to the network and loss + function respectively. + list_callback (Callback): Executor of callback list. Default: None. + cb_params (_InternalCallbackParam): Callback parameters. Default: None. + """ + dataset_helper, _ = self._exec_preprocess(self._train_network, + is_train=True, + phase='train', + dataset=train_dataset, + dataset_sink_mode=False) + cb_params.cur_step_num = 0 + run_context = RunContext(cb_params) + list_callback.begin(run_context) + # used to stop training for early stop, such as stopAtTIme or stopATStep + should_stop = False + + for i in range(epoch): + cb_params.cur_epoch_num = i + 1 + + list_callback.epoch_begin(run_context) + + for next_element in dataset_helper: + len_element = len(next_element) + if self._loss_fn and len_element != 2: + raise ValueError("when loss_fn is not None, train_dataset should" + "return two elements, but got {}".format(len_element)) + cb_params.cur_step_num += 1 + list_callback.step_begin(run_context) + + overflow = False + if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update(): + scaling_sens = self._get_scaling_sens() + next_element = tuple(next_element) + (Tensor(scaling_sens, mstype.float32),) + + cb_params.train_dataset_element = next_element + outputs = self._train_network(*next_element) + cb_params.net_outputs = outputs + if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update(): + _, overflow, _ = outputs + overflow = np.all(overflow.asnumpy()) + self._loss_scale_manager.update_loss_scale(overflow) + + list_callback.step_end(run_context) + should_stop = should_stop or run_context.get_stop_requested() + if should_stop: + break + + train_dataset.reset() + + list_callback.epoch_end(run_context) + should_stop = should_stop or run_context.get_stop_requested() + if should_stop: + break + + list_callback.end(run_context) + + def train(self, epoch, train_dataset, callbacks=None, dataset_sink_mode=True, sink_size=-1): + """ + Training API where the iteration is controlled by python front-end. + + When setting pynative mode, the training process will be performed with dataset not sink. + + Note: + CPU is not supported when dataset_sink_mode is true. + If dataset_sink_mode is True, epoch of training should be equal to the count of repeat + operation in dataset processing. Otherwise, errors could occur since the amount of data + is not the amount training requires. + If dataset_sink_mode is True, data will be sent to device. If device is Ascend, features + of data will be transferred one by one. The limitation of data transmission per time is 256M. + + Args: + epoch (int): Total number of iterations on the data. + train_dataset (Dataset): A training dataset iterator. If there is no + loss_fn, a tuple with multiply data (data1, data2, data3, ...) should be + returned and passed to the network. Otherwise, a tuple (data, label) should + be returned, and the data and label are passed to the network and loss + function respectively. + callbacks (list): List of callback object. Callbacks which should be excuted while training. Default: None. + dataset_sink_mode (bool): Determines whether to pass the data through dataset channel. Default: True. + Configure pynative mode, the training process will be performed with + dataset not sink. + sink_size (int): Control the amount of data each sink. + If sink_size=-1, sink the complete dataset each epoch. + If sink_size>0, sink sink_size data each epoch. + If dataset_sink_mode is False, set sink_size invalid. Default: -1. + + Examples: + >>> dataset = get_dataset() + >>> net = Net() + >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + >>> loss_scale_manager = FixedLossScaleManager() + >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9) + >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None, loss_scale_manager=loss_scale_manager) + >>> model.train(2, dataset) + """ + check_bool(dataset_sink_mode) + check_int(sink_size) + if sink_size < -1 or sink_size == 0: + raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size)) + + _device_number_check(self._parallel_mode, self._device_number) + _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast) + + self._train(epoch, + train_dataset, + callbacks=callbacks, + dataset_sink_mode=dataset_sink_mode, + sink_size=sink_size) + + def _eval_dataset_sink_process(self, valid_dataset, list_callback=None, cb_params=None): + """ + Evaluation. The data would be passed to network through dataset channel. + + Args: + valid_dataset (Dataset): Dataset to evaluate the model. + list_callback (Callback): Executor of callback list. Default: None. + cb_params (_InternalCallbackParam): Callback parameters. Default: None. + + Returns: + Dict, returns the loss value & metrics values for the model in test mode. + """ + run_context = RunContext(cb_params) + + dataset_helper, eval_network = self._exec_preprocess(self._eval_network, + is_train=False, + phase='eval', + dataset=valid_dataset, + dataset_sink_mode=True) + self._eval_network = eval_network + cb_params.eval_network = self._eval_network + list_callback.begin(run_context) + + for inputs in dataset_helper: + cb_params.cur_step_num += 1 + list_callback.step_begin(run_context) + + outputs = self._eval_network(*inputs) + + cb_params.net_outputs = outputs + list_callback.step_end(run_context) + self._update_metrics(outputs) + + metrics = self._get_metrics() + cb_params.metrics = metrics + list_callback.end(run_context) + + return metrics + + def _eval_process(self, valid_dataset, list_callback=None, cb_params=None): + """ + Evaluation. The data would be passed to network directly. + + Args: + valid_dataset (Dataset): Dataset to evaluate the model. + list_callback (Callback): Executor of callback list. Default: None. + cb_params (_InternalCallbackParam): Callback parameters. Default: None. + + Returns: + Dict, returns the loss value & metrics values for the model in test mode. + """ + run_context = RunContext(cb_params) + list_callback.begin(run_context) + + dataset_helper, _ = self._exec_preprocess(self._eval_network, + is_train=False, + phase='eval', + dataset=valid_dataset, + dataset_sink_mode=False) + for next_element in dataset_helper: + cb_params.cur_step_num += 1 + list_callback.step_begin(run_context) + outputs = self._eval_network(*next_element) + cb_params.net_outputs = outputs + list_callback.step_end(run_context) + self._update_metrics(outputs) + + valid_dataset.reset() + + metrics = self._get_metrics() + cb_params.metrics = metrics + list_callback.end(run_context) + return metrics + + def eval(self, valid_dataset, callbacks=None, dataset_sink_mode=True): + """ + Evaluation API where the iteration is controlled by python front-end. + + Configure to pynative mode, the evaluation will be performed with dataset non-sink mode. + + Note: + CPU is not supported when dataset_sink_mode is true. + If dataset_sink_mode is True, data will be sent to device. If device is Ascend, features + of data will be transferred one by one. The limitation of data transmission per time is 256M. + + Args: + valid_dataset (Dataset): Dataset to evaluate the model. + callbacks (list): List of callback object. Callbacks which should be excuted + while training. Default: None. + dataset_sink_mode (bool): Determines whether to pass the data through dataset channel. Default: True. + + Returns: + Dict, returns the loss value & metrics values for the model in test mode. + + Examples: + >>> dataset = get_dataset() + >>> net = Net() + >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + >>> model = Model(net, loss_fn=loss, optimizer=None, metrics={'acc'}) + >>> model.eval(dataset) + """ + check_bool(dataset_sink_mode) + _device_number_check(self._parallel_mode, self._device_number) + if not self._metric_fns: + raise ValueError("metric fn can not be None or empty.") + + cb_params = _InternalCallbackParam() + cb_params.eval_network = self._eval_network + cb_params.valid_dataset = valid_dataset + cb_params.batch_num = valid_dataset.get_dataset_size() + cb_params.mode = "eval" + cb_params.cur_step_num = 0 + cb_params.list_callback = self._transform_callbacks(callbacks) + cb_params.network = self._network + + self._eval_network.set_train(mode=False) + self._eval_network.phase = 'eval' + + self._clear_metrics() + + with _CallbackManager(callbacks) as list_callback: + if dataset_sink_mode: + return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) + return self._eval_process(valid_dataset, list_callback, cb_params) + + def predict(self, *predict_data): + """ + Generates output predictions for the input samples. + + Data could be single tensor, or list of tensor, tuple of tensor. + + Note: + Batch data should be put together in one tensor. + + Args: + predict_data (Tensor): Tensor of predict data. can be array, list or tuple. + + Returns: + Tensor, array(s) of predictions. + + Examples: + >>> input_data = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32) + >>> model = Model(Net()) + >>> model.predict(input_data) + """ + self._predict_network.set_train(False) + check_input_data(*predict_data, data_class=Tensor) + result = self._predict_network(*predict_data) + + check_output_data(result) + return result + + +__all__ = ["Model"] diff --git a/model_zoo/bert_thor/src/thor_for_bert.py b/model_zoo/bert_thor/src/thor_for_bert.py new file mode 100644 index 0000000000..b9e9c46ab4 --- /dev/null +++ b/model_zoo/bert_thor/src/thor_for_bert.py @@ -0,0 +1,422 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""momentum""" +import mindspore.common.dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter +from mindspore.common.parameter import ParameterTuple +from mindspore.common.tensor import Tensor +from mindspore.nn.optim.optimizer import Optimizer +from mindspore.ops import functional as F, composite as C, operations as P + +momentum_opt = C.MultitypeFuncGraph("momentum_opt") + + +@momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment): + """Apply momentum optimizer to the weight parameter using Tensor.""" + success = True + success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) + return success + + +op_add = P.AddN() +apply_decay = C.MultitypeFuncGraph("apply_decay") + + +@apply_decay.register("Number", "Bool", "Tensor", "Tensor") +def _tensor_apply_decay(weight_decay, if_apply, weight, gradient): + """Get grad with weight_decay.""" + if if_apply: + return op_add((weight * weight_decay, gradient)) + return gradient + + +class THOR(Optimizer): + """THOR""" + + def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, + loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03, frequency=10, + decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): + super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) + if isinstance(momentum, float) and momentum < 0.0: + raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) + self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") + self.params = self.parameters + self.moments = self.params.clone(prefix="moments", init='zeros') + self.hyper_map = C.HyperMap() + self.opt = P.ApplyMomentum() + self.matrix_A = ParameterTuple(matrix_A) + self.matrix_G = ParameterTuple(matrix_G) + self.A_inv_max = ParameterTuple(A_inv_max) + self.G_inv_max = ParameterTuple(G_inv_max) + self.matmul = P.MatMul() + self.transpose = P.Transpose() + self.shape = P.Shape() + self.reshape = P.Reshape() + self.mul = P.Mul() + self.gather = P.GatherV2() + self.matrix_A_inv = () + self.matrix_G_inv = () + self.matrix_max_inv = () + self.num_hidden_layers = num_hidden_layers + fc_layer_num = num_hidden_layers * 6 + 5 + for i in range(fc_layer_num): + self.matrix_max_inv = self.matrix_max_inv + ( + Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),) + self.log = P.Log() + self.exp = P.Exp() + self.sqrt = P.Sqrt() + self.matrix_max_inv = ParameterTuple(self.matrix_max_inv) + self.assign = P.Assign() + self.cast = P.Cast() + self.thor = True + self.weight_decay = weight_decay * loss_scale + self.decay_flags = tuple(decay_filter(x) for x in self.parameters) + self.expand = P.ExpandDims() + self.square = P.Square() + self.inv = P.Inv() + self.batch_size = batch_size + self.damping = damping + self.freq = Tensor(frequency, mstype.int32) + self.one = Tensor(1, mstype.int32) + self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) + + def construct(self, gradients): + """construct of THOR""" + params = self.params + moments = self.moments + encoder_layers_num = 16 + if self.thor: + new_grads = () + # process embedding layer + for em_idx in range(3): + g = gradients[em_idx] + matrix_idx = em_idx + temp_a_ori = self.matrix_A[matrix_idx] + temp_a = self.expand(temp_a_ori, 1) + temp_g = self.matrix_G[matrix_idx] + G_max = self.G_inv_max[matrix_idx] + temp_g = self.cast(temp_g, mstype.float32) + matrix_G_inv_max = self.log(G_max) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + g = self.mul(temp_a, g) + g = self.cast(g, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.matmul(g, temp_g) + g = self.cast(g, mstype.float32) + g = self.mul(g, G_max) + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a_ori) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], G_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g,) + # process bert_embedding_postprocessor.layernorm + grad_idx = 3 + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.one + damping = self.sqrt(damping_step) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + + for i in range(self.num_hidden_layers): + encoder_begin_idx = encoder_layers_num * i + 5 + for j in range(0, encoder_layers_num, 2): + grad_idx = encoder_begin_idx + j + if j in (8, 14): + # process layernorm layer + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + else: + g = gradients[grad_idx] + offset_idx = 0 + if j in (0, 2, 4, 6): + offset_idx = j // 2 + elif j in (10, 12): + offset_idx = j // 2 - 1 + matrix_idx = 6 * i + offset_idx + 3 + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g,) + new_grads = new_grads + (gradients[grad_idx + 1],) + + # process pooler layer + pooler_layer_idx = encoder_layers_num * self.num_hidden_layers + 5 + matrix_idx = self.num_hidden_layers * 6 + 3 + g = gradients[pooler_layer_idx] + pooler_bias = gradients[pooler_layer_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g, pooler_bias) + + # for cls1 fc layer: mlm + mlm_fc_idx = encoder_layers_num * self.num_hidden_layers + 8 + matrix_idx = self.num_hidden_layers * 6 + 4 + g = gradients[mlm_fc_idx] + mlm_bias = gradients[mlm_fc_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (gradients[mlm_fc_idx - 1],) + new_grads = new_grads + (g, mlm_bias) + # add bert.cls1.layernorm grad + begin_idx = mlm_fc_idx + 2 + end_idx = mlm_fc_idx + 4 + new_grads = new_grads + gradients[begin_idx: end_idx] + lenth = len(gradients) + new_grads = new_grads + gradients[lenth - 2: lenth] + gradients = new_grads + else: + new_grads = () + # process embedding layer + for em_idx in range(3): + g = gradients[em_idx] + matrix_idx = em_idx + temp_a = self.matrix_A[matrix_idx] + temp_a = self.expand(temp_a, 1) + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + g = self.mul(temp_a, g) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + g = self.matmul(g, temp_g) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g,) + # process bert_embedding_postprocessor.layernorm + grad_idx = 3 + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.one + damping = self.sqrt(damping_step) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + + for i in range(self.num_hidden_layers): + encoder_begin_idx = encoder_layers_num * i + 5 + for j in range(0, encoder_layers_num, 2): + grad_idx = encoder_begin_idx + j + if j in (8, 14): + # process layernorm layer + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + else: + g = gradients[grad_idx] + offset_idx = 0 + if j in (0, 2, 4, 6): + offset_idx = j // 2 + elif j in (10, 12): + offset_idx = j // 2 - 1 + matrix_idx = 6 * i + offset_idx + 3 + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g,) + new_grads = new_grads + (gradients[grad_idx + 1],) + + # process pooler layer + pooler_layer_idx = encoder_layers_num * self.num_hidden_layers + 5 + matrix_idx = self.num_hidden_layers * 6 + 3 + g = gradients[pooler_layer_idx] + pooler_bias = gradients[pooler_layer_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g, pooler_bias) + + # for cls1 fc layer: mlm + mlm_fc_idx = encoder_layers_num * self.num_hidden_layers + 8 + matrix_idx = self.num_hidden_layers * 6 + 4 + g = gradients[mlm_fc_idx] + mlm_bias = gradients[mlm_fc_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + # add bert.cls1.output_bias grad + new_grads = new_grads + (gradients[mlm_fc_idx - 1],) + new_grads = new_grads + (g, mlm_bias) + # add bert.cls1.layernorm grad + begin_idx = mlm_fc_idx + 2 + end_idx = mlm_fc_idx + 4 + new_grads = new_grads + gradients[begin_idx: end_idx] + lenth = len(gradients) + new_grads = new_grads + gradients[lenth - 2: lenth] + gradients = new_grads + + if self.weight_decay > 0: + gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, + params, gradients) + gradients = self.scale_grad(gradients) + lr = self.get_lr() + success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments) + return success diff --git a/model_zoo/bert_thor/src/thor_for_bert_arg.py b/model_zoo/bert_thor/src/thor_for_bert_arg.py new file mode 100644 index 0000000000..8cb56d1f70 --- /dev/null +++ b/model_zoo/bert_thor/src/thor_for_bert_arg.py @@ -0,0 +1,429 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""momentum""" +import mindspore.common.dtype as mstype +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter +from mindspore.common.parameter import ParameterTuple +from mindspore.common.tensor import Tensor +from mindspore.nn.optim.optimizer import Optimizer +from mindspore.ops import functional as F, composite as C, operations as P +from mindspore.parallel._utils import _get_device_num, _get_mirror_mean +from .grad_reducer_thor1 import DistributedGradReducerThor1 + +momentum_opt = C.MultitypeFuncGraph("momentum_opt") + + +@momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") +def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, moment): + """Apply momentum optimizer to the weight parameter using Tensor.""" + success = True + success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) + return success + + +op_add = P.AddN() +apply_decay = C.MultitypeFuncGraph("apply_decay") + + +@apply_decay.register("Number", "Bool", "Tensor", "Tensor") +def _tensor_apply_decay(weight_decay, if_apply, weight, gradient): + """Get grad with weight_decay.""" + if if_apply: + return op_add((weight * weight_decay, gradient)) + return gradient + + +class THOR(Optimizer): + """THOR""" + + def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, + loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03, frequency=10, + decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): + super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) + if isinstance(momentum, float) and momentum < 0.0: + raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) + self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") + self.params = self.parameters + self.moments = self.params.clone(prefix="moments", init='zeros') + self.hyper_map = C.HyperMap() + self.opt = P.ApplyMomentum() + self.matrix_A = ParameterTuple(matrix_A) + self.matrix_G = ParameterTuple(matrix_G) + self.A_inv_max = ParameterTuple(A_inv_max) + self.G_inv_max = ParameterTuple(G_inv_max) + self.matmul = P.MatMul() + self.transpose = P.Transpose() + self.shape = P.Shape() + self.reshape = P.Reshape() + self.mul = P.Mul() + self.gather = P.GatherV2() + self.matrix_A_inv = () + self.matrix_G_inv = () + self.matrix_max_inv = () + self.num_hidden_layers = num_hidden_layers + fc_layer_num = num_hidden_layers * 6 + 5 + for i in range(fc_layer_num): + self.matrix_max_inv = self.matrix_max_inv + ( + Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),) + self.log = P.Log() + self.exp = P.Exp() + self.sqrt = P.Sqrt() + self.matrix_max_inv = ParameterTuple(self.matrix_max_inv) + self.assign = P.Assign() + self.cast = P.Cast() + self.thor = True + self.weight_decay = weight_decay * loss_scale + self.decay_flags = tuple(decay_filter(x) for x in self.parameters) + self.expand = P.ExpandDims() + self.square = P.Square() + self.inv = P.Inv() + self.batch_size = batch_size + self.damping = damping + self.freq = Tensor(frequency, mstype.int32) + self.one = Tensor(1, mstype.int32) + self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) + mean = _get_mirror_mean() + degree = _get_device_num() + self.grad_reducer_g = DistributedGradReducerThor1(self.parameters, 3, mean, degree) + + def construct(self, gradients): + """construct of THOR""" + params = self.params + moments = self.moments + encoder_layers_num = 16 + if self.thor: + new_grads = () + # process embedding layer + for em_idx in range(3): + g = gradients[em_idx] + matrix_idx = em_idx + temp_a_ori = self.matrix_A[matrix_idx] + temp_a = self.expand(temp_a_ori, 1) + temp_g = self.matrix_G[matrix_idx] + G_max = self.G_inv_max[matrix_idx] + temp_g = self.cast(temp_g, mstype.float32) + matrix_G_inv_max = self.log(G_max) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + g = self.mul(temp_a, g) + g = self.cast(g, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.matmul(g, temp_g) + g = self.cast(g, mstype.float32) + g = self.mul(g, G_max) + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a_ori) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], G_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g,) + # process bert_embedding_postprocessor.layernorm + grad_idx = 3 + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.one + damping = self.sqrt(damping_step) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + + for i in range(self.num_hidden_layers): + encoder_begin_idx = encoder_layers_num * i + 5 + for j in range(0, encoder_layers_num, 2): + grad_idx = encoder_begin_idx + j + if j in (8, 14): + # process layernorm layer + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + else: + g = gradients[grad_idx] + offset_idx = 0 + if j in (0, 2, 4, 6): + offset_idx = j // 2 + elif j in (10, 12): + offset_idx = j // 2 - 1 + matrix_idx = 6 * i + offset_idx + 3 + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g,) + new_grads = new_grads + (gradients[grad_idx + 1],) + + # process pooler layer + pooler_layer_idx = encoder_layers_num * self.num_hidden_layers + 5 + matrix_idx = self.num_hidden_layers * 6 + 3 + g = gradients[pooler_layer_idx] + pooler_bias = gradients[pooler_layer_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (g, pooler_bias) + + # for cls1 fc layer: mlm + mlm_fc_idx = encoder_layers_num * self.num_hidden_layers + 8 + matrix_idx = self.num_hidden_layers * 6 + 4 + g = gradients[mlm_fc_idx] + mlm_bias = gradients[mlm_fc_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + temp_a = self.cast(temp_a, mstype.float32) + temp_g = self.cast(temp_g, mstype.float32) + matrix_A_inv_max = self.log(self.A_inv_max[matrix_idx]) + matrix_A_inv_max = self.mul(matrix_A_inv_max, -1) + matrix_A_inv_max = self.exp(matrix_A_inv_max) + temp_a = self.mul(temp_a, matrix_A_inv_max) + matrix_G_inv_max = self.log(self.G_inv_max[matrix_idx]) + matrix_G_inv_max = self.mul(matrix_G_inv_max, -1) + matrix_G_inv_max = self.exp(matrix_G_inv_max) + temp_g = self.mul(temp_g, matrix_G_inv_max) + temp_max = self.mul(self.A_inv_max[matrix_idx], self.G_inv_max[matrix_idx]) + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, temp_max) + + fake_A = self.assign(self.matrix_A[matrix_idx], temp_a) + fake_G = self.assign(self.matrix_G[matrix_idx], temp_g) + fake_max = self.assign(self.matrix_max_inv[matrix_idx], temp_max) + g = F.depend(g, fake_A) + g = F.depend(g, fake_G) + g = F.depend(g, fake_max) + new_grads = new_grads + (gradients[mlm_fc_idx - 1],) + new_grads = new_grads + (g, mlm_bias) + # add bert.cls1.layernorm grad + begin_idx = mlm_fc_idx + 2 + end_idx = mlm_fc_idx + 4 + new_grads = new_grads + gradients[begin_idx: end_idx] + lenth = len(gradients) + new_grads = new_grads + gradients[lenth - 2: lenth] + gradients = new_grads + gradients = self.grad_reducer_g(gradients) + else: + new_grads = () + # process embedding layer + for em_idx in range(3): + g = gradients[em_idx] + matrix_idx = em_idx + temp_a = self.matrix_A[matrix_idx] + temp_a = self.expand(temp_a, 1) + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + g = self.mul(temp_a, g) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + g = self.matmul(g, temp_g) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g,) + # process bert_embedding_postprocessor.layernorm + grad_idx = 3 + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.one + damping = self.sqrt(damping_step) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + + for i in range(self.num_hidden_layers): + encoder_begin_idx = encoder_layers_num * i + 5 + for j in range(0, encoder_layers_num, 2): + grad_idx = encoder_begin_idx + j + if j in (8, 14): + # process layernorm layer + beta_grad = gradients[grad_idx] + gamma_grad = gradients[grad_idx + 1] + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + beta = self.square(beta_grad) + beta_cov = self.mul(beta, 1.0 / normalizer) + beta_cov = beta_cov + damping + beta_inv = self.inv(beta_cov) + gamma = self.square(gamma_grad) + gamma_cov = self.mul(gamma, 1.0 / normalizer) + gamma_cov = gamma_cov + damping + gamma_inv = self.inv(gamma_cov) + beta = self.mul(beta_inv, beta_grad) + gamma = self.mul(gamma_inv, gamma_grad) + new_grads = new_grads + (beta, gamma) + else: + g = gradients[grad_idx] + offset_idx = 0 + if j in (0, 2, 4, 6): + offset_idx = j // 2 + elif j in (10, 12): + offset_idx = j // 2 - 1 + matrix_idx = 6 * i + offset_idx + 3 + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g,) + new_grads = new_grads + (gradients[grad_idx + 1],) + + # process pooler layer + pooler_layer_idx = encoder_layers_num * self.num_hidden_layers + 5 + matrix_idx = self.num_hidden_layers * 6 + 3 + g = gradients[pooler_layer_idx] + pooler_bias = gradients[pooler_layer_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + new_grads = new_grads + (g, pooler_bias) + + # for cls1 fc layer: mlm + mlm_fc_idx = encoder_layers_num * self.num_hidden_layers + 8 + matrix_idx = self.num_hidden_layers * 6 + 4 + g = gradients[mlm_fc_idx] + mlm_bias = gradients[mlm_fc_idx + 1] + temp_a = self.matrix_A[matrix_idx] + temp_g = self.matrix_G[matrix_idx] + matrix_max = self.matrix_max_inv[matrix_idx] + temp_a = self.cast(temp_a, mstype.float16) + temp_g = self.cast(temp_g, mstype.float16) + g = self.cast(g, mstype.float16) + + g = self.matmul(temp_g, g) + g = self.matmul(g, temp_a) + g = self.cast(g, mstype.float32) + g = self.mul(g, matrix_max) + # add bert.cls1.output_bias grad + new_grads = new_grads + (gradients[mlm_fc_idx - 1],) + new_grads = new_grads + (g, mlm_bias) + # add bert.cls1.layernorm grad + begin_idx = mlm_fc_idx + 2 + end_idx = mlm_fc_idx + 4 + new_grads = new_grads + gradients[begin_idx: end_idx] + lenth = len(gradients) + new_grads = new_grads + gradients[lenth - 2: lenth] + gradients = new_grads + gradients = self.grad_reducer_g(gradients) + + if self.weight_decay > 0: + gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags, + params, gradients) + gradients = self.scale_grad(gradients) + lr = self.get_lr() + success = self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum), gradients, params, moments) + return success diff --git a/model_zoo/bert_thor/src/thor_layer.py b/model_zoo/bert_thor/src/thor_layer.py new file mode 100644 index 0000000000..8f9e0c0759 --- /dev/null +++ b/model_zoo/bert_thor/src/thor_layer.py @@ -0,0 +1,304 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""thor_layer""" +import numpy as np + +import mindspore.common.dtype as mstype +from mindspore._checkparam import check_bool, check_int_positive +from mindspore.common.initializer import TruncatedNormal, initializer +from mindspore.common.parameter import Parameter +from mindspore.common.tensor import Tensor +from mindspore.nn.cell import Cell +from mindspore.nn.layer.activation import get_activation +from mindspore.ops import operations as P + + +class Embedding_Thor(Cell): + """ + A embeddings lookup table with a fixed dictionary and size. + + Args: + vocab_size (int): Size of the dictionary of embeddings. + embedding_size (int): The size of each embedding vector. + embedding_shape (list): [batch_size, seq_length, embedding_size], the shape of + each embedding vector. + use_one_hot_embeddings (bool): Specifies whether to use one hot encoding form. Default: False. + initializer_range (float): Initialization value of TruncatedNormal. Default: 0.02. + """ + + def __init__(self, + vocab_size, + embedding_size, + embedding_shape, + use_one_hot_embeddings=False, + initializer_range=0.02, + name='embedding_table', + is_expand=False, + batch_size=12, + damping=0.03, + loss_scale=1, + frequency=10, + ): + super(Embedding_Thor, self).__init__() + self.vocab_size = vocab_size + self.use_one_hot_embeddings = use_one_hot_embeddings + self.embedding_table = Parameter(initializer + (TruncatedNormal(initializer_range), + [vocab_size, embedding_size]), + name=name) + self.thor = True + self.is_expand = is_expand + self.expand = P.ExpandDims() + self.shape_flat = (-1,) + self.gather = P.GatherV2() + self.one_hot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.array_mul = P.MatMul() + self.reshape = P.Reshape() + self.em_shape = tuple(embedding_shape) + self.shape = P.Shape() + self.loss_scale = Tensor(1 / loss_scale, mstype.float16) + self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)), name='matrix_A_inv', + requires_grad=False) + self.matrix_G_inv = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)), + name="matrix_G_inv", requires_grad=False) + self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) + self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) + self.fused_abs_max = P.CusFusedAbsMax1() + self.fake_G = Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)) + self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32)) + self.dampingG = Tensor(np.identity(embedding_size), mstype.float32) + self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) + self.freq = Tensor(frequency, mstype.int32) + self.axis = 0 + self.damping = damping + self.gather = P.GatherV2() + self.sqrt = P.Sqrt() + self.mul = P.Mul() + self.cast = P.Cast() + self.cube_matmul = P.CusMatMulCube(transpose_a=True) + self.vector_matmul = P.CusBatchMatMul() + self.cholesky = P.CusCholeskyTrsm() + self.matrix_combine = P.CusMatrixCombine() + self.reduce_sum = P.ReduceSum(keep_dims=False) + self.inv = P.Inv() + self.getG = P.InsertGradientOf(self.save_gradient) + self.batch_size = batch_size + + def save_gradient(self, dout): + """save_gradient""" + bs = self.batch_size + bs = self.cast(bs, mstype.float32) + out = dout + dout = self.mul(dout, self.loss_scale) + dout = self.mul(dout, bs) + shape = self.shape(dout) + normalizer = self.cast(shape[0], mstype.float32) + matrix_G = self.cube_matmul(dout, dout) + matrix_G = self.mul(matrix_G, 1.0 / normalizer) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.freq + damping = self.sqrt(damping_step) + dampingG = self.cast(self.dampingG, mstype.float32) + matrix_G = matrix_G + damping * dampingG + matrix_G_inv = self.cholesky(matrix_G) + matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv) + matrix_G_inv_max = self.fused_abs_max(matrix_G_inv) + matrix_G_inv_max = self.fused_abs_max(matrix_G_inv_max) + self.G_inv_max = matrix_G_inv_max + matrix_G_inv = self.matrix_combine(matrix_G_inv) + matrix_G_inv = self.cast(matrix_G_inv, mstype.float16) + self.matrix_G_inv = matrix_G_inv + return out + + def construct(self, input_ids): + """construct of Embedding_Thor""" + if self.is_expand: + input_ids = self.expand(input_ids, -1) + flat_ids = self.reshape(input_ids, self.shape_flat) + if self.use_one_hot_embeddings: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table) + else: + if self.thor: + one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value) + matrix_A = self.reduce_sum(one_hot_ids, 0) + normalizer = self.batch_size + normalizer = self.cast(normalizer, mstype.float32) + matrix_A = self.mul(matrix_A, 1.0 / normalizer) + damping_step = self.gather(self.damping, self.cov_step, self.axis) + damping_step = self.cast(damping_step, mstype.float32) + damping = self.sqrt(damping_step) + dampingA = self.cast(self.dampingA, mstype.float32) + matrix_A = matrix_A + damping * dampingA + matrix_A_inv = self.inv(matrix_A) + self.matrix_A_inv = matrix_A_inv + self.matrix_G_inv = self.fake_G + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + output_for_reshape = self.getG(output_for_reshape) + else: + output_for_reshape = self.gather(self.embedding_table, flat_ids, 0) + + output = self.reshape(output_for_reshape, self.em_shape) + return output, self.embedding_table + + +class Dense_Thor(Cell): + """Dense_Thor""" + + # @cell_attr_register(attrs=['has_bias', 'activation', 'in_channels', 'out_channels']) + def __init__(self, + in_channels, + out_channels, + weight_init='normal', + bias_init='zeros', + damping=0.03, + loss_scale=1, + frequency=10, + has_bias=False, + activation=None, + batch_size=12): + super(Dense_Thor, self).__init__() + self.in_channels = check_int_positive(in_channels) + self.out_channels = check_int_positive(out_channels) + self.has_bias = check_bool(has_bias) + self.thor = True + if isinstance(weight_init, Tensor): + if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ + weight_init.shape()[1] != in_channels: + raise ValueError("weight_init shape error") + + self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") + + if self.has_bias: + if isinstance(bias_init, Tensor): + if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + raise ValueError("bias_init shape error") + + self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") + + self.matmul = P.MatMul(transpose_b=True) + self.bias_add = P.BiasAdd() + + self.activation = get_activation(activation) + self.activation_flag = self.activation is not None + self.matrix_A_inv = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float16)), + name='matrix_A_inv', requires_grad=False) + self.matrix_G_inv = Parameter(Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)), + name="matrix_G_inv", requires_grad=False) + self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) + self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) + self.fused_abs_max = P.CusFusedAbsMax1() + self.fake_G = Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)) + + self.matmul = P.MatMul(transpose_b=True) + self.cube_matmul = P.CusMatMulCube(transpose_a=True) + self.matrix_combine = P.CusMatrixCombine() + self.cholesky = P.CusCholeskyTrsm() + self.shape = P.Shape() + self.reshape = P.Reshape() + self.transpose = P.Transpose() + self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) + self.mul = P.Mul() + self.cast = P.Cast() + self.damping = damping + self.loss_scale = Tensor(1 / loss_scale, mstype.float16) + self.vector_matmul = P.CusBatchMatMul() + self.gather = P.GatherV2() + self.assignadd = P.AssignAdd() + self.freq = Tensor(frequency, mstype.int32) + self.axis = 0 + self.abs = P.Abs() + self.reduce_max = P.ReduceMax(keep_dims=False) + self.log = P.Log() + self.exp = P.Exp() + self.dampingA = Tensor(np.identity(in_channels), mstype.float32) + self.dampingG = Tensor(np.identity(out_channels), mstype.float32) + self.sqrt = P.Sqrt() + self.getG = P.InsertGradientOf(self.save_gradient) + self.batch_size = batch_size + + def save_gradient(self, dout): + """save_gradient""" + bs = self.cast(self.batch_size, mstype.float32) + out = dout + dout = self.mul(dout, self.loss_scale) + dout = self.mul(dout, bs) + shape = self.shape(dout) + normalizer = self.cast(shape[0], mstype.float32) + matrix_G = self.cube_matmul(dout, dout) + matrix_G = self.mul(matrix_G, 1.0 / normalizer) + damping_step = self.gather(self.damping, self.cov_step, 0) + damping_step = self.cast(damping_step, mstype.float32) + self.cov_step = self.cov_step + self.freq + damping = self.sqrt(damping_step) + dampingG = self.cast(self.dampingG, mstype.float32) + matrix_G = matrix_G + damping * dampingG + matrix_G_inv = self.cholesky(matrix_G) + matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv) + matrix_G_inv_max = self.fused_abs_max(matrix_G_inv) + matrix_G_inv_max = self.fused_abs_max(matrix_G_inv_max) + self.G_inv_max = matrix_G_inv_max + matrix_G_inv = self.matrix_combine(matrix_G_inv) + matrix_G_inv = self.cast(matrix_G_inv, mstype.float16) + self.matrix_G_inv = matrix_G_inv + return out + + def construct(self, x): + """construct""" + if self.thor: + inputs = self.cube_matmul(x, x) + shape = self.shape(x) + normalizer = self.cast(shape[0], mstype.float32) + matrix_A = self.mul(inputs, 1.0 / normalizer) + + damping_step = self.gather(self.damping, self.cov_step, self.axis) + damping_step = self.cast(damping_step, mstype.float32) + damping = self.sqrt(damping_step) + dampingA = self.cast(self.dampingA, mstype.float32) + matrix_A = matrix_A + damping * dampingA + matrix_A_inv = self.cholesky(matrix_A) + matrix_A_inv = self.vector_matmul(matrix_A_inv, matrix_A_inv) + matrix_A_inv_max = self.fused_abs_max(matrix_A_inv) + matrix_A_inv_max = self.fused_abs_max(matrix_A_inv_max) + self.A_inv_max = matrix_A_inv_max + matrix_A_inv = self.matrix_combine(matrix_A_inv) + matrix_A_inv = self.cast(matrix_A_inv, mstype.float16) + self.matrix_A_inv = matrix_A_inv + self.matrix_G_inv = self.fake_G + output = self.matmul(x, self.weight) + output = self.getG(output) + else: + output = self.matmul(x, self.weight) + + if self.has_bias: + output = self.bias_add(output, self.bias) + if self.activation_flag: + return self.activation(output) + return output + + def extend_repr(self): + """extend_repr""" + str_info = 'in_channels={}, out_channels={}, weight={}, has_bias={}' \ + .format(self.in_channels, self.out_channels, self.weight, self.has_bias) + if self.has_bias: + str_info = str_info + ', bias={}'.format(self.bias) + + if self.activation_flag: + str_info = str_info + ', activation={}'.format(self.activation) + + return str_info diff --git a/model_zoo/bert_thor/src/utils.py b/model_zoo/bert_thor/src/utils.py new file mode 100644 index 0000000000..9366f71246 --- /dev/null +++ b/model_zoo/bert_thor/src/utils.py @@ -0,0 +1,169 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Functional Cells used in Bert finetune and evaluation. +""" + +import os +import time + +import numpy as np +from src.config import cfg + +import mindspore.nn as nn +from mindspore.common import dtype as mstype +from mindspore.common.tensor import Tensor +from mindspore.nn.learning_rate_schedule import LearningRateSchedule, PolynomialDecayLR, WarmUpLR +from mindspore.ops import operations as P +from mindspore.train.callback import Callback + + +class CrossEntropyCalculation(nn.Cell): + """ + Cross Entropy loss + """ + + def __init__(self, is_training=True): + super(CrossEntropyCalculation, self).__init__() + self.onehot = P.OneHot() + self.on_value = Tensor(1.0, mstype.float32) + self.off_value = Tensor(0.0, mstype.float32) + self.reduce_sum = P.ReduceSum() + self.reduce_mean = P.ReduceMean() + self.reshape = P.Reshape() + self.last_idx = (-1,) + self.neg = P.Neg() + self.cast = P.Cast() + self.is_training = is_training + + def construct(self, logits, label_ids, num_labels): + if self.is_training: + label_ids = self.reshape(label_ids, self.last_idx) + one_hot_labels = self.onehot(label_ids, num_labels, self.on_value, self.off_value) + per_example_loss = self.neg(self.reduce_sum(one_hot_labels * logits, self.last_idx)) + loss = self.reduce_mean(per_example_loss, self.last_idx) + return_value = self.cast(loss, mstype.float32) + else: + return_value = logits * 1.0 + return return_value + + +def make_directory(path: str): + """Make directory.""" + if path is None or not isinstance(path, str) or path.strip() == "": + logger.error("The path(%r) is invalid type.", path) + raise TypeError("Input path is invaild type") + + # convert the relative paths + path = os.path.realpath(path) + logger.debug("The abs path is %r", path) + + # check the path is exist and write permissions? + if os.path.exists(path): + real_path = path + else: + # All exceptions need to be caught because create directory maybe have some limit(permissions) + logger.debug("The directory(%s) doesn't exist, will create it", path) + try: + os.makedirs(path, exist_ok=True) + real_path = path + except PermissionError as e: + logger.error("No write permission on the directory(%r), error = %r", path, e) + raise TypeError("No write permission on the directory.") + return real_path + + +class LossCallBack(Callback): + """ + Monitor the loss in training. + If the loss in NAN or INF terminating training. + Note: + if per_print_times is 0 do not print loss. + Args: + per_print_times (int): Print loss every times. Default: 1. + """ + + def __init__(self, per_print_times=1): + super(LossCallBack, self).__init__() + if not isinstance(per_print_times, int) or per_print_times < 0: + raise ValueError("print_step must be int and >= 0") + self._per_print_times = per_print_times + self.step_start_time = time.time() + + def step_begin(self, run_context): + self.step_start_time = time.time() + + def step_end(self, run_context): + cb_params = run_context.original_args() + step_time_span = time.time() - self.step_start_time + total_time_span = step_time_span + cur_step_num = cb_params.cur_step_num + if cur_step_num % cfg.Thor.frequency == 0: + step_time_span = step_time_span / (cfg.Thor.frequency - 1) + print("epoch: {}, step: {}, outputs are {}, total_time_span is {}, step_time_span is {}".format( + cb_params.cur_epoch_num, cb_params.cur_step_num, + str(cb_params.net_outputs), total_time_span, step_time_span)) + + +def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix): + """ + Find the ckpt finetune generated and load it into eval network. + """ + files = os.listdir(load_finetune_checkpoint_dir) + pre_len = len(prefix) + max_num = 0 + for filename in files: + name_ext = os.path.splitext(filename) + if name_ext[-1] != ".ckpt": + continue + # steps_per_epoch = ds.get_dataset_size() + if filename.find(prefix) == 0 and not filename[pre_len].isalpha(): + index = filename[pre_len:].find("-") + if index == 0 and max_num == 0: + load_finetune_checkpoint_path = os.path.join(load_finetune_checkpoint_dir, filename) + elif index not in (0, -1): + name_split = name_ext[-2].split('_') + if (steps_per_epoch != int(name_split[len(name_split) - 1])) \ + or (epoch_num != int(filename[pre_len + index + 1:pre_len + index + 2])): + continue + num = filename[pre_len + 1:pre_len + index] + if int(num) > max_num: + max_num = int(num) + load_finetune_checkpoint_path = os.path.join(load_finetune_checkpoint_dir, filename) + return load_finetune_checkpoint_path + + +class BertLearningRate(LearningRateSchedule): + """ + Warmup-decay learning rate for Bert network. + """ + + def __init__(self, learning_rate, end_learning_rate, warmup_steps, decay_steps, power): + super(BertLearningRate, self).__init__() + self.warmup_lr = WarmUpLR(learning_rate, warmup_steps) + self.decay_lr = PolynomialDecayLR(learning_rate, end_learning_rate, decay_steps, power) + self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) + + self.greater = P.Greater() + self.one = Tensor(np.array([1.0]).astype(np.float32)) + self.cast = P.Cast() + + def construct(self, global_step): + is_warmup = self.cast(self.greater(self.warmup_steps, global_step), mstype.float32) + warmup_lr = self.warmup_lr(global_step) + decay_lr = self.decay_lr(global_step) + lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr + return lr