diff --git a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
index a2c524bfdc..e0150316a7 100644
--- a/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
+++ b/mindspore/lite/src/runtime/agent/npu/npu_executor.cc
@@ -38,7 +38,24 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) {
   return RET_OK;
 }
 
-bool IsSameShapeTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
+std::vector<int> GetNpuTensorShape(int dim, std::shared_ptr<hiai::AiTensor> npu_tensor) {
+  std::vector<int> npu_shape;
+  if (dim > 0) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetNumber());
+  }
+  if (dim > 1) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetChannel());
+  }
+  if (dim > 2) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetHeight());
+  }
+  if (dim > 3) {
+    npu_shape.push_back(npu_tensor->GetTensorDimension().GetWidth());
+  }
+  return npu_shape;
+}
+
+bool IsSameShapeInTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
   if (tensor->shape().size() > 4) {
     MS_LOG(ERROR) << "Npu does not support input tensor dims greater than 4";
     return false;
@@ -49,18 +66,15 @@ bool IsSameShapeTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
     return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() &&
            tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() &&
            tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() &&
            tensor->Width() == npu_tensor->GetTensorDimension().GetWidth();
   }
-  std::vector<int> npu_shape;
-  auto dim = tensor->shape().size();
-  if (dim > 0) {
-    npu_shape.push_back(npu_tensor->GetTensorDimension().GetNumber());
-  }
-  if (dim > 1) {
-    npu_shape.push_back(npu_tensor->GetTensorDimension().GetChannel());
-  }
-  if (dim > 2) {
-    npu_shape.push_back(npu_tensor->GetTensorDimension().GetHeight());
+  return GetNpuTensorShape(tensor->shape().size(), npu_tensor) == tensor->shape();
+}
+
+bool IsSameShapeOutTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
+  if (tensor->shape().size() > 4) {
+    MS_LOG(ERROR) << "Npu does not support output tensor dims greater than 4";
+    return false;
   }
-  return npu_shape == tensor->shape();
+  return GetNpuTensorShape(tensor->shape().size(), npu_tensor) == tensor->shape();
 }
@@ -72,10 +86,10 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
   for (int i = 0; i < npu_input_tensors_.size(); ++i) {
     int index = 0;
     for (; index < in_tensors.size(); index++) {
-      if (!inputs_visited[index] && IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) {
+      if (!inputs_visited[index] && IsSameShapeInTensor(in_tensors[index], npu_input_tensors_[i])) {
         void *data = in_tensors[index]->data_c();
         if (data == nullptr) {
-          MS_LOG(ERROR) << model_name_ << " Inputs data is nullptr";
+          MS_LOG(ERROR) << "For " << model_name_ << ", the " << i << "th input data is nullptr";
           return RET_ERROR;
         }
 
@@ -106,14 +120,28 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
     return RET_ERROR;
   }
 
+  std::vector<bool> outputs_visited(out_tensors.size(), false);
   for (int i = 0; i < npu_output_tensors_.size(); ++i) {
-    void *data = out_tensors[i]->MutableData();
-    if (data == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
+    int index = 0;
+    for (; index < out_tensors.size(); index++) {
+      if (!outputs_visited[index] && IsSameShapeOutTensor(out_tensors[index], npu_output_tensors_[i])) {
+        void *data = out_tensors[index]->MutableData();
+        if (data == nullptr) {
+          MS_LOG(ERROR) << "For " << model_name_ << ", the " << i << "th output data is nullptr";
+          return RET_ERROR;
+        }
+
+        memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
+        out_tensors[index]->ResetRefCount();
+        outputs_visited[index] = true;
+        break;
+      }
     }
-    memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
-    out_tensors[i]->ResetRefCount();
+    if (index == out_tensors.size()) {
+      MS_LOG(ERROR) << "Can't find corresponding ms lite tensor of " << i << " output tensor for npu executor "
+                    << model_name_;
+      return RET_ERROR;
+    }
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc
index bbfb293d88..300439c948 100644
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc
@@ -82,7 +82,7 @@ int NPUTransformPass::InsertPostNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels) {
   auto nhwc_shape = kernel->out_tensors()[0]->shape();
   std::vector<int> nchw_shape = {nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]};
   auto tensor =
-      new (std::nothrow) Tensor(kernel->out_tensors()[0]->data_type(), nchw_shape, schema::Format_NHWC, Tensor::VAR);
+      new (std::nothrow) Tensor(kernel->out_tensors()[0]->data_type(), nchw_shape, schema::Format_NCHW, Tensor::VAR);
   if (tensor == nullptr) {
     MS_LOG(ERROR) << "New nchw tensor failed when inserting post nchw2nhwc kernel.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/npu/scale_npu.cc b/mindspore/lite/src/runtime/kernel/npu/scale_npu.cc
index 79d86d2260..b31b9034e8 100644
--- a/mindspore/lite/src/runtime/kernel/npu/scale_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/scale_npu.cc
@@ -25,7 +25,7 @@ namespace mindspore::kernel {
 int ScaleNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                               OpParameter *opParameter) {
   if (scale_parameter_->axis_ < 0) {
-    scale_parameter_->axis_ = scale_parameter_->axis_ + inputs.size();
+    scale_parameter_->axis_ = scale_parameter_->axis_ + inputs[0]->shape().size();
   }
   if (scale_parameter_->axis_ != 1) {
     MS_LOG(ERROR) << "Npu scale axis attr only support 1, now is " << scale_parameter_->axis_;