[Cherry-pick][XPU/2.12] cherry-pick (#9601)
* [XPU] add seq_softmax, seq_expand, lod_reset op in xpu (#9453)

* [XPU] always set padding_idx to -1 for XPUEmbeddingWithEltwiseAddCompute (#9465)

* [XPU] Fix error when running arm target on xpu (#9466)

* [XPU] support matmul/matmul_v2's y_trans=true in __xpu__fc__op (#9427)

Co-authored-by: AlbertVan <[email protected]>
Co-authored-by: linwei210 <[email protected]>
3 people authored Nov 1, 2022
1 parent 433796c commit ce48e33
Showing 24 changed files with 587 additions and 101 deletions.
55 changes: 35 additions & 20 deletions lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc
@@ -36,7 +36,27 @@ class XPUFcFuser : public FuseBase {
   void BuildPattern() override {
     auto* x = VarNode("x")->assert_is_op_input(mul_type_, "X")->AsInput();
     auto* W = VarNode("W")->assert_is_op_input(mul_type_, "Y")->AsInput();
-    auto* mul = OpNode("mul", mul_type_)->AsIntermediate();
+    // check w.dims() == 2 and x.trans == false.
+    auto mul_input_check = [this](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      auto mul_input_y_name = op_desc.Input("Y").front();
+      auto* scope = const_cast<Node*>(node)->AsStmt().op()->scope();
+      auto mul_y_shape = scope->FindMutableTensor(mul_input_y_name)->dims();
+      if (this->mul_type_ == "mul") {
+        return mul_y_shape.size() == 2;
+      }
+      if (this->mul_type_ == "matmul") {
+        return (mul_y_shape.size() == 2 &&
+                !op_desc.GetAttr<bool>("transpose_X"));
+      }
+      if (this->mul_type_ == "matmul_v2") {
+        return (mul_y_shape.size() == 2 && !op_desc.GetAttr<bool>("trans_x"));
+      }
+      return false;
+    };
+    auto* mul = OpNode("mul", mul_type_)
+                    ->AsIntermediate()
+                    ->assert_node_satisfied(mul_input_check);
     auto* mul_out = VarNode("mul_out")->assert_is_op_output(mul_type_, "Out");
     PMNode* bias = nullptr;
     PMNode* add = nullptr;
@@ -194,25 +214,20 @@ class XPUFcFuser : public FuseBase {

     op_desc.SetAttr<int>("in_num_col_dims", -1);
     if (mul_type_ == "mul") {
-      op_desc.SetAttr(
-          "in_num_col_dims",
-          matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
+      op_desc.SetAttr("in_num_col_dims",
+                      op_info->GetAttr<int>("x_num_col_dims"));
+      // trans_x and trans_y do not exist in the mul op; set them to false.
       op_desc.SetAttr("transpose_x", false);
-      op_desc.SetAttr("transpose_w", true);
+      op_desc.SetAttr("transpose_w", false);
     } else if (mul_type_ == "matmul") {
-      op_desc.SetAttr(
-          "transpose_x",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("transpose_X"));
-      op_desc.SetAttr(
-          "transpose_w",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("transpose_Y"));
+      op_desc.SetAttr("transpose_x", op_info->GetAttr<bool>("transpose_X"));
+      op_desc.SetAttr("transpose_w", op_info->GetAttr<bool>("transpose_Y"));
     } else {
-      op_desc.SetAttr(
-          "transpose_x",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("trans_x"));
-      op_desc.SetAttr(
-          "transpose_w",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("trans_y"));
+      op_desc.SetAttr("transpose_x", op_info->GetAttr<bool>("trans_x"));
+      op_desc.SetAttr("transpose_w", op_info->GetAttr<bool>("trans_y"));
     }
+    if (op_info->HasAttr("alpha")) {
+      op_desc.SetAttr("alpha", op_info->GetAttr<float>("alpha"));
+    }

     std::string max_output_name = output_name + "_xpu_max";
@@ -246,7 +261,7 @@
   bool per_tensor = true;
   CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size();
   auto first = weight_max[0];
-  for (int i = 1; i < weight_max.size(); ++i) {
+  for (size_t i = 1; i < weight_max.size(); ++i) {
     if (std::abs(first - weight_max[i]) > 1e-6) {
       per_tensor = false;
       break;
@@ -263,7 +278,7 @@ class XPUFcFusePass : public ProgramPass {
   void Apply(const std::unique_ptr<SSAGraph>& graph) override {
     if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
     // TODO(weihaoji) support with_no_bias and more activation types
-    for (auto with_bias : {true, /*false*/}) {
+    for (auto with_bias : {true, false}) {
       for (auto act_type : {"relu",
                             "gelu",
                             /*"sigmoid",
@@ -273,7 +288,7 @@
                             "hard_sigmoid",
                             "relu6",*/
                             "linear"}) {
-        for (auto mul_type : {"mul", "matmul_v2"}) {
+        for (auto mul_type : {"mul", "matmul", "matmul_v2"}) {
          fusion::XPUFcFuser fuser(with_bias, act_type, mul_type);
          fuser(graph.get());
        }
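
For reference, the per-tensor/per-channel decision introduced above (the weight_max loop) reduces to a small standalone predicate. A minimal sketch; IsPerTensor is an illustrative name, not part of this patch:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Quantization is effectively per-tensor iff every channel-wise max
    // matches the first one within a 1e-6 tolerance, mirroring the fuser.
    bool IsPerTensor(const std::vector<float>& weight_max) {
      for (std::size_t i = 1; i < weight_max.size(); ++i) {
        if (std::abs(weight_max[0] - weight_max[i]) > 1e-6) return false;
      }
      return true;
    }
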
7 changes: 6 additions & 1 deletion lite/core/optimizer/mir/variable_place_inference_pass.h
@@ -176,7 +176,8 @@ class VariablePlaceInferencePass : public DebugPass {
           *var_type = decl_type;
         }
       } else if (!(*var_type)->place().is_valid()) {
-        if (var.is_weight && with_targets["kMetal"]) {
+        if (var.is_weight &&
+            (with_targets["kMetal"] || with_targets["kXPU"])) {
           SetWeightType(in_node, **var_type, with_targets);
         } else if (decl_type->precision() == PRECISION(kInt8) ||
                    (decl_type->precision() == PRECISION(kFP16) &&
@@ -218,6 +219,10 @@
         } else {
           UpdateTypeFrom(var_type, decl_type);
         }
+        if (with_targets["kXPU"]) {
+          var.is_weight = false;
+          var.is_persist = false;
+        }
       }
     }
   }
32 changes: 12 additions & 20 deletions lite/core/program.cc
@@ -753,26 +753,6 @@ void Program::PrepareWorkspace(
     const auto& var_type = var_desc->GetType();
     VLOG(4) << "Var " << var_name << " in block " << block_idx;
     VLOG(4) << " - type " << static_cast<int>(var_type);
-
-#if defined(LITE_WITH_XPU) || defined(LITE_WITH_CUDA)
-    if (!var_desc->Persistable()) {
-#endif
-      // Collect precision info into var_type_map_
-      if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) {
-        const auto& var_data_type =
-            VarDescType2PrecisionType(var_desc->GetDataType());
-        if (var_data_type != PRECISION(kUnk)) {
-          var_type_map_[var_name] = LiteType::GetTensorTy(
-              TARGET(kUnk), var_data_type, DATALAYOUT(kUnk));
-        }
-        VLOG(4) << " - data type " << static_cast<int>(var_data_type);
-      } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) {
-        var_type_map_[var_name] = LiteType::GetTensorListTy(
-            TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk));
-      }
-#if defined(LITE_WITH_XPU) || defined(LITE_WITH_CUDA)
-    }
-#endif
     // Create tensors or weights from variable description.
     if (!var_desc->Persistable()) {
       vars_.push_back(var_name);
@@ -806,6 +786,18 @@
         var->GetMutable<std::vector<lite::Scope*>>();
       }
     } else {
+      if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) {
+        const auto& var_data_type =
+            VarDescType2PrecisionType(var_desc->GetDataType());
+        if (var_data_type != PRECISION(kUnk)) {
+          var_type_map_[var_name] = LiteType::GetTensorTy(
+              TARGET(kUnk), var_data_type, DATALAYOUT(kUnk));
+        }
+        VLOG(4) << " - data type " << static_cast<int>(var_data_type);
+      } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) {
+        var_type_map_[var_name] = LiteType::GetTensorListTy(
+            TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk));
+      }
       if (var_name == "feed" || var_name == "fetch") continue;
       weights_.push_back(var_name);
       scope_->Var(var_name);
3 changes: 3 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -67,6 +67,8 @@ add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc)
 add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc)
 add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc)
 add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc)
+add_kernel(sequence_expand_compute_xpu XPU extra SRCS sequence_expand_compute.cc)
+add_kernel(sequence_softmax_compute_xpu XPU extra SRCS sequence_softmax_compute.cc)
 add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc)
 add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc)
 add_kernel(search_grnn_compute_xpu XPU extra SRCS search_grnn_compute.cc)
@@ -101,6 +103,7 @@ add_kernel(is_empty_compute_xpu XPU extra SRCS is_empty_compute.cc)
 add_kernel(shape_compute_xpu XPU extra SRCS shape_compute.cc)
 add_kernel(lod_array_length_compute_xpu XPU extra SRCS lod_array_length_compute.cc)
 add_kernel(multiclass_nms_compute_xpu XPU extra SRCS multiclass_nms_compute.cc)
+add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)

 # extra(fused kernel)
 add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)
11 changes: 9 additions & 2 deletions lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc
@@ -32,6 +32,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() {
     table_lens_cpu_.push_back(table_dims[0]);
     arg_tables_.push_back(table->data<float>());
   }
+
+  padding_idx_ = static_cast<int>(param.padding_idx);
+
+  if (GetBoolFromEnv("XPU_PADDING_IDX", true)) {
+    padding_idx_ = -1;
+  }
+  VLOG(3) << "model padding_idx: " << param.padding_idx
+          << ", xpu padding_idx: " << padding_idx_;
 }

 void XPUEmbeddingWithEltwiseAddCompute::Run() {
@@ -101,8 +109,7 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() {
       table_lens_cpu_,
       embed_dim,
       std::vector<float>(table_lens_cpu_.size(), 1.0f),
-      std::vector<int>(table_lens_cpu_.size(),
-                       static_cast<int>(param.padding_idx)));
+      std::vector<int>(table_lens_cpu_.size(), padding_idx_));
   CHECK_EQ(r, 0);
 }

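
The PrepareForRun change above is an environment-variable override: unless XPU_PADDING_IDX is explicitly turned off, the kernel ignores the model's padding_idx and passes -1 to the XPU runtime. A minimal standalone sketch of that logic, assuming "unset or non-zero means true" semantics; GetBoolFromEnvSketch and ResolvePaddingIdx are illustrative names, not PaddleLite APIs:

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    // Hypothetical stand-in for PaddleLite's GetBoolFromEnv helper.
    bool GetBoolFromEnvSketch(const char* name, bool default_value) {
      const char* v = std::getenv(name);
      if (v == nullptr) return default_value;
      return std::strcmp(v, "0") != 0 && std::strcmp(v, "false") != 0;
    }

    int ResolvePaddingIdx(int64_t model_padding_idx) {
      // The default is true, so padding_idx is forced to -1 (no padding
      // row) unless the user exports XPU_PADDING_IDX=0.
      if (GetBoolFromEnvSketch("XPU_PADDING_IDX", true)) return -1;
      return static_cast<int>(model_padding_idx);
    }
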
lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h
@@ -35,6 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute
  private:
   std::vector<const float*> arg_tables_;
   std::vector<int> table_lens_cpu_;
+  int padding_idx_;
 };

 }  // namespace xpu
19 changes: 11 additions & 8 deletions lite/kernels/xpu/__xpu__fc_compute.cc
@@ -51,7 +51,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
             reinterpret_cast<const int8_t*>(w_ptr),
             weight_dims,
-            w_trans,
+            !w_trans,
             per_channel_ ? param.weight_max.size() : max_ptr_size);
     CHECK(xpu_quant_weight_.max_ptr_ != nullptr)
         << "slim int8 quant xpu_quant_weight_max_ptr shouldn't be null";
@@ -83,7 +83,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int16_t, int16_t>(
             reinterpret_cast<const int16_t*>(w_ptr),
             weight_dims,
-            w_trans,
+            !w_trans,
             max_ptr_size);
     std::vector<float> cpu_w_max(max_ptr_size, param.weight_max[0]);
     CHECK(xpu_quant_weight_.max_ptr_ != nullptr)
@@ -102,7 +102,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {

     xpu_quant_weight_ =
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, TW>(
-            w_ptr, weight_dims, w_trans, max_ptr_size);
+            w_ptr, weight_dims, !w_trans, max_ptr_size);
     if (std::is_same<TW, float>::value) {
       VLOG(6) << "If fc compute precision is int31, weight max "
                  "should be null";
@@ -133,8 +133,11 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::Run() {

   bool x_trans = param.transpose_x;
   bool w_trans = param.transpose_w;
+  if (w_trans) {
+    n = param.w->dims()[0];
+  }
   int ldx = (x_trans ? m : k);
-  int ldw = (w_trans ? k : n);
+  int ldw = k;
   int ldy = n;

   float* output_max =
@@ -168,14 +171,14 @@
         n,            // n
         k,            // k
         x_trans,      // x_trans
-        w_trans,      // w_trans
+        true,         // w_trans
         input_max,    // x_maxptr
         nullptr,      // w_maxptr
         output_max,   // y_maxptr
         ldx,          // ldx
         ldw,          // ldw
         ldy,          // ldy
-        1.0f,         // alpha
+        param.alpha,  // alpha
         0.0f,         // beta
         bias,         // bias
         reinterpret_cast<const float*>(
@@ -191,14 +194,14 @@
         n,            // n
         k,            // k
         x_trans,      // x_trans
-        w_trans,      // w_trans
+        true,         // w_trans
         input_max,    // x_maxptr
         reinterpret_cast<const float*>(xpu_quant_weight_.max_ptr_),  // w_maxptr
         output_max,   // y_maxptr
         ldx,          // ldx
         ldw,          // ldw
         ldy,          // ldy
-        1.0f,         // alpha
+        param.alpha,  // alpha
         0.0f,         // beta
         bias,         // bias
         act);
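
Taken together, the PrepareForRun and Run changes in this file enforce one convention: the weight is always materialized in transposed [n, k] layout (hence the conversion with !w_trans), so the GEMM can always be invoked with w_trans = true and ldw = k, now also passing the model's alpha instead of a hard-coded 1.0f. A naive sketch of the product Run now requests, to show the layout; this is illustrative only, not the xdnn kernel:

    #include <vector>

    // y[m x n] = x[m x k] * transpose(w_nk), where w_nk is the
    // pre-transposed weight stored row-major as [n, k] with ldw == k.
    std::vector<float> FcWithPreTransposedWeight(const std::vector<float>& x,
                                                 const std::vector<float>& w_nk,
                                                 int m, int n, int k) {
      std::vector<float> y(m * n, 0.0f);
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          float acc = 0.0f;
          for (int p = 0; p < k; ++p) {
            // Row j of w_nk is column j of the logical [k, n] weight.
            acc += x[i * k + p] * w_nk[j * k + p];
          }
          y[i * n + j] = acc;
        }
      }
      return y;
    }
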
47 changes: 32 additions & 15 deletions lite/kernels/xpu/concat_compute.cc
@@ -38,28 +38,43 @@ void ConcatCompute<InType, PType>::Run() {

   std::vector<const InType*> x_list;
   std::vector<std::vector<int>> xdims_list;
-  for (int i = 0; i < ins.size(); i++) {
+  for (size_t i = 0; i < ins.size(); i++) {
     if (ins[i]->numel() > 0) {
       xdims_list.push_back(std::vector<int>());
-      for (int j = 0; j < ins[i]->dims().size(); j++) {
+      for (size_t j = 0; j < ins[i]->dims().size(); j++) {
         xdims_list.back().push_back(ins[i]->dims()[j]);
       }
-      if (sizeof(InType) == 8) {
+      if (std::is_same<InType, int64_t>::value) {
         xdims_list[i].back() = xdims_list[i].back() * 2;
       }
-      x_list.push_back(
-          reinterpret_cast<const InType*>(ins[i]->template data<InType>()));
+      x_list.push_back(ins[i]->template data<InType>());
     }
   }
   if (x_list.size() > 1) {
-    int r = xdnn::concat<InType>(
-        ctx.GetRawContext(),
-        x_list,
-        reinterpret_cast<InType*>(
-            out->template mutable_data<InType>(TARGET(kXPU))),
-        xdims_list,
-        axis);
+    int r = 0;
+    // int64 is not supported on XPU1; use float instead.
+    if (std::is_same<InType, int64_t>::value) {
+      std::vector<const float*> x_list_f(x_list.size());
+      for (size_t i = 0; i < x_list.size(); ++i) {
+        x_list_f[i] = reinterpret_cast<const float*>(x_list[i]);
+      }
+      r = xdnn::concat<float>(
+          ctx.GetRawContext(),
+          x_list_f,
+          reinterpret_cast<float*>(
+              out->template mutable_data<InType>(TARGET(kXPU))),
+          xdims_list,
+          axis);
+    } else {
+      r = xdnn::concat<InType>(
+          ctx.GetRawContext(),
+          x_list,
+          reinterpret_cast<InType*>(
+              out->template mutable_data<InType>(TARGET(kXPU))),
+          xdims_list,
+          axis);
+    }
     CHECK_EQ(r, 0);
   } else if (x_list.size() == 1) {
     int r = xdnn::copy<InType>(ctx.GetRawContext(),
@@ -87,6 +102,7 @@ using concati32 =
     paddle::lite::kernels::xpu::ConcatCompute<int, PRECISION(kFloat)>;
 using concati64 =
     paddle::lite::kernels::xpu::ConcatCompute<int64_t, PRECISION(kFloat)>;
+
 REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
     .BindInput("AxisTensor",
@@ -101,21 +117,22 @@ REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16)
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16)
+REGISTER_LITE_KERNEL(
+    concat, kXPU, kInt16, kNCHW, concati16, DISABLE_XPU1_concat_INT16)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32)
+REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concati32, concat_INT32)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64)
+REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concati64, concat_INT64)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
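
The int64 path above is a layout trick rather than real 64-bit arithmetic: one int64 element is exactly two 32-bit words, so after doubling the innermost dimension a float concat kernel can move the bytes untouched. A self-contained host-side sketch of the same equivalence, with memcpy standing in for xdnn::concat:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main() {
      static_assert(sizeof(int64_t) == 2 * sizeof(float),
                    "width-doubling assumes 8-byte int64 and 4-byte float");
      std::vector<int64_t> a = {1, -2, 3};
      std::vector<int64_t> b = {4, 5};
      // Shapes seen by a float kernel: [3 * 2] and [2 * 2]. Concat along
      // axis 0 is then a plain byte copy, which is all the kernel does.
      std::vector<int64_t> out(a.size() + b.size());
      std::memcpy(out.data(), a.data(), a.size() * sizeof(int64_t));
      std::memcpy(out.data() + a.size(), b.data(), b.size() * sizeof(int64_t));
      for (int64_t v : out) std::cout << v << ' ';  // prints: 1 -2 3 4 5
      std::cout << '\n';
      return 0;
    }
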
13 changes: 8 additions & 5 deletions lite/kernels/xpu/gru_compute.cc
@@ -56,15 +56,18 @@ void GRUCompute::PrepareForRun() {
       paddle::lite::xpu::math::FindMaxAbs(weight_s1_ptr, weight_s1_len);
   weight_s2_abs_max_ =
       paddle::lite::xpu::math::FindMaxAbs(weight_s2_ptr, weight_s2_len);
-  std::vector<float> weight_max_vector(8);
-  for (int i = 0; i < 4; i++) {
+  auto& ctx = this->ctx_->template As<XPUContext>();
+  int max_ptr_size = ctx.GetRawContext()->max_ptr_size();
+  std::vector<float> weight_max_vector(max_ptr_size * 2);
+  for (int i = 0; i < max_ptr_size; i++) {
     weight_max_vector[i] = weight_s1_abs_max_;
-    weight_max_vector[i + 4] = weight_s2_abs_max_;
+    weight_max_vector[i + max_ptr_size] = weight_s2_abs_max_;
   }
-  weight_max_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float));
+  weight_max_guard_ =
+      TargetWrapperXPU::MallocScratchPad(max_ptr_size * 2 * sizeof(float));
   XPU_CALL(xpu_memcpy(reinterpret_cast<float*>(weight_max_guard_->addr_),
                       weight_max_vector.data(),
-                      8 * sizeof(float),
+                      max_ptr_size * 2 * sizeof(float),
                       XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   // quant
   quant_weight_guard_ =
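
The resized max buffer above replaces the hard-coded 8-float scratchpad: each scalar abs-max is replicated across a hardware-defined block of max_ptr_size floats, one block per weight half. A minimal sketch of the buffer construction; BuildGruWeightMax is an illustrative name, and in the kernel max_ptr_size comes from the raw XPU context:

    #include <algorithm>
    #include <vector>

    std::vector<float> BuildGruWeightMax(float s1_abs_max,
                                         float s2_abs_max,
                                         int max_ptr_size) {
      std::vector<float> v(2 * max_ptr_size);
      // First block: abs-max of weight half 1; second block: half 2.
      std::fill(v.begin(), v.begin() + max_ptr_size, s1_abs_max);
      std::fill(v.begin() + max_ptr_size, v.end(), s2_abs_max);
      return v;
    }
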