[Cherry-pick][XPU/2.12] cherry-pick (#9601)
* [XPU] add seq_softmax, seq_expand, lod_reset op in xpu (#9453)

* [XPU] always set padding_idx to -1 for XPUEmbeddingWithEltwiseAddCompute (#9465)

* [XPU] Fix error when running arm target on xpu (#9466)

* [XPU] support matmul/matmul_v2's y_trans=true in __xpu__fc__op (#9427)

Co-authored-by: AlbertVan <[email protected]>
Co-authored-by: linwei210 <[email protected]>
3 people authored Nov 1, 2022
1 parent 433796c commit ce48e33
Showing 24 changed files with 587 additions and 101 deletions.
55 changes: 35 additions & 20 deletions lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc
@@ -36,7 +36,27 @@ class XPUFcFuser : public FuseBase {
   void BuildPattern() override {
     auto* x = VarNode("x")->assert_is_op_input(mul_type_, "X")->AsInput();
     auto* W = VarNode("W")->assert_is_op_input(mul_type_, "Y")->AsInput();
-    auto* mul = OpNode("mul", mul_type_)->AsIntermediate();
+    // check w.dims() == 2 and x.trans == false.
+    auto mul_input_check = [this](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      auto mul_input_y_name = op_desc.Input("Y").front();
+      auto* scope = const_cast<Node*>(node)->AsStmt().op()->scope();
+      auto mul_y_shape = scope->FindMutableTensor(mul_input_y_name)->dims();
+      if (this->mul_type_ == "mul") {
+        return mul_y_shape.size() == 2;
+      }
+      if (this->mul_type_ == "matmul") {
+        return (mul_y_shape.size() == 2 &&
+                !op_desc.GetAttr<bool>("transpose_X"));
+      }
+      if (this->mul_type_ == "matmul_v2") {
+        return (mul_y_shape.size() == 2 && !op_desc.GetAttr<bool>("trans_x"));
+      }
+      return false;
+    };
+    auto* mul = OpNode("mul", mul_type_)
+                    ->AsIntermediate()
+                    ->assert_node_satisfied(mul_input_check);
     auto* mul_out = VarNode("mul_out")->assert_is_op_output(mul_type_, "Out");
     PMNode* bias = nullptr;
     PMNode* add = nullptr;
@@ -194,25 +214,20 @@ class XPUFcFuser : public FuseBase {

     op_desc.SetAttr<int>("in_num_col_dims", -1);
     if (mul_type_ == "mul") {
-      op_desc.SetAttr(
-          "in_num_col_dims",
-          matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
+      op_desc.SetAttr("in_num_col_dims",
+                      op_info->GetAttr<int>("x_num_col_dims"));
+      // trans_x and trans_y do not exist in the mul op; set them to false.
       op_desc.SetAttr("transpose_x", false);
-      op_desc.SetAttr("transpose_w", true);
+      op_desc.SetAttr("transpose_w", false);
     } else if (mul_type_ == "matmul") {
-      op_desc.SetAttr(
-          "transpose_x",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("transpose_X"));
-      op_desc.SetAttr(
-          "transpose_w",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("transpose_Y"));
+      op_desc.SetAttr("transpose_x", op_info->GetAttr<bool>("transpose_X"));
+      op_desc.SetAttr("transpose_w", op_info->GetAttr<bool>("transpose_Y"));
     } else {
-      op_desc.SetAttr(
-          "transpose_x",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("trans_x"));
-      op_desc.SetAttr(
-          "transpose_w",
-          matched.at("mul")->stmt()->op_info()->GetAttr<bool>("trans_y"));
+      op_desc.SetAttr("transpose_x", op_info->GetAttr<bool>("trans_x"));
+      op_desc.SetAttr("transpose_w", op_info->GetAttr<bool>("trans_y"));
     }
+    if (op_info->HasAttr("alpha")) {
+      op_desc.SetAttr("alpha", op_info->GetAttr<float>("alpha"));
+    }

     std::string max_output_name = output_name + "_xpu_max";
@@ -246,7 +261,7 @@
   bool per_tensor = true;
   CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size();
   auto first = weight_max[0];
-  for (int i = 1; i < weight_max.size(); ++i) {
+  for (size_t i = 1; i < weight_max.size(); ++i) {
     if (std::abs(first - weight_max[i]) > 1e-6) {
       per_tensor = false;
       break;
@@ -263,7 +278,7 @@ class XPUFcFusePass : public ProgramPass {
   void Apply(const std::unique_ptr<SSAGraph>& graph) override {
     if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
     // TODO(weihaoji) support with_no_bias and more activation types
-    for (auto with_bias : {true, /*false*/}) {
+    for (auto with_bias : {true, false}) {
       for (auto act_type : {"relu",
                             "gelu",
                             /*"sigmoid",
@@ -273,7 +288,7 @@
                             "hard_sigmoid",
                             "relu6",*/
                             "linear"}) {
-        for (auto mul_type : {"mul", "matmul_v2"}) {
+        for (auto mul_type : {"mul", "matmul", "matmul_v2"}) {
          fusion::XPUFcFuser fuser(with_bias, act_type, mul_type);
          fuser(graph.get());
        }
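
For reference, the per-tensor/per-channel decision introduced above (the weight_max loop) reduces to a small standalone predicate. A minimal sketch; IsPerTensor is an illustrative name, not part of this patch:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Quantization is effectively per-tensor iff every channel-wise max
    // matches the first one within a 1e-6 tolerance, mirroring the fuser.
    bool IsPerTensor(const std::vector<float>& weight_max) {
      for (std::size_t i = 1; i < weight_max.size(); ++i) {
        if (std::abs(weight_max[0] - weight_max[i]) > 1e-6) return false;
      }
      return true;
    }
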
7 changes: 6 additions & 1 deletion lite/core/optimizer/mir/variable_place_inference_pass.h
@@ -176,7 +176,8 @@ class VariablePlaceInferencePass : public DebugPass {
           *var_type = decl_type;
         }
       } else if (!(*var_type)->place().is_valid()) {
-        if (var.is_weight && with_targets["kMetal"]) {
+        if (var.is_weight &&
+            (with_targets["kMetal"] || with_targets["kXPU"])) {
           SetWeightType(in_node, **var_type, with_targets);
         } else if (decl_type->precision() == PRECISION(kInt8) ||
                    (decl_type->precision() == PRECISION(kFP16) &&
@@ -218,6 +219,10 @@
         } else {
           UpdateTypeFrom(var_type, decl_type);
         }
+        if (with_targets["kXPU"]) {
+          var.is_weight = false;
+          var.is_persist = false;
+        }
       }
     }
   }
32 changes: 12 additions & 20 deletions lite/core/program.cc
@@ -753,26 +753,6 @@ void Program::PrepareWorkspace(
     const auto& var_type = var_desc->GetType();
     VLOG(4) << "Var " << var_name << " in block " << block_idx;
     VLOG(4) << " - type " << static_cast<int>(var_type);
-
-#if defined(LITE_WITH_XPU) || defined(LITE_WITH_CUDA)
-    if (!var_desc->Persistable()) {
-#endif
-      // Collect precision info into var_type_map_
-      if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) {
-        const auto& var_data_type =
-            VarDescType2PrecisionType(var_desc->GetDataType());
-        if (var_data_type != PRECISION(kUnk)) {
-          var_type_map_[var_name] = LiteType::GetTensorTy(
-              TARGET(kUnk), var_data_type, DATALAYOUT(kUnk));
-        }
-        VLOG(4) << " - data type " << static_cast<int>(var_data_type);
-      } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) {
-        var_type_map_[var_name] = LiteType::GetTensorListTy(
-            TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk));
-      }
-#if defined(LITE_WITH_XPU) || defined(LITE_WITH_CUDA)
-    }
-#endif
     // Create tensors or weights from variable description.
     if (!var_desc->Persistable()) {
       vars_.push_back(var_name);
@@ -806,6 +786,18 @@
         var->GetMutable<std::vector<lite::Scope*>>();
       }
     } else {
+      if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) {
+        const auto& var_data_type =
+            VarDescType2PrecisionType(var_desc->GetDataType());
+        if (var_data_type != PRECISION(kUnk)) {
+          var_type_map_[var_name] = LiteType::GetTensorTy(
+              TARGET(kUnk), var_data_type, DATALAYOUT(kUnk));
+        }
+        VLOG(4) << " - data type " << static_cast<int>(var_data_type);
+      } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) {
+        var_type_map_[var_name] = LiteType::GetTensorListTy(
+            TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk));
+      }
       if (var_name == "feed" || var_name == "fetch") continue;
       weights_.push_back(var_name);
       scope_->Var(var_name);
3 changes: 3 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -67,6 +67,8 @@ add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc)
 add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc)
 add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc)
 add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc)
+add_kernel(sequence_expand_compute_xpu XPU extra SRCS sequence_expand_compute.cc)
+add_kernel(sequence_softmax_compute_xpu XPU extra SRCS sequence_softmax_compute.cc)
 add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc)
 add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc)
 add_kernel(search_grnn_compute_xpu XPU extra SRCS search_grnn_compute.cc)
@@ -101,6 +103,7 @@ add_kernel(is_empty_compute_xpu XPU extra SRCS is_empty_compute.cc)
 add_kernel(shape_compute_xpu XPU extra SRCS shape_compute.cc)
 add_kernel(lod_array_length_compute_xpu XPU extra SRCS lod_array_length_compute.cc)
 add_kernel(multiclass_nms_compute_xpu XPU extra SRCS multiclass_nms_compute.cc)
+add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)

 # extra(fused kernel)
 add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)
11 changes: 9 additions & 2 deletions lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc
@@ -32,6 +32,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() {
     table_lens_cpu_.push_back(table_dims[0]);
     arg_tables_.push_back(table->data<float>());
   }
+
+  padding_idx_ = static_cast<int>(param.padding_idx);
+
+  if (GetBoolFromEnv("XPU_PADDING_IDX", true)) {
+    padding_idx_ = -1;
+  }
+  VLOG(3) << "model padding_idx: " << param.padding_idx
+          << ", xpu padding_idx: " << padding_idx_;
 }

 void XPUEmbeddingWithEltwiseAddCompute::Run() {
@@ -101,8 +109,7 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() {
       table_lens_cpu_,
       embed_dim,
       std::vector<float>(table_lens_cpu_.size(), 1.0f),
-      std::vector<int>(table_lens_cpu_.size(),
-                       static_cast<int>(param.padding_idx)));
+      std::vector<int>(table_lens_cpu_.size(), padding_idx_));
   CHECK_EQ(r, 0);
 }

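
The PrepareForRun change above is an environment-variable override: unless XPU_PADDING_IDX is explicitly turned off, the kernel ignores the model's padding_idx and passes -1 to the XPU runtime. A minimal standalone sketch of that logic, assuming "unset or non-zero means true" semantics; GetBoolFromEnvSketch and ResolvePaddingIdx are illustrative names, not PaddleLite APIs:

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    // Hypothetical stand-in for PaddleLite's GetBoolFromEnv helper.
    bool GetBoolFromEnvSketch(const char* name, bool default_value) {
      const char* v = std::getenv(name);
      if (v == nullptr) return default_value;
      return std::strcmp(v, "0") != 0 && std::strcmp(v, "false") != 0;
    }

    int ResolvePaddingIdx(int64_t model_padding_idx) {
      // The default is true, so padding_idx is forced to -1 (no padding
      // row) unless the user exports XPU_PADDING_IDX=0.
      if (GetBoolFromEnvSketch("XPU_PADDING_IDX", true)) return -1;
      return static_cast<int>(model_padding_idx);
    }
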
lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h
@@ -35,6 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute
  private:
   std::vector<const float*> arg_tables_;
   std::vector<int> table_lens_cpu_;
+  int padding_idx_;
 };

 }  // namespace xpu
19 changes: 11 additions & 8 deletions lite/kernels/xpu/__xpu__fc_compute.cc
@@ -51,7 +51,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
             reinterpret_cast<const int8_t*>(w_ptr),
             weight_dims,
-            w_trans,
+            !w_trans,
             per_channel_ ? param.weight_max.size() : max_ptr_size);
     CHECK(xpu_quant_weight_.max_ptr_ != nullptr)
         << "slim int8 quant xpu_quant_weight_max_ptr shouldn't be null";
@@ -83,7 +83,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int16_t, int16_t>(
             reinterpret_cast<const int16_t*>(w_ptr),
             weight_dims,
-            w_trans,
+            !w_trans,
             max_ptr_size);
     std::vector<float> cpu_w_max(max_ptr_size, param.weight_max[0]);
     CHECK(xpu_quant_weight_.max_ptr_ != nullptr)
@@ -102,7 +102,7 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::PrepareForRun() {

     xpu_quant_weight_ =
         TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, TW>(
-            w_ptr, weight_dims, w_trans, max_ptr_size);
+            w_ptr, weight_dims, !w_trans, max_ptr_size);
     if (std::is_same<TW, float>::value) {
       VLOG(6) << "If fc compute precision is int31, weight max "
                  "should be null";
@@ -133,8 +133,11 @@ void XPUFcCompute<TGEMM, TW, DX, DY, PType>::Run() {

   bool x_trans = param.transpose_x;
   bool w_trans = param.transpose_w;
+  if (w_trans) {
+    n = param.w->dims()[0];
+  }
   int ldx = (x_trans ? m : k);
-  int ldw = (w_trans ? k : n);
+  int ldw = k;
   int ldy = n;

   float* output_max =
@@ -168,14 +171,14 @@
         n,            // n
         k,            // k
         x_trans,      // x_trans
-        w_trans,      // w_trans
+        true,         // w_trans
         input_max,    // x_maxptr
         nullptr,      // w_maxptr
         output_max,   // y_maxptr
         ldx,          // ldx
         ldw,          // ldw
         ldy,          // ldy
-        1.0f,         // alpha
+        param.alpha,  // alpha
         0.0f,         // beta
         bias,         // bias
         reinterpret_cast<const float*>(
@@ -191,14 +194,14 @@
         n,            // n
         k,            // k
         x_trans,      // x_trans
-        w_trans,      // w_trans
+        true,         // w_trans
         input_max,    // x_maxptr
         reinterpret_cast<const float*>(xpu_quant_weight_.max_ptr_),  // w_maxptr
         output_max,   // y_maxptr
         ldx,          // ldx
         ldw,          // ldw
         ldy,          // ldy
-        1.0f,         // alpha
+        param.alpha,  // alpha
         0.0f,         // beta
         bias,         // bias
         act);
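
Taken together, the PrepareForRun and Run changes in this file enforce one convention: the weight is always materialized in transposed [n, k] layout (hence the conversion with !w_trans), so the GEMM can always be invoked with w_trans = true and ldw = k, now also passing the model's alpha instead of a hard-coded 1.0f. A naive sketch of the product Run now requests, to show the layout; this is illustrative only, not the xdnn kernel:

    #include <vector>

    // y[m x n] = x[m x k] * transpose(w_nk), where w_nk is the
    // pre-transposed weight stored row-major as [n, k] with ldw == k.
    std::vector<float> FcWithPreTransposedWeight(const std::vector<float>& x,
                                                 const std::vector<float>& w_nk,
                                                 int m, int n, int k) {
      std::vector<float> y(m * n, 0.0f);
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          float acc = 0.0f;
          for (int p = 0; p < k; ++p) {
            // Row j of w_nk is column j of the logical [k, n] weight.
            acc += x[i * k + p] * w_nk[j * k + p];
          }
          y[i * n + j] = acc;
        }
      }
      return y;
    }
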
47 changes: 32 additions & 15 deletions lite/kernels/xpu/concat_compute.cc
@@ -38,28 +38,43 @@ void ConcatCompute<InType, PType>::Run() {

   std::vector<const InType*> x_list;
   std::vector<std::vector<int>> xdims_list;
-  for (int i = 0; i < ins.size(); i++) {
+  for (size_t i = 0; i < ins.size(); i++) {
     if (ins[i]->numel() > 0) {
       xdims_list.push_back(std::vector<int>());
-      for (int j = 0; j < ins[i]->dims().size(); j++) {
+      for (size_t j = 0; j < ins[i]->dims().size(); j++) {
         xdims_list.back().push_back(ins[i]->dims()[j]);
       }
-      if (sizeof(InType) == 8) {
+      if (std::is_same<InType, int64_t>::value) {
         xdims_list[i].back() = xdims_list[i].back() * 2;
       }
-      x_list.push_back(
-          reinterpret_cast<const InType*>(ins[i]->template data<InType>()));
+      x_list.push_back(ins[i]->template data<InType>());
     }
   }
   if (x_list.size() > 1) {
-    int r = xdnn::concat<InType>(
-        ctx.GetRawContext(),
-        x_list,
-        reinterpret_cast<InType*>(
-            out->template mutable_data<InType>(TARGET(kXPU))),
-        xdims_list,
-        axis);
+    int r = 0;
+    // int64 is not supported on XPU1; use float instead.
+    if (std::is_same<InType, int64_t>::value) {
+      std::vector<const float*> x_list_f(x_list.size());
+      for (size_t i = 0; i < x_list.size(); ++i) {
+        x_list_f[i] = reinterpret_cast<const float*>(x_list[i]);
+      }
+      r = xdnn::concat<float>(
+          ctx.GetRawContext(),
+          x_list_f,
+          reinterpret_cast<float*>(
+              out->template mutable_data<InType>(TARGET(kXPU))),
+          xdims_list,
+          axis);
+    } else {
+      r = xdnn::concat<InType>(
+          ctx.GetRawContext(),
+          x_list,
+          reinterpret_cast<InType*>(
+              out->template mutable_data<InType>(TARGET(kXPU))),
+          xdims_list,
+          axis);
+    }
     CHECK_EQ(r, 0);
   } else if (x_list.size() == 1) {
     int r = xdnn::copy<InType>(ctx.GetRawContext(),
@@ -87,6 +102,7 @@ using concati32 =
     paddle::lite::kernels::xpu::ConcatCompute<int, PRECISION(kFloat)>;
 using concati64 =
     paddle::lite::kernels::xpu::ConcatCompute<int64_t, PRECISION(kFloat)>;
+
 REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
     .BindInput("AxisTensor",
@@ -101,21 +117,22 @@ REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16)
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16)
+REGISTER_LITE_KERNEL(
+    concat, kXPU, kInt16, kNCHW, concati16, DISABLE_XPU1_concat_INT16)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32)
+REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concati32, concat_INT32)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
     .Finalize();

-REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64)
+REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concati64, concat_INT64)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
     .BindInput("AxisTensor",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
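
The int64 path above is a layout trick rather than real 64-bit arithmetic: one int64 element is exactly two 32-bit words, so after doubling the innermost dimension a float concat kernel can move the bytes untouched. A self-contained host-side sketch of the same equivalence, with memcpy standing in for xdnn::concat:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main() {
      static_assert(sizeof(int64_t) == 2 * sizeof(float),
                    "width-doubling assumes 8-byte int64 and 4-byte float");
      std::vector<int64_t> a = {1, -2, 3};
      std::vector<int64_t> b = {4, 5};
      // Shapes seen by a float kernel: [3 * 2] and [2 * 2]. Concat along
      // axis 0 is then a plain byte copy, which is all the kernel does.
      std::vector<int64_t> out(a.size() + b.size());
      std::memcpy(out.data(), a.data(), a.size() * sizeof(int64_t));
      std::memcpy(out.data() + a.size(), b.data(), b.size() * sizeof(int64_t));
      for (int64_t v : out) std::cout << v << ' ';  // prints: 1 -2 3 4 5
      std::cout << '\n';
      return 0;
    }
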
13 changes: 8 additions & 5 deletions lite/kernels/xpu/gru_compute.cc
@@ -56,15 +56,18 @@ void GRUCompute::PrepareForRun() {
       paddle::lite::xpu::math::FindMaxAbs(weight_s1_ptr, weight_s1_len);
   weight_s2_abs_max_ =
       paddle::lite::xpu::math::FindMaxAbs(weight_s2_ptr, weight_s2_len);
-  std::vector<float> weight_max_vector(8);
-  for (int i = 0; i < 4; i++) {
+  auto& ctx = this->ctx_->template As<XPUContext>();
+  int max_ptr_size = ctx.GetRawContext()->max_ptr_size();
+  std::vector<float> weight_max_vector(max_ptr_size * 2);
+  for (int i = 0; i < max_ptr_size; i++) {
     weight_max_vector[i] = weight_s1_abs_max_;
-    weight_max_vector[i + 4] = weight_s2_abs_max_;
+    weight_max_vector[i + max_ptr_size] = weight_s2_abs_max_;
   }
-  weight_max_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float));
+  weight_max_guard_ =
+      TargetWrapperXPU::MallocScratchPad(max_ptr_size * 2 * sizeof(float));
   XPU_CALL(xpu_memcpy(reinterpret_cast<float*>(weight_max_guard_->addr_),
                       weight_max_vector.data(),
-                      8 * sizeof(float),
+                      max_ptr_size * 2 * sizeof(float),
                       XPUMemcpyKind::XPU_HOST_TO_DEVICE));
   // quant
   quant_weight_guard_ =
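
The resized max buffer above replaces the hard-coded 8-float scratchpad: each scalar abs-max is replicated across a hardware-defined block of max_ptr_size floats, one block per weight half. A minimal sketch of the buffer construction; BuildGruWeightMax is an illustrative name, and in the kernel max_ptr_size comes from the raw XPU context:

    #include <algorithm>
    #include <vector>

    std::vector<float> BuildGruWeightMax(float s1_abs_max,
                                         float s2_abs_max,
                                         int max_ptr_size) {
      std::vector<float> v(2 * max_ptr_size);
      // First block: abs-max of weight half 1; second block: half 2.
      std::fill(v.begin(), v.begin() + max_ptr_size, s1_abs_max);
      std::fill(v.begin() + max_ptr_size, v.end(), s2_abs_max);
      return v;
    }
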