From b8d106e1c57ff6a06d91b5b5c1232cb54b6e47b7 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:02:37 +0800 Subject: [PATCH 01/12] =?UTF-8?q?=E3=80=90GPUPS=E3=80=91Adam=20accessor=20?= =?UTF-8?q?(#43919)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add adam/sharedadam optimzier for gpups;edit optimizer struct;test=develop --- .../distributed/ps/table/ctr_dymf_accessor.cc | 28 +- .../distributed/ps/table/ctr_dymf_accessor.h | 19 +- .../distributed/ps/table/sparse_sgd_rule.cc | 84 ++- .../distributed/ps/table/sparse_sgd_rule.h | 23 + paddle/fluid/distributed/ps/table/table.cc | 1 + .../distributed/ps/wrapper/CMakeLists.txt | 1 + paddle/fluid/distributed/ps/wrapper/fleet.cc | 45 +- .../framework/distributed_strategy.proto | 6 +- paddle/fluid/framework/fleet/CMakeLists.txt | 15 +- paddle/fluid/framework/fleet/heter_context.h | 2 - .../framework/fleet/heter_ps/CMakeLists.txt | 8 +- .../framework/fleet/heter_ps/feature_value.cu | 192 +++++ .../framework/fleet/heter_ps/feature_value.h | 705 ++++++++++++++++++ .../fleet/heter_ps/graph_gpu_ps_table.h | 6 +- .../framework/fleet/heter_ps/hashtable.h | 12 +- .../fleet/heter_ps/hashtable_kernel.cu | 162 ++-- .../framework/fleet/heter_ps/heter_comm.h | 27 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 433 ++++++----- .../fleet/heter_ps/heter_comm_kernel.cu | 173 +++-- .../fleet/heter_ps/heter_comm_kernel.h | 52 +- .../framework/fleet/heter_ps/heter_ps.cc | 43 +- .../framework/fleet/heter_ps/heter_ps.cu | 145 ++-- .../fluid/framework/fleet/heter_ps/heter_ps.h | 25 +- .../framework/fleet/heter_ps/heter_ps_base.h | 20 +- .../fluid/framework/fleet/heter_ps/mem_pool.h | 14 - .../framework/fleet/heter_ps/optimizer.cuh.h | 472 ++++++++++-- .../framework/fleet/heter_ps/optimizer_conf.h | 28 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 310 +++----- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 326 +------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 211 ++++-- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 179 ++--- .../fleet/base/distributed_strategy.py | 15 + python/paddle/distributed/ps/the_one_ps.py | 2 +- .../tests/unittests/test_dist_fleet_ps13.py | 201 +++++ .../test_fleet_distributed_strategy.py | 8 + tools/parallel_UT_rule.py | 3 +- 36 files changed, 2714 insertions(+), 1282 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/feature_value.cu create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index c65eac99acc03..4feee70fed751 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() { _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), _config.embedx_dim()); + common_feature_value.optimizer_name = name; common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim(); common_feature_value.embedx_dim = _config.embedx_dim(); @@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() { if (_config.ctr_accessor_param().show_scale()) { _show_scale = true; } - VLOG(0) << " INTO CtrDymfAccessor::Initialize()"; + VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:" + << common_feature_value.embed_sgd_dim + << " embedx_dim:" << common_feature_value.embedx_dim + << " 
embedx_sgd_dim:" << common_feature_value.embedx_sgd_dim; InitAccessorInfo(); return 0; } @@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() { auto embedx_dim = _config.embedx_dim(); VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim; - _accessor_info.select_dim = 3 + embedx_dim; + _accessor_info.select_dim = 4 + embedx_dim; _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); - _accessor_info.update_dim = 4 + embedx_dim; + _accessor_info.update_dim = 5 + embedx_dim; _accessor_info.update_size = _accessor_info.update_dim * sizeof(float); _accessor_info.mf_size = (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float); @@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) { value[common_feature_value.ClickIndex()] = 0; value[common_feature_value.SlotIndex()] = -1; value[common_feature_value.MfDimIndex()] = -1; - _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(), - value + common_feature_value.EmbedG2SumIndex()); + _embed_sgd_rule->InitValue( + value + common_feature_value.EmbedWIndex(), + value + common_feature_value.EmbedG2SumIndex(), + false); // adam embed init not zero, adagrad embed init zero _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(), value + common_feature_value.EmbedxG2SumIndex(), false); @@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { i++) { os << " " << v[i]; } - // os << " " << common_feature_value.Slot(const_cast(v)) << " " - // << common_feature_value.MfDim(const_cast(v)); auto show = common_feature_value.Show(const_cast(v)); auto click = common_feature_value.Click(const_cast(v)); auto score = ShowClickScore(show, click); + auto mf_dim = int(common_feature_value.MfDim(const_cast(v))); if (score >= _config.embedx_threshold() && param > common_feature_value.EmbedxG2SumIndex()) { - // VLOG(1) << "common_feature_value.EmbedxG2SumIndex():" - // << common_feature_value.EmbedxG2SumIndex(); - // VLOG(1) << "common_feature_value.EmbedxWIndex():" - // << common_feature_value.EmbedxWIndex(); - // VLOG(1) << "common_feature_value.MfDim():" - // << common_feature_value.MfDim(const_cast(v)); for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.EmbedxWIndex() + - common_feature_value.MfDim(const_cast(v)); + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index a360030cb7d3d..b820d617d06ae 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor { int ClickIndex() { return ShowIndex() + 1; } int EmbedWIndex() { return ClickIndex() + 1; } int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - int SlotIndex() { return EmbedG2SumIndex() + 1; } + int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } int MfDimIndex() { return SlotIndex() + 1; } int EmbedxG2SumIndex() { return MfDimIndex() + 1; } - int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; } + int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } + + // 根据mf_dim计算的总长度 + int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_name == "SparseAdamSGDRule") { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 7 + embed_sgd_dim + 
tmp_embedx_sgd_dim + mf_dim; + } + + // 根据mf_dim计算的总byte数 + int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); } float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } @@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor { int embed_sgd_dim; int embedx_dim; int embedx_sgd_dim; + std::string optimizer_name; }; struct CtrDymfPushValue { diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 07562f566d326..014d6e450ab4a 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, float beta1_pow_ = *beta1_pow; float beta2_pow_ = *beta2_pow; - // lr not change in one update lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); for (size_t i = 0; i < _embedding_dim; i++) { // Calculation @@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value, *(sgd + Beta1PowIndex()) = _beta1_decay_rate; *(sgd + Beta2PowIndex()) = _beta2_decay_rate; } + +void SparseSharedAdamSGDRule::LoadConfig( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseSharedAdamSGDRule::UpdateValueWork(float* w, + float* sgd, + const float* grad, + float scale) { + float* gsum = sgd + GSumIndex(); + float* g2sum = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float gsum_ = *gsum; + float g2sum_ = *g2sum; + + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + double sum_gsum = 0.0; + double sum_g2sum = 0.0; + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + double new_gsum = + _beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i]; + double new_g2sum = + _beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon)); + BoundValue(w[i]); + sum_gsum += new_gsum; + sum_g2sum += new_g2sum; + } + // update beta_pow_decay + (*gsum) = sum_gsum / _embedding_dim; + (*g2sum) = sum_g2sum / _embedding_dim; + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseSharedAdamSGDRule::InitValueWork(float* value, + float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + BoundValue(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + BoundValue(value[i]); + } + } + // init rule gsum and g2sum + for (int i = GSumIndex(); i < Beta1PowIndex(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + 
Beta1PowIndex()) = _beta1_decay_rate; + *(sgd + Beta2PowIndex()) = _beta2_decay_rate; +} } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index f62cffdf232e7..4fed331ba93ec 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule { float _beta2_decay_rate; float _ada_epsilon; }; + +class SparseSharedAdamSGDRule : public SparseValueSGDRule { + public: + virtual void LoadConfig(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void UpdateValueWork(float* w, + float* sgd, + const float* push_value, + float scale); + virtual void InitValueWork(float* value, float* sgd, bool zero_init); + virtual size_t Dim() { return 4; } + size_t GSumIndex() { return 0; } + size_t G2SumIndex() { return GSumIndex() + 1; } + size_t Beta1PowIndex() { return G2SumIndex() + 1; } + size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + + protected: + float learning_rate_; + float _beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index cfa286f1c3f7f..3e6d5a9941206 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule); int32_t TableManager::Initialize() { static bool initialized = false; diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt index 8b5457ef9eea5..c9cd883dabb69 100644 --- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library( op_registry fs shell + ps_gpu_wrapper ${RPC_DEPS}) target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index bbefeba559916..3d7190cf55336 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#endif namespace paddle { namespace distributed { @@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, worker_ptr_ = std::shared_ptr( paddle::distributed::PSClientFactory::Create(ps_param)); worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index); +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE + VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer"; + auto* accessor = worker_ptr_->GetTableAccessor(0); + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(ps_param); + ps_gpu_wrapper->SetTableAccessor(accessor); +#endif } } else { VLOG(3) << "Client can be initialized only once"; @@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync( int batch_size = -1; bool batch_size_consist = true; for (auto* input : *inputs) { - int cur_batch_size = + size_t cur_batch_size = input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; if (batch_size == -1) { - batch_size = cur_batch_size; - } else if (batch_size != cur_batch_size) { + batch_size = int(cur_batch_size); + } else if (batch_size != int(cur_batch_size)) { // CHECK(batch_size == cur_batch_size); // NOLINT batch_size_consist = false; break; @@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync( } CHECK(batch_size > 0); // NOLINT - int show_size = + size_t show_size = shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0]; - CHECK(show_size == batch_size || show_size == 1); - int clk_size = + CHECK(show_size == size_t(batch_size) || show_size == 1); + size_t clk_size = clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0]; - CHECK(clk_size == batch_size || clk_size == 1); + CHECK(clk_size == size_t(batch_size) || clk_size == 1); CHECK(outputs->size() == inputs->size()); std::vector push_keys; @@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } @@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // slot show clk grad... consistent with CtrCommonPushValue defined in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 
0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b3a01ae169e4e..45758389c5413 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -197,14 +197,14 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; - optional SGDParameter embed_sgd_param = 2; - optional SGDParameter embedx_sgd_param = 3; optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size optional uint32 embedx_threshold = 6 [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SGDParameter embed_sgd_param = 10; + optional SGDParameter embedx_sgd_param = 11; } message SGDParameter { @@ -228,7 +228,7 @@ message repeated float weight_bounds = 4; } -message SparseAdamSGDParameter { // SparseAdamSGDRule +message SparseAdamSGDParameter { // SparseAdamSGDRule | SparseSharedAdamSGDRule optional double learning_rate = 1 [ default = 0.001 ]; optional double initial_range = 2 [ default = 0.0001 ]; optional double beta1_decay_rate = 3 [ default = 0.9 ]; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 42235b7c484e3..4cf3ab8dc1a67 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -25,10 +25,17 @@ endif() if(WITH_HETERPS) if(WITH_NCCL AND WITH_GPU) - nv_library( - ps_gpu_wrapper - SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + if(WITH_PSCORE) + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + else() + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + endif() add_subdirectory(heter_ps) elseif(WITH_XPU_KP) xpu_library( diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3955502c8b808..ef2e73d6dd5b5 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -81,7 +81,6 @@ class HeterContext { std::vector> device_values_; std::vector> device_keys_; std::vector>> device_dim_keys_; - std::vector>> device_dim_values_; std::vector mutex_; std::vector> dim_mutex_; int multi_mf_dim_ = 0; @@ -114,7 +113,6 @@ class HeterContext { value_dim_ptr_[i].resize(dim_num); } device_values_.resize(device_num); - device_dim_values_.resize(device_num); device_keys_.resize(device_num); device_dim_keys_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 7540c6147f4b7..9631502f4f05e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -9,16 +9,16 @@ if(WITH_GPU) endif() nv_library( heter_comm_kernel - SRCS heter_comm_kernel.cu feature_value.h + SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) nv_library( hashtable_kernel - SRCS hashtable_kernel.cu feature_value.h + SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS 
${HETERPS_DEPS}) nv_library( heter_comm - SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h - mem_pool.h + SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc + heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) nv_test( test_heter_comm diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu new file mode 100644 index 0000000000000..560ce33b9af78 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -0,0 +1,192 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + +namespace paddle { +namespace framework { + +template +__global__ void PullCopy(float** dest, + const float* src, + const int64_t* len, + int slot_num, + int total_len, + uint64_t** keys, + uint64_t max_val_size, + int* gpu_dim, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index cb7f3a40d6720..ef4533d64eac2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -17,12 +17,547 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + +class FeatureValueAccessor { + public: + __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ ~FeatureValueAccessor() {} + + __host__ __device__ virtual int Configure( + std::unordered_map config) { + _config = config; + Initialize(); + return 0; + } + __host__ __device__ virtual int Initialize() = 0; + + protected: + std::unordered_map _config; +}; + +// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n +// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n +// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n +class CommonFeatureValueAccessor : public FeatureValueAccessor { + public: + struct CommonFeatureValue { + /* + uint64_t cpu_ptr; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } + __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } + __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } + __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } + __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 
0) + __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } + + // 根据mf_dim计算的总长度 + __host__ __device__ int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; + } + + // 根据mf_dim 计算的总byte数 + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float + } + + // 根据mf_dim 计算的 mf_size byte数 + __host__ __device__ size_t MFSize(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); + } + + __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } + __host__ __device__ int EmbedxWOffsetIndex(float* val) { + // has mf + int tmp_embedx_sgd_dim = 1; + if (int(MfSize(val)) > 0) { + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + } else { + // no mf + return 0; + } + } + + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } + __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } + __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; } + __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } + __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } + __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } + __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } + + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + int optimizer_type_; + }; + + struct CommonPushValue { + /* + float slot; + float show; + float click; + float mf_dim; + float embed_g; + std::vector embedx_g; + */ + + __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } + + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int SlotIndex() { return 0; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } + __host__ __device__ float& Slot(float* val) { + return val[CommonPushValue::SlotIndex()]; + } + __host__ __device__ float& Show(float* val) { + return val[CommonPushValue::ShowIndex()]; + } + __host__ 
__device__ float& Click(float* val) { + return val[CommonPushValue::ClickIndex()]; + } + __host__ __device__ float& MfDim(float* val) { + return val[CommonPushValue::MfDimIndex()]; + } + __host__ __device__ float& EmbedG(float* val) { + return val[CommonPushValue::EmbedGIndex()]; + } + __host__ __device__ float* EmbedxG(float* val) { + return val + CommonPushValue::EmbedxGIndex(); + } + }; + + struct CommonPullValue { + /* + float show; + float click; + float embed_w; + std::vector embedx_w; + */ + + __host__ __device__ static int Dim(int embedx_dim) { + return 3 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int EmbedxWIndex() { return 3; } + __host__ __device__ float& Show(float* val) { + return val[CommonPullValue::ShowIndex()]; + } + __host__ __device__ float& Click(float* val) { + return val[CommonPullValue::ClickIndex()]; + } + __host__ __device__ float& EmbedW(float* val) { + return val[CommonPullValue::EmbedWIndex()]; + } + __host__ __device__ float* EmbedxW(float* val) { + return val + CommonPullValue::EmbedxWIndex(); + } + }; + + __host__ __device__ CommonFeatureValueAccessor() {} + __host__ __device__ ~CommonFeatureValueAccessor() {} + + __host__ __device__ virtual int Initialize() { + int optimizer_type = (_config.find("optimizer_type") == _config.end()) + ? 1 + : int(_config["optimizer_type"]); + int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) + ? 8 + : int(_config["embedx_dim"]); + if (optimizer_type == 3) { // adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; + } else if (optimizer_type == 4) { // shared_adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = 4; + } else { + common_feature_value.embed_sgd_dim = 1; + common_feature_value.embedx_sgd_dim = 1; + } + common_feature_value.optimizer_type_ = optimizer_type; + common_feature_value.embedx_dim = sparse_embedx_dim; + + return 0; + } + + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, + void* cpu, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = 
(uint64_t)(cpu); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + gpu_val[x] = 0; + } + } +#endif + } + + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); + } + float* cpu_val = downpour_value->data(); + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = + gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +#endif + } + + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, + float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + 
dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + dest_val[x] = src_val[x]; + } + } + + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } + + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } + + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } + } + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + } + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, + float* src_val, + uint64_t* key, + int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_feature_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_feature_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_feature_value.EmbedWIndex()]; + } + + if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + 
*(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = + src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; + } + } + } + + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { + /* + uint64_t cpu_ptr; // 2float + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + std::stringstream os; + os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; + for (int i = common_feature_value.EmbedG2SumIndex(); + i < common_feature_value.SlotIndex(); + i++) { + os << " " << v[i]; + } + int mf_dim = int(common_feature_value.MfDim(const_cast(v))); + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; + if (param_size > common_feature_value.EmbedxG2SumIndex()) { + for (auto i = common_feature_value.EmbedxG2SumIndex(); + i < common_feature_value.Dim(mf_dim); + ++i) { + os << " " << v[i]; + } + } + return os.str(); + } + + public: + CommonFeatureValue common_feature_value; + CommonPushValue common_push_value; + CommonPullValue common_pull_value; +}; struct FeatureValue { float delta_score; @@ -95,6 +630,176 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.BuildFill( + (float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim); + } + + virtual void 
DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + CopyForPullImpl(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + slot_num, + hidden_size, + total_length, + gpu_dim, + feature_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector, + slot_mf_dim_vector); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 9a6581c2ae5e3..a4bee2c19bbda 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -25,10 +25,12 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index dbd6130c1461d..43192df0c71f0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -137,8 +137,12 @@ class HashTable { size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, + char* d_vals, + size_t len, + StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -150,9 +154,9 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) - template + template void update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index bb9998249048e..2f5d5697e7c38 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, - size_t pull_feature_value_size) { + size_t pull_feature_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; - FeatureValue* cur = (FeatureValue*)(vals + offset); - FeatureValue& input = *(FeatureValue*)(it->second); - cur->slot = input.slot; - cur->show = input.show; - cur->clk = input.clk; - cur->mf_dim = input.mf_dim; - cur->lr = input.lr; - cur->mf_size = input.mf_size; - cur->cpu_ptr = input.cpu_ptr; - cur->delta_score = input.delta_score; - cur->lr_g2sum = input.lr_g2sum; - for (int j = 0; j < cur->mf_dim + 1; ++j) { - cur->mf[j] = input.mf[j]; - } - } else { - if (keys[i] != 0) { - printf("warning::pull miss key: %llu", keys[i]); - } + float* cur = (float*)(vals + offset); + float* input = it->second; + int mf_dim = + int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, input, mf_dim); } } } @@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); + float* cur = (float*)(grads + i * grad_value_size); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur); } else { if (keys[i] != 0) { printf("warning::push miss key: %llu", keys[i]); @@ -212,17 +201,18 @@ void HashTable::get(const KeyType* d_keys, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, size_t len, - StreamType stream) { + StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_); + container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } template @@ -298,27 +288,6 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = 
downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[2] = gpu_val.delta_score; - cpu_val[3] = gpu_val.show; - cpu_val[4] = gpu_val.clk; - cpu_val[5] = gpu_val.lr; - cpu_val[6] = gpu_val.lr_g2sum; - cpu_val[0] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; - } - } #endif } }; @@ -336,9 +305,9 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { } template -template +template void HashTable::update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream) { @@ -371,8 +340,8 @@ void HashTable::update(const KeyType* d_keys, push_grad_value_size_); } -template class HashTable; -template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -382,15 +351,19 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + float* d_vals, + size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, + char* d_vals, + size_t len, + cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, @@ -399,6 +372,12 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + unsigned long* d_vals, + size_t len, + cudaStream_t stream); + template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get(const long* d_keys, @@ -414,19 +393,19 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const float* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, - size_t len, - char* pool, - size_t feature_value_size, - size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + size_t len, + char* pool, + size_t feature_value_size, + size_t start_index, + cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -460,30 +439,37 @@ template void HashTable::insert( size_t len, cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const unsigned long* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable::update< - paddle::framework::FeaturePushValue, - Optimizer, +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); + +template void +HashTable::update( + const unsigned long* d_keys, + const char* d_grads, + size_t len, + SparseAdagradOptimizer sgd, + cudaStream_t stream); +template void +HashTable::update( + const unsigned long* d_keys, + 
const char* d_grads, + size_t len, + SparseAdamOptimizer sgd, + cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeaturePushValue* d_grads, + const char* d_grads, size_t len, - Optimizer sgd, + SparseAdamSharedOptimizer sgd, cudaStream_t stream); -template void HashTable:: - update, - cudaStream_t>(const unsigned long* d_keys, - const char* d_grads, - size_t len, - Optimizer sgd, - cudaStream_t stream); - // template void HashTable::update< // Optimizer +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); @@ -65,12 +68,9 @@ class HeterComm { GradType* d_grads, size_t len, int& uniq_len); // NOLINT - void dynamic_merge_grad(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len); - void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len); + void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, @@ -92,7 +92,7 @@ class HeterComm { template void push_sparse(int num, KeyType* d_keys, - GradType* d_grads, + float* d_grads, size_t len, Sgd& sgd); // NOLINT #elif defined(PADDLE_WITH_XPU_KP) @@ -149,6 +149,13 @@ class HeterComm { multi_mf_dim_ = multi_mf_dim; max_mf_dim_ = max_mf_dim; } + + void set_accessor(FVAccessor& accessor) { + feature_value_accessor_ = accessor; + // for (auto& ptr_table: ptr_tables_) { + // ptr_table->set_accessor(feature_value_accessor_); + // } + } #endif bool need_transfer(int send_id, int receive_id) { @@ -282,9 +289,11 @@ class HeterComm { char* src_val, size_t val_size); + FVAccessor feature_value_accessor_; + protected: using Table = HashTable; - using PtrTable = HashTable; + using PtrTable = HashTable; std::vector tables_; std::vector ptr_tables_; std::shared_ptr resource_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index a7333cd01c6ec..f8657c8e895ad 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -24,8 +24,12 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -template -HeterComm::HeterComm( + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -42,10 +46,14 @@ HeterComm::HeterComm( tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - size_t val_type_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - size_t grad_type_size = TYPEALIGN( - 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size + << ", feature_value_push_size:" << grad_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); @@ -58,8 +66,11 @@ HeterComm::HeterComm( init_path(); } -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -111,14 +122,18 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy(DstPlace dst_place, - void* dst, - SrcPlace src_place, - const void* src, - size_t count, - StreamType stream) { +void HeterComm::memory_copy( + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); if (stream == 0) { @@ -129,11 +144,12 @@ void HeterComm::memory_copy(DstPlace dst_place, #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - int keylen, - int vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, int keylen, int vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -167,9 +183,12 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -184,13 +203,17 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, - int* h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -267,14 +290,18 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest(int start_index, - int gpu_num, - int* h_left, - int* h_right, - KeyType* src_key, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_dest( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -327,13 +354,17 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void 
HeterComm::walk_to_src(int start_index, - int gpu_num, - int* h_left, - int* h_right, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_src( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, + size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -383,8 +414,11 @@ void HeterComm::walk_to_src(int start_index, } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -402,15 +436,22 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; @@ -418,13 +459,20 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -436,8 +484,11 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -449,13 +500,17 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps(int dev_num, - KeyType* h_keys, - ValType* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -518,14 +573,18 @@ void HeterComm::build_ps(int dev_num, } } -template -void HeterComm::build_ps(int num, - KeyType* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -580,8 +639,11 @@ void HeterComm::build_ps(int num, } } -template -void HeterComm::merge_grad( +template +void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, @@ -654,13 +716,12 @@ void HeterComm::merge_grad( sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -668,16 +729,15 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; - // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = 
memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = @@ -772,7 +832,8 @@ void HeterComm::dynamic_merge_grad( uniq_len, grad_value_size, merger_, - stream); + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, @@ -782,8 +843,11 @@ void HeterComm::dynamic_merge_grad( PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( +template +void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, @@ -843,11 +907,12 @@ void HeterComm::split_input_to_shard( sync_stream(stream); } -template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - ValType* d_vals, - size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -893,12 +958,15 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = memory::Alloc(place, len * val_type_size); - ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + float* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -944,7 +1012,8 @@ void HeterComm::pull_sparse(int num, ptr_tables_[i]->get(reinterpret_cast(node.key_storage), node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + resource_->remote_stream(i, num), + feature_value_accessor_); } for (int i = 0; i < total_device; ++i) { @@ -964,10 +1033,16 @@ void HeterComm::pull_sparse(int num, auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_vals, + d_idx_ptr, + len, + val_type_size, + stream, + feature_value_accessor_); sync_stream(stream); + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -977,13 +1052,17 @@ void HeterComm::pull_sparse(int num, } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -991,8 +1070,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t 
grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -1037,8 +1117,7 @@ void HeterComm::push_sparse(int dev_num, KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + float* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); @@ -1048,24 +1127,15 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard( d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - if (!multi_mf_dim_) { - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - stream); - } else { - heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - grad_value_size, - stream); - } + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); @@ -1089,33 +1159,17 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - if (!multi_mf_dim_) { - create_storage(dev_num, - i, - shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); - } else { - create_storage( - dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); - } + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - if (!multi_mf_dim_) { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - d_shard_grads_ptr); - } else { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), - grad_value_size); - } + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -1125,21 +1179,12 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - if (!multi_mf_dim_) { - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } else { - ptr_tables_[i]->rwlock_->WRLock(); - ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, + resource_->remote_stream(i, dev_num)); } for (int i = 0; i < total_device; ++i) { @@ -1162,11 +1207,12 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, 
size_t len) { if (len == 0) { return; } @@ -1302,9 +1348,12 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( +void HeterComm::update_one_table( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1323,9 +1372,12 @@ void HeterComm::update_one_table( cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( +void HeterComm::push_sparse_multi_node( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1352,8 +1404,11 @@ void HeterComm::push_sparse_multi_node( sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1454,8 +1509,11 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1525,8 +1583,11 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1547,8 +1608,10 @@ void HeterComm::end_pass() { } } -// template -// void HeterComm::dump_to_cpu(int index) { +// template +// void HeterComm::dump_to_cpu(int +// index) { // auto stream = resource_->local_stream(index, 0); // int dev_id = resource_->dev_id(index); // platform::CUDADeviceGuard guard(dev_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index fd0dd1a72cca1..ebf7e76527af0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, } } -template -__global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - size_t len, - size_t grad_value_size) { +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + size_t len, + size_t grad_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_shard_keys[i] = d_keys[idx[i]]; - *(GradType*)((char*)d_shard_grads + i * grad_value_size) = - *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); + float* shard_val = + (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + + feature_value_accessor.PushValueFill(cur, shard_val); } } +template __global__ void merge_gradients_kernel(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, char* output, int n, size_t grad_value_size, - DynamicGradMerger& merger_) { + DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - FeaturePushValue& out = 
*(FeaturePushValue*)(output + i * grad_value_size); - FeaturePushValue& in = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.update_one(out, in); + float* out = (float*)(output + i * grad_value_size); + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.update_one(out, in, feature_value_accessor); for (int j = 1; j < num; ++j) { ori_index = index[start + j]; - FeaturePushValue& rhs = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.merge_one(out, rhs); + in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.merge_one(out, in, feature_value_accessor); } } } -template -__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, - ValType* d_vals, +template +__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, + float* d_vals, T* idx, size_t len, - size_t val_size) { + size_t val_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { uint64_t new_offset = uint64_t(idx[i]) * val_size; - *(ValType*)((char*)d_vals + new_offset) = - *(ValType*)((char*)d_shard_vals + i * val_size); + float* cur = (float*)((char*)d_vals + new_offset); + float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); + int mf_dim = int( + shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); } } @@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template -void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - long long len, - size_t grad_value_size, - const StreamType& stream) { +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + long long len, + size_t grad_value_size, + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_shard_grads_kernel<<>>( @@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, d_grads, idx, c_len, - grad_value_size); + grad_value_size, + feature_value_accessor); } -template +template void HeterCommKernel::merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (n - 1) / block_size_ + 1; merge_gradients_kernel<<>>( - offset, fea_num, index, input, output, n, grad_value_size, merger_); + offset, + fea_num, + index, + input, + output, + n, + grad_value_size, + merger_, + feature_value_accessor); } -template -void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, +template +void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_dvals_kernel<<>>( - d_shard_vals, d_vals, idx, c_len, val_size); + d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); } template void HeterCommKernel::fill_idx( @@ 
-402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key( long long len, const cudaStream_t& stream); -template void HeterCommKernel::fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - const cudaStream_t& stream); +template void +HeterCommKernel::fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + const cudaStream_t& stream); template void HeterCommKernel::fill_dvals( @@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key< cudaStream_t stream, bool debug_synchronous); -template void HeterCommKernel::dy_mf_fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - size_t grad_value_size, - const cudaStream_t& stream); - -template void HeterCommKernel::merge_gradient( +template void +HeterCommKernel::dy_mf_fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + size_t grad_value_size, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void +HeterCommKernel::merge_gradient( const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient( int n, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel:: - dy_mf_fill_dvals( - paddle::framework::FeatureValue* d_shard_vals, - paddle::framework::FeatureValue* d_vals, + dy_mf_fill_dvals( + float* d_shard_vals, + float* d_vals, int* idx, long long len, size_t val_size, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index d1555dc2e0919..57f0aff4b6e56 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,25 +41,16 @@ struct DynamicGradMerger { return out; } - template - __device__ __forceinline__ void update_one(T& output, const T& input) { - output.slot = input.slot; - output.show = input.show; - output.clk = input.clk; - output.mf_dim = input.mf_dim; - output.lr_g = input.lr_g; - for (int i = 0; i < output.mf_dim; ++i) { - output.mf_g[i] = input.mf_g[i]; - } + template + __device__ __forceinline__ void update_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + feature_value_accessor.PushValueFill(output, input); } - template - __device__ __forceinline__ void merge_one(T& output, const T& input) { - output.show += input.show; - output.clk += input.clk; - output.lr_g += input.lr_g; - for (int i = 0; i < input.mf_dim; ++i) { - output.mf_g[i] += input.mf_g[i]; - } + + template + __device__ __forceinline__ void merge_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + 
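+    // Assumed behaviour of the accessor's MergePushValue (defined elsewhere
+    // in this patch): fold one push record into the running result by
+    // accumulating show/click, the lr gradient and the mf_dim embedx
+    // gradients of `input` into `output`, i.e. the same work as the removed
+    // field-wise merge_one, but driven by the accessor's dynamic float
+    // layout instead of the fixed FeaturePushValue struct.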
feature_value_accessor.MergePushValue(output, input); } }; @@ -146,19 +137,20 @@ class HeterCommKernel { bool debug_synchronous = false); template + typename StreamType, + typename FVAccessor> void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, + float* d_shard_grads, + float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template + template void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -167,15 +159,17 @@ class HeterCommKernel { int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template - void dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, + template + void dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); private: int block_size_{256}; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 82f5393c3660b..4eff4a8ad55b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -22,34 +22,43 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type_; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + optimizer_type_ = optimizer_type; } HeterPs::~HeterPs() {} void HeterPs::pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } @@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 005cbd401223d..b059690990370 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -22,80 +22,139 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); - opt_ = Optimizer(); +template +HeterPs::HeterPs( + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; + optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, - FeatureKey* d_keys, - FeatureValue* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, + size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, + FeatureKey* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { comm_->build_ps( num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { +template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, - FeatureKey* d_keys, - FeaturePushValue* d_grads, - size_t len) { - comm_->push_sparse(num, d_keys, d_grads, len, opt_); - // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); +template +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, + size_t len) { + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); 
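+      // optimizer_type_ mirrors the sparse table config: 1 = adagrad,
+      // 3 = adam, 4 = shared adam (one moment pair averaged across the
+      // embedding). Each branch hands a different Sgd type to the templated
+      // push_sparse, so dy_mf_update_value is resolved at compile time.
+      // For SparseAdamOptimizer, EmbedDim() is the optimizer state kept
+      // beside the lr embedding: 2 * dim plus the two beta_pow counters.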
+ VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad { + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), bug get optimizer type:" + << optimizer_type_; + } + } else { + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; + } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } +template +void HeterPs::set_accessor(FVAccessor& accessor) { + comm_->set_accessor(accessor); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7fee229738830..439f5d6c81854 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -26,24 +26,23 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} - HeterPs(size_t capacity, std::shared_ptr resource); + HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) override; - void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) override; void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase { void show_one_table(int gpu_num) override; void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) override; private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - Optimizer opt_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; + int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index acc984f14adaa..e45d1db71ccae 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -34,14 +34,8 @@ class HeterPsBase { virtual void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) = 0; - virtual void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) = 0; virtual void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,19 +50,25 @@ class HeterPsBase { const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; + #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; - static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource); + static HeterPsBase* get_instance( + size_t capacity, + std::shared_ptr resource, + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 88c3136dd77d1..05e252b2afe44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -82,20 +82,6 @@ class HBMMemoryPool : public managed { cudaMemset(mem_, 0, block_size_ * capacity); } - friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { - for (size_t k = 0; k < 5; k++) { - auto x = (FeatureValue*)(p.mem() + k * p.capacity()); - out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot - << " lr: " << x->lr << " 
mf_dim: " << x->mf_size - << " mf_size: " << x->mf_size << " mf:"; - for (int i = 0; i < x->mf_size + 1; ++i) { - out << " " << x->mf[i]; - } - out << "\n"; - } - return out; - } - char* mem() { return mem_; } size_t capacity() { return capacity_; } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 74a4f1ca16c2b..3a6f60fef858b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -27,134 +27,460 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) -template + class Optimizer { public: - Optimizer() {} + __host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) { + feature_value_accessor_ = feature_value_accessor; + } + __host__ ~Optimizer() {} + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. Please use dy_mf_update_value\n"); + } + + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) {} - ~Optimizer() {} + CommonFeatureValueAccessor feature_value_accessor_; - void initialize() {} + size_t _embedding_dim; + size_t _lr_embedding_dim; +}; + +class SparseAdagradOptimizer : public Optimizer { + public: + __host__ SparseAdagradOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, // NOLINT + const float* g, + float scale) { + float& g2sum = sgd[G2SumIndex()]; + double add_g2sum = 0; + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + w[i] += scaled_grad * ratio; + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / n; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = + (curand_uniform(&state)) * optimizer_config.mf_initial_range; + } + } + } else { + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim; } + __host__ __device__ size_t G2SumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { return 0; } +}; + +class SparseAdamOptimizer : public Optimizer { + public: + __host__ SparseAdamOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } __device__ void update_lr(const OptimizerConfig& optimizer_config, - float& w, // NOLINT - float& g2sum, - float g, // NOLINT + int n, + float* w, + float* sgd, + const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.learning_rate * - sqrt(optimizer_config.initial_g2sum / - (optimizer_config.initial_g2sum + g2sum)); - double scaled_grad = g / scale; + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); - w += scaled_grad * ratio; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; - if (w < 
optimizer_config.min_bound) w = optimizer_config.min_bound; - if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; - add_g2sum += scaled_grad * scaled_grad; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; - g2sum += add_g2sum; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, float* w, - float& g2sum, // NOLINT + float* sgd, const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.mf_learning_rate * - sqrt(optimizer_config.mf_initial_g2sum / - (optimizer_config.mf_initial_g2sum + g2sum)); + float* moment1 = sgd + EmbedxGSumIndex(); + float* moment2 = sgd + EmbedxG2SumIndex(); + float* beta1_pow = sgd + EmbedxBeta1PowIndex(); + float* beta2_pow = sgd + EmbedxBeta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); if (w[i] < optimizer_config.mf_min_bound) w[i] = optimizer_config.mf_min_bound; if (w[i] > optimizer_config.mf_max_bound) w[i] = optimizer_config.mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_value(const OptimizerConfig& optimizer_config, - ValType& val, // NOLINT - const GradType& grad) { - val.slot = grad.slot; - val.show += grad.show; - val.clk += grad.clk; - val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; - update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; - if (val.mf_size == 0) { + update_lr( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (val.show - val.clk) + - optimizer_config.clk_coeff * val.clk) { - val.mf_size = MF_DIM + 1; - val.mf[0] = 0; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { - val.mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - MF_DIM, - &val.mf[1], - val.mf[0], - grad.mf_g, - grad.show); + update_mf( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + // printf("EmbedxGIndex: %f, mf_gsum: %f, ", + // feature_value_accessor_.common_push_value.EmbedxGIndex(), + // ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]); + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim * 2 + 2; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim * 2 + 2; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() { + return GSumIndex() + _lr_embedding_dim; + } + __host__ __device__ size_t Beta1PowIndex() { + return G2SumIndex() + _lr_embedding_dim; + } + __host__ 
__device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } +}; + +class SparseAdamSharedOptimizer : public Optimizer { + public: + __host__ SparseAdamSharedOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, + const float* g, + float scale) { + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float moment1_ = *moment1; + float moment2_ = *moment2; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + + double sum_mom1 = 0.0; + double sum_mom2 = 0.0; + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1_ + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2_ + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + + sum_mom1 += new_moment1; + sum_mom2 += new_moment2; } + + (*moment1) = sum_mom1 / n; + (*moment2) = sum_mom2 / n; + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); } __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, - ValType* ptr, - const GradType& grad) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; - - update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - // use MF_DIM temporarily - // ptr->mf_dim = grad.mf_dim; - - if (ptr->mf_size == 0) { + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config.clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); - // ptr->mf_size = MF_DIM + 1; - ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - ptr->mf_dim, - &(ptr->mf[1]), - ptr->mf[0], - grad.mf_g, - grad.show); // for local test + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); } } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return 4; } + __host__ __device__ size_t EmbedxDim() { return 4; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() 
{ return GSumIndex() + 1; } + __host__ __device__ size_t Beta1PowIndex() { return G2SumIndex() + 1; } + __host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 0db72992215a2..2db259941c873 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -27,13 +27,19 @@ class OptimizerConfig { float learning_rate = 0.05; float initial_g2sum = 3.0; float initial_range = 0; + float beta1_decay_rate = 0.9; // adam + float beta2_decay_rate = 0.999; // adam + float ada_epsilon = 1e-8; float mf_create_thresholds = 10; float mf_learning_rate = 0.05; float mf_initial_g2sum = 3.0; float mf_initial_range = 1e-4; + float mf_beta1_decay_rate = 0.9; // adam + float mf_beta2_decay_rate = 0.999; // adam float mf_min_bound = -10; float mf_max_bound = 10; + float mf_ada_epsilon = 1e-8; void set_sparse_sgd(float nonclk_coeff, float clk_coeff, @@ -41,7 +47,10 @@ class OptimizerConfig { float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { this->nonclk_coeff = nonclk_coeff; this->clk_coeff = clk_coeff; this->min_bound = min_bound; @@ -49,6 +58,9 @@ class OptimizerConfig { this->learning_rate = learning_rate; this->initial_g2sum = initial_g2sum; this->initial_range = initial_range; + this->beta1_decay_rate = beta1_decay_rate; + this->beta2_decay_rate = beta2_decay_rate; + this->ada_epsilon = ada_epsilon; } void set_sparse_sgd(const OptimizerConfig& optimizer_config) { @@ -59,6 +71,9 @@ class OptimizerConfig { this->learning_rate = optimizer_config.learning_rate; this->initial_g2sum = optimizer_config.initial_g2sum; this->initial_range = optimizer_config.initial_range; + this->beta1_decay_rate = optimizer_config.beta1_decay_rate; + this->beta2_decay_rate = optimizer_config.beta2_decay_rate; + this->ada_epsilon = optimizer_config.ada_epsilon; } void set_embedx_sgd(float mf_create_thresholds, @@ -66,13 +81,19 @@ class OptimizerConfig { float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { this->mf_create_thresholds = mf_create_thresholds; this->mf_learning_rate = mf_learning_rate; this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_range = mf_initial_range; this->mf_min_bound = mf_min_bound; this->mf_max_bound = mf_max_bound; + this->mf_beta1_decay_rate = mf_beta1_decay_rate; + this->mf_beta2_decay_rate = mf_beta2_decay_rate; + this->mf_ada_epsilon = mf_ada_epsilon; } void set_embedx_sgd(const OptimizerConfig& optimizer_config) { @@ -82,6 +103,9 @@ class OptimizerConfig { this->mf_initial_range = optimizer_config.mf_initial_range; this->mf_min_bound = optimizer_config.mf_min_bound; this->mf_max_bound = optimizer_config.mf_max_bound; + this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate; + this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate; + this->mf_ada_epsilon = 
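Note on the optimizer state introduced above: the index helpers (GSumIndex, G2SumIndex, Beta1PowIndex, Beta2PowIndex) imply a four-float block per embedding weight holding the first moment, the second moment, and the running beta decay powers, and the fixed four-float embedx block suggests the shared variant keeps one such state for the whole embedx vector. The sketch below is a plain host-side illustration of a standard Adam step driven by the new OptimizerConfig fields (beta1_decay_rate, beta2_decay_rate, ada_epsilon, bounds); it is an assumption about the update, not the actual update_value_work kernel from this patch, which is defined elsewhere and also handles show-based scaling.

import math

def adam_step(w, state, grad, lr=0.05, beta1=0.9, beta2=0.999, eps=1e-8,
              min_bound=-10.0, max_bound=10.0):
    # state layout assumed from the index helpers above:
    # [gsum (1st moment), g2sum (2nd moment), beta1_pow, beta2_pow]
    gsum, g2sum, beta1_pow, beta2_pow = state
    gsum = beta1 * gsum + (1.0 - beta1) * grad
    g2sum = beta2 * g2sum + (1.0 - beta2) * grad * grad
    m_hat = gsum / (1.0 - beta1_pow)        # bias correction via stored powers
    v_hat = g2sum / (1.0 - beta2_pow)
    w -= lr * m_hat / (math.sqrt(v_hat) + eps)
    w = max(min_bound, min(max_bound, w))   # clip like min_bound / max_bound
    beta1_pow *= beta1                      # advance the decay powers
    beta2_pow *= beta2
    return w, [gsum, g2sum, beta1_pow, beta2_pow]

Consistent with the code above, the beta powers are seeded with beta1_decay_rate and beta2_decay_rate when the mf block is first created, which is what the first-step bias correction expects.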
optimizer_config.mf_ada_epsilon; } }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 94fa386aac488..d9bb6e946f42d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -33,9 +33,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) -#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif @@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); const std::deque& vec_data = input_channel->GetData(); @@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; - MultiSlotDataset* dataset = dynamic_cast(dataset_); + MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); const std::deque& vec_data = input_channel->GetData(); @@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { &device_vals, &device_task_keys, &device_task_ptrs](int dev, int shard_id) { - auto& task_keys = device_task_keys[shard_id]; + // auto& task_keys = device_task_keys[shard_id]; #ifdef PADDLE_WITH_PSLIB auto& task_ptrs = device_task_ptrs[shard_id]; #endif -#ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[shard_id]; -#endif + // #ifdef PADDLE_WITH_PSCORE + // auto& task_ptrs = device_task_ptrs[shard_id]; + // #endif - int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; - int cur = prefix_sum[dev][shard_id]; + // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + // int cur = prefix_sum[dev][shard_id]; #ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; @@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif -#ifdef PADDLE_WITH_PSCORE - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - val.delta_score = ptr_val[2]; - val.show = ptr_val[3]; - val.clk = ptr_val[4]; - val.slot = ptr_val[0]; - val.lr = ptr_val[5]; - val.lr_g2sum = ptr_val[6]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } - } - } -#endif VLOG(3) << "GpuPs build hbmps done"; }; @@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); - HeterPs_ = HeterPsBase::get_instance(size_max, resource_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + HeterPs_ = HeterPsBase::get_instance( + size_max, 
resource_, fleet_config_, accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); + HeterPs_->set_sparse_sgd(optimizer_config_); + HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { this->mem_pools_[i * this->multi_mf_dim_ + j] = new MemoryPool(len, feature_value_size); }; - auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; platform::CUDADeviceGuard guard(resource_->dev_id(i)); @@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { delete mem_pool; }; int thread_num = 16; - auto build_dynamic_mf_func = [this, &gpu_task, thread_num]( - int i, int j, int z) { + auto build_dynamic_mf_func = [this, + &gpu_task, + thread_num, + &accessor_wrapper_ptr](int i, int j, int z) { // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - // size_t feature_value_size = - // TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { // ============ add for multi-thread ================ for (size_t k = left; k < right; k++) { - FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); +#ifdef PADDLE_WITH_PSLIB + float* val = (float*)(mem_pool->mem_address(k)); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); -#ifdef PADDLE_WITH_PSLIB val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::delta_score_index()]; @@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; -#endif -#ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor accessor; - val->delta_score = - ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; - val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; - val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; - val->slot = 
int(ptr_val[accessor.common_feature_value.SlotIndex()]); - val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; - val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; - - val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); - - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor - ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); - val->mf_dim = mf_dim; -#endif if (dim > 8) { // CpuPS alreay expand as mf_dim val->mf_size = mf_dim + 1; for (int x = 0; x < val->mf_dim + 1; x++) { @@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { val->mf[x] = 0; } } +#endif +#ifdef PADDLE_WITH_PSCORE + void* val = mem_pool->mem_address(k); + accessor_wrapper_ptr->BuildFill( + val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); +#endif } }; @@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() { } } int thread_num = 8; - auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( + int i, int j, int z) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; auto& device_keys = this->current_task_->device_dim_keys_[i][j]; @@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() { } // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; char* test_build_values = (char*)malloc(feature_value_size * real_len); uint64_t offset = left * feature_value_size; cudaMemcpy(test_build_values, @@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() { continue; } size_t local_offset = (i - left) * feature_value_size; - FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); @@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() { embed_g2sum_index()] = gpu_val->lr_g2sum; cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: slot_index()] = gpu_val->slot; -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - - paddle::distributed::CtrDymfAccessor accessor; - cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = - gpu_val->delta_score; - cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; - cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; - cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; - cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = - gpu_val->lr_g2sum; - cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; -#endif if (gpu_val->mf_size > 0) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) { cpu_val[x + 8] = 
gpu_val->mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); +#endif } free(test_build_values); }; @@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - platform::Timer all_timer; - platform::Timer pull_gpups_timer; - all_timer.Start(); - int64_t total_length = - std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - VLOG(3) << "Begine Gpu/Xpu Ps PullSparse"; - auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); - if (platform::is_cpu_place(place)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Warning:: CPUPlace is not supported in GpuPs now.")); - } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA - VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; - uint64_t* total_keys = reinterpret_cast( - total_keys_tensor.mutable_data({total_length, 1}, place)); - - // construct slot_level lod info - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); - auto buf_length = - memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, - keys.data(), - keys.size() * sizeof(uint64_t*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - - this->CopyKeys(place, - gpu_keys, - total_keys, - gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index - << " len: " << total_length; - pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, - total_keys, - total_values_gpu, - static_cast(total_length)); - pull_gpups_timer.Pause(); - - VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length - << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); - } - all_timer.Pause(); - VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() - << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() - << " s"; - VLOG(3) << "End PullSparse"; + VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. 
This " + "PullSparse is not used."; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); size_t feature_value_size = 0; - feature_value_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ + << " feature_value_size:" << feature_value_size; #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * feature_value_size); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + float* total_values_gpu = reinterpret_cast(buf->ptr()); #endif #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begine Xpu Ps PullSparse"; @@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length, - gpu_dim); + accessor_wrapper_ptr->CopyForPull(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + gpu_dim, + val_type_size_); pull_gpups_timer.Pause(); -#endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; @@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - xpu_keys, - values, - total_values_gpu, - xpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull(place, + xpu_keys, + values, + total_values_gpu, + xpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + val_type_size_); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; - FeaturePushValue* total_grad_values_gpu = - reinterpret_cast(buf->ptr()); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + << "grad_value_size:" << grad_value_size; + float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - if (!multi_mf_dim_) { - this->CopyForPush(place, - 
grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); - } else { - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - total_length, - batch_size, - grad_value_size); - } + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector_, + slot_mf_dim_vector_); VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + hidden_size, + total_length, + batch_size, + slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 734765fa95423..f8624f48d08f3 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -26,90 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int hidden, - int slot_num, - int total_len, - uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int slot_num, - int total_len, - uint64_t** keys, - uint64_t max_val_size, - int* gpu_dim) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - FeatureValue* feature_value_ptr = - (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; - *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; - *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; - } - if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; - } - } - } -} - __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, @@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest, } } -__global__ void PushCopyWithPool(FeaturePushValue* dest, - float** src, - int64_t* len, - int slot_num, - uint64_t total_len, - int bs, - int* slot_vector, - int* mf_dim_vector, - size_t grad_value_size) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - FeaturePushValue* cur = - (FeaturePushValue*)((char*)dest + i * grad_value_size); - cur->slot = slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur->mf_dim = mf_dim; - cur->show = *(src[x] + y * (mf_dim + 3)); - cur->clk = *(src[x] + y * (mf_dim + 3) + 1); - cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < cur->mf_dim; j++) { - cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
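The copy kernels above (the removed PullCopy / PushCopyWithPool and, presumably, their accessor-wrapper replacements) all recover the owning slot of a flat element the same way: `len` holds inclusive prefix sums of the per-slot lengths built on the host as `slot_lengths_lod`, and the low/high loop is a binary search over it. A small sketch of that index math, assuming nothing beyond what the loop visibly does:

import bisect

def slot_and_offset(i, prefix_len):
    # first slot whose inclusive prefix sum exceeds i, as in the low/high loop
    x = bisect.bisect_right(prefix_len, i)
    y = i - (prefix_len[x - 1] if x else 0)  # offset of element i inside slot x
    return x, y

slot_lengths = [3, 0, 5]                     # example slot sizes
prefix = []
total = 0
for n in slot_lengths:
    total += n
    prefix.append(total)                     # -> [3, 3, 8]
assert slot_and_offset(0, prefix) == (0, 0)
assert slot_and_offset(3, prefix) == (2, 0)  # empty slot 1 is skipped
assert slot_and_offset(4, prefix) == (2, 1)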
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - hidden_size, - slot_num, - total_length, - gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - slot_num, - total_length, - gpu_keys, - val_type_size_, - gpu_dim); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, @@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - hidden_size, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& 
place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, - slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector, - d_mf_dim_vector, - grad_value_size); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { - OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, - clk_coeff, - min_bound, - max_bound, - learning_rate, - initial_g2sum, - initial_range); - HeterPs_->set_sparse_sgd(optimizer_config); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { + optimizer_config_.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, @@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { - OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, - mf_learning_rate, - mf_initial_g2sum, - mf_initial_range, - mf_min_bound, - mf_max_bound); - HeterPs_->set_embedx_sgd(optimizer_config); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { + optimizer_config_.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 65f86acce9151..0d1669a42b1e9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ 
-51,7 +51,10 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #endif #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" @@ -64,9 +67,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -98,7 +98,7 @@ class AfsWrapper { class PSGPUWrapper { public: - virtual ~PSGPUWrapper(); + ~PSGPUWrapper(); PSGPUWrapper() { HeterPs_ = NULL; @@ -139,37 +139,6 @@ class PSGPUWrapper { const int64_t* gpu_len, int slot_num, int total_len); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -274,13 +243,96 @@ class PSGPUWrapper { float max_bound, float learning_rate, float initial_g2sum, - float initial_range); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon); void SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon); + +#ifdef PADDLE_WITH_PSCORE + void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "SparseNaiveSGDRule") { + config[prefix + "optimizer_type"] = 0; + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 1; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = 
sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "StdAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 2; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdamSGDRule") { + config[prefix + "optimizer_type"] = 3; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { + config[prefix + "optimizer_type"] = 4; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } + + void InitializeGPUServer(paddle::distributed::PSParameter ps_param) { + auto sparse_table = + ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.ctr_accessor_param(); + accessor_class_ = sparse_table_accessor.accessor_class(); + + std::unordered_map config; + config["embedx_dim"] = sparse_table_accessor.embedx_dim(); + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + + if (accessor_class_ == "CtrDymfAccessor") { + // optimizer config for embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer( + config, sparse_table_accessor.embedx_sgd_param(), "mf_"); + } + + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); + InitializeGPUServer(config); + } +#endif + void InitializeGPUServer(std::unordered_map config) { float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) ? 1.0 @@ -288,54 +340,83 @@ class PSGPUWrapper { float clk_coeff = (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"]; float min_bound = (config.find("min_bound") == config.end()) - ? -10000.0 + ? -10.0 : config["min_bound"]; - float max_bound = (config.find("max_bound") == config.end()) - ? 10000.0 - : config["max_bound"]; + float max_bound = + (config.find("max_bound") == config.end()) ? 
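As a reading aid for add_sparse_optimizer above (a paraphrase, not code from the patch): each CPU-side SGD rule name selects a GPU optimizer_type code and forwards a specific set of hyperparameters into the config map, once without a prefix for embed_w and once with the "mf_" prefix for embedx.

RULE_TO_TYPE = {
    "SparseNaiveSGDRule": 0,
    "SparseAdaGradSGDRule": 1,
    "StdAdaGradSGDRule": 2,
    "SparseAdamSGDRule": 3,
    "SparseSharedAdamSGDRule": 4,
}

def forwarded_keys(rule_name, prefix=""):
    # every rule forwards learning_rate, initial_range and the weight bounds
    keys = ["learning_rate", "initial_range", "min_bound", "max_bound"]
    if rule_name in ("SparseAdaGradSGDRule", "StdAdaGradSGDRule"):
        keys.append("initial_g2sum")
    elif rule_name in ("SparseAdamSGDRule", "SparseSharedAdamSGDRule"):
        keys += ["beta1_decay_rate", "beta2_decay_rate", "ada_epsilon"]
    return RULE_TO_TYPE[rule_name], [prefix + k for k in keys]

# embed_w uses no prefix, embedx uses "mf_", mirroring the two calls in
# InitializeGPUServer(PSParameter).
print(forwarded_keys("SparseSharedAdamSGDRule", prefix="mf_"))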
10.0 : config["max_bound"]; float learning_rate = (config.find("learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["learning_rate"]; float initial_g2sum = (config.find("initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["initial_g2sum"]; float initial_range = (config.find("initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["initial_range"]; - + float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end()) + ? 0.9 + : config["beta1_decay_rate"]; + float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end()) + ? 0.999 + : config["beta2_decay_rate"]; + float ada_epsilon = (config.find("ada_epsilon") == config.end()) + ? 1e-8 + : config["ada_epsilon"]; // mf config settings float mf_create_thresholds = (config.find("mf_create_thresholds") == config.end()) ? static_cast(1.0) : config["mf_create_thresholds"]; float mf_learning_rate = (config.find("mf_learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["mf_learning_rate"]; float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["mf_initial_g2sum"]; float mf_initial_range = (config.find("mf_initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["mf_initial_range"]; float mf_min_bound = (config.find("mf_min_bound") == config.end()) - ? 1.0 + ? -10.0 : config["mf_min_bound"]; float mf_max_bound = (config.find("mf_max_bound") == config.end()) - ? 1.0 + ? 10.0 : config["mf_max_bound"]; + float mf_beta1_decay_rate = + (config.find("mf_beta1_decay_rate") == config.end()) + ? 0.9 + : config["mf_beta1_decay_rate"]; + float mf_beta2_decay_rate = + (config.find("mf_beta2_decay_rate") == config.end()) + ? 0.999 + : config["mf_beta2_decay_rate"]; + float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end()) + ? 1e-8 + : config["mf_ada_epsilon"]; this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, - initial_range); + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, mf_initial_range, mf_min_bound, - mf_max_bound); + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); + + // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam) + optimizer_type_ = (config.find("optimizer_type") == config.end()) + ? 
1 + : static_cast(config["optimizer_type"]); } void SetDate(int year, int month, int day) { @@ -380,7 +461,7 @@ class PSGPUWrapper { if (slot_info_initialized_) { return; } - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto slots_vec = dataset->GetSlots(); slot_offset_vector_.clear(); for (auto& slot : slot_vector_) { @@ -421,10 +502,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - grad_type_size_ = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + << " grad_type_size_:" << grad_type_size_; slot_info_initialized_ = true; } #endif @@ -445,6 +529,12 @@ class PSGPUWrapper { const std::string& conf); #endif +#ifdef PADDLE_WITH_PSCORE + void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { + cpu_table_accessor_ = accessor; + } +#endif + private: static std::shared_ptr s_instance_; Dataset* dataset_; @@ -497,6 +587,12 @@ class PSGPUWrapper { int day_; bool slot_info_initialized_ = false; int use_afs_api_ = 0; + int optimizer_type_ = 1; + std::string accessor_class_; + std::unordered_map fleet_config_; +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::ValueAccessor* cpu_table_accessor_; +#endif #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; @@ -521,6 +617,7 @@ class PSGPUWrapper { bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + OptimizerConfig optimizer_config_; protected: static bool is_initialized_; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index ef6c70e624d4c..3505bff72e90a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,13 @@ limitations under the License. 
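For illustration only, a hypothetical config map for the unordered_map overload of InitializeGPUServer above (values here are made up for the example); any key left out falls back to the defaults read in the code, e.g. learning_rate 0.05, initial_g2sum 3.0, initial_range 1e-4, beta decay rates 0.9 / 0.999, ada_epsilon 1e-8, bounds +/-10, and optimizer_type 1 (adagrad).

gpups_adam_config = {
    "nonclk_coeff": 0.1,            # illustrative value, not a default
    "clk_coeff": 1.0,
    "mf_create_thresholds": 10.0,
    "optimizer_type": 3,            # 3 = SparseAdamSGDRule per add_sparse_optimizer
    "learning_rate": 0.05,
    "beta1_decay_rate": 0.9,
    "beta2_decay_rate": 0.999,
    "ada_epsilon": 1e-8,
    "mf_learning_rate": 0.05,
    "mf_beta1_decay_rate": 0.9,
    "mf_beta2_decay_rate": 0.999,
    "mf_ada_epsilon": 1e-8,
}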
*/ namespace paddle { namespace framework { -__global__ void PullCopy(float* dest, const FeatureValue* src, - const long long* len, int hidden, int slot_num, - int total_len, unsigned long long* keys) { +__global__ void PullCopy(float* dest, + const FeatureValue* src, + const long long* len, + int hidden, + int slot_num, + int total_len, + unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, GM2LM(len, local_len, slot_num * sizeof(int64_t)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(keys, local_keys, - slot_num * sizeof(__global_ptr__ unsigned long long*)); + GM2LM( + keys, local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); __global_ptr__ float* local_dest[slot_num]; GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); @@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, // copy read_len (length) of slots' val to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src + dest_len + k, local_slot_vals, + GM2LM(src + dest_len + k, + local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(local_keys[i] + k, local_slot_keys, - real_read_len * sizeof(uint64_t)); + GM2LM( + local_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, local_dest[i] + k * hidden, + LM2GM(local_dest_vals, + local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } @@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, __global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, - const long long* len, int slot_num, + const long long* len, + int slot_num, int total_len) { int cid = core_id(); int ncores = core_num(); @@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, GM2LM(len, local_len, slot_num * sizeof(long long)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(src_keys, local_keys, + GM2LM(src_keys, + local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); for (int i = thread_id; i < slot_num; i += nthreads) { @@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_keys[i] + k, local_slot_keys, + GM2LM(local_keys[i] + k, + local_slot_keys, real_read_len * sizeof(unsigned long long)); - LM2GM(local_slot_keys, dest_total_keys + dest_len + k, + LM2GM(local_slot_keys, + dest_total_keys + dest_len + k, real_read_len * sizeof(unsigned long long)); } } } -__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, - int hidden, int slot_num, int total_len, int bs, +__global__ void PushCopy(FeaturePushValue* dest, + float* src, + long long* len, + int hidden, + int slot_num, + int total_len, + int bs, int* slot_vector) { int cid = core_id(); int ncores = core_num(); @@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_src[i] + k * hidden, 
local_slot_grads, + GM2LM(local_src[i] + k * hidden, + local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, local_slot_grads[j * hidden + 3 + m] * -1. * bs; } } - LM2GM(local_dest_grads, dest + dest_len + k, + LM2GM(local_dest_grads, + dest + dest_len + k, real_read_len * sizeof(FeaturePushValue)); } } @@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - // float* buf_value = nullptr; - // xpu_malloc(reinterpret_cast(&buf_value), - // values.size() * sizeof(float*)); - // float** gpu_values = reinterpret_cast(&buf_value); - float* gpu_values = nullptr; - xpu_malloc(reinterpret_cast(&gpu_values), - values.size() * sizeof(float*)); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); - - // unsigned long long** c_keys = (unsigned long long**)gpu_keys; - unsigned long long* c_keys = reinterpret_cast(gpu_keys); - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); - - xpu_wait(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, - uint64_t** origin_keys, uint64_t* total_keys, - const int64_t* gpu_len, int slot_num, + uint64_t** origin_keys, + uint64_t* total_keys, + const int64_t* gpu_len, + int slot_num, int total_len) { XPUStream stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - float* gpu_values = nullptr; - int64_t* gpu_len = nullptr; - int* d_slot_vector = nullptr; - - xpu_malloc(reinterpret_cast(&gpu_values), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&gpu_len), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&d_slot_vector), - slot_lengths_lod.size() * sizeof(int)); - - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, 
stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} - -void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, - float min_bound, float max_bound, - float learning_rate, float initial_g2sum, - float initial_range) { +void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, + float clk_coeff, + float min_bound, + float max_bound, + float learning_rate, + float initial_g2sum, + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, - learning_rate, initial_g2sum, initial_range); + optimizer_config.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, - float mf_learning_rate, float mf_initial_g2sum, - float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_learning_rate, + float mf_initial_g2sum, + float mf_initial_range, + float mf_min_bound, + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, - mf_initial_g2sum, mf_initial_range, - mf_min_bound, mf_max_bound); + optimizer_config.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 902854a7c7279..c58b539b6877d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -594,6 +594,21 @@ def sparse_optimizer_config(sgd, strategy, prefix): bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) sgd.adam.weight_bounds.extend(bounds) + elif optimizer_name == "shared_adam": + sgd.name = 'SparseSharedAdamSGDRule' + sgd.adam.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.001) + sgd.adam.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4) + sgd.adam.beta1_decay_rate = strategy.get( + prefix + 'sparse_beta1_decay_rate', 0.9) + sgd.adam.beta2_decay_rate = strategy.get( + prefix + 'sparse_beta2_decay_rate', 0.999) + sgd.adam.ada_epsilon = strategy.get( + prefix + 'sparse_ada_epsilon', 1e-8) + bounds = strategy.get(prefix + 'sparse_weight_bounds', + [-10, 10]) + sgd.adam.weight_bounds.extend(bounds) def set_sparse_table_config(table_data, config): for key in config: diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6ba48e5e32b5..7d240983a1c28 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -195,7 +195,7 @@ def _set(self, accessor_proto, varname, program_id, context): sgd_param.naive.initial_range = 0.0001 if len(sgd_param.naive.weight_bounds) == 0: sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": + if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == 
"SparseSharedAdamSGDRule": if not sgd_param.adam.HasField("learning_rate"): sgd_param.adam.learning_rate = 0.001 if not sgd_param.adam.HasField("initial_range"): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py new file mode 100644 index 0000000000000..c5ae2365b07cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -0,0 +1,201 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ["WITH_DISTRIBUTE"] = "ON" + +import unittest +import tempfile +import shutil + +import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 +batch_size = 4 + + +# this unittest is tested for SparseSharedAdamSGDRule +class TestPSPassWithBow(unittest.TestCase): + + def net(self): + + def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div(cond_3, + fluid.layers.fill_constant( + shape=[1], + value=batch_size * 1.0, + dtype='float64'), + name="simnet_acc") + return acc + + def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like(input=cos_q_pt, + shape=[-1, 1], + value=margin, + dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like(input=loss_op2, + shape=[-1, 1], + value=0.0, + dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + is_distributed = False + is_sparse = True + + # query + q = fluid.layers.data(name="query_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + q_emb = fluid.contrib.layers.sparse_embedding( + input=q, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + # vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + # fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data(name="pos_title_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + pt_emb = fluid.contrib.layers.sparse_embedding( + input=pt, + size=[dict_dim, emb_dim], + 
param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + # vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + # fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data(name="neg_title_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + nt_emb = fluid.contrib.layers.sparse_embedding( + input=nt, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + # vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + # fc layer + nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + def test(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001" + os.environ["TRAINING_ROLE"] = "PSERVER" + + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + loss, acc, _ = self.net() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True + + configs = {} + configs['__emb__'] = { + "table_parameters.__emb__.accessor.embed_sgd_param.name": + "SparseSharedAdamSGDRule", + "table_parameters.__emb__.accessor.embedx_sgd_param.name": + "SparseSharedAdamSGDRule", + } + strategy.sparse_table_configs = configs + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(loss) + + fleet.init_server() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 455a7a30cfd18..9ac88c802111f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -334,6 +334,14 @@ def test_fleet_desc_configs(self): strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad. initial_range, 0.0001) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = {"sparse_optimizer": "shared_adam"} + strategy.fleet_desc_configs = configs + self.assertEqual( + strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam. 
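
The rule can also be chosen per table through strategy.sparse_table_configs with fully qualified accessor keys, as test_dist_fleet_ps13 above does; a trimmed sketch (illustrative only, paths assume a sparse table named __emb__):

# Sketch: per-table accessor selection, mirroring the unit test above.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
strategy.sparse_table_configs = {
    '__emb__': {
        "table_parameters.__emb__.accessor.embed_sgd_param.name":
            "SparseSharedAdamSGDRule",
        "table_parameters.__emb__.accessor.embedx_sgd_param.name":
            "SparseSharedAdamSGDRule",
    }
}
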
+ beta1_decay_rate, 0.9) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = { diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 559f2d95b915f..53ab93f57ce56 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -671,7 +671,8 @@ 'test_trt_convert_reduce_sum', 'save_quant2_model_lstm', 'test_trt_convert_slice', - 'test_quant2_int8_lstm_mkldnn' + 'test_quant2_int8_lstm_mkldnn', + 'test_dist_fleet_ps13' ] # mem=0 but always timeout or failed : It run 15 job each time in Single cases; From c99c70cb2bbcf9deb0ab11d44ee2d98c9d378ae8 Mon Sep 17 00:00:00 2001 From: lyq <30404405+affectionlu@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:33:03 +0800 Subject: [PATCH 02/12] [Phi] migrate sync_batch_norm to phi (#44369) --- paddle/fluid/operators/inplace_abn_op.cu | 130 ++-- paddle/fluid/operators/sync_batch_norm_op.cu | 137 ---- .../fluid/operators/sync_batch_norm_op.cu.h | 637 ------------------ paddle/phi/api/yaml/legacy_api.yaml | 10 + paddle/phi/api/yaml/legacy_backward.yaml | 12 + .../gpu/sync_batch_norm_grad_kernel.cu | 75 +++ .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 190 ++++++ .../phi/kernels/gpu/sync_batch_norm_utils.h | 493 ++++++++++++++ .../phi/kernels/sync_batch_norm_grad_kernel.h | 45 ++ paddle/phi/kernels/sync_batch_norm_kernel.h | 43 ++ paddle/phi/ops/compat/sync_batch_norm_sig.cc | 67 ++ python/paddle/nn/layer/norm.py | 11 +- 12 files changed, 1027 insertions(+), 823 deletions(-) delete mode 100644 paddle/fluid/operators/sync_batch_norm_op.cu delete mode 100644 paddle/fluid/operators/sync_batch_norm_op.cu.h create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_utils.h create mode 100644 paddle/phi/kernels/sync_batch_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/sync_batch_norm_kernel.h create mode 100644 paddle/phi/ops/compat/sync_batch_norm_sig.cc diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 71e21a2edd47b..a74150a330672 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -13,17 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/inplace_abn_op.h" +#include #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" #include "paddle/phi/kernels/batch_norm_grad_kernel.h" #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" +#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/sync_batch_norm_kernel.h" namespace paddle { namespace operators { template -class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,29 +38,49 @@ class InplaceABNKernel GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + if (ctx.Attr("use_sync_bn")) { - SyncBatchNormKernel::Compute(ctx); + auto& dev_ctx = ctx.device_context(); + phi::SyncBatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + *scale, + *bias, + *mean, + *variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + y, + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); } else { - // BatchNormKernel::Compute(ctx); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - auto& dev_ctx = ctx.device_context(); phi::BatchNormKernel( static_cast -class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -115,29 +136,44 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto 
data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + if (ctx.Attr("use_sync_bn")) { - SyncBatchNormGradKernel::Compute(ctx); + auto& dev_ctx = ctx.device_context(); + phi::SyncBatchNormGradFunctor( + static_cast::TYPE&>(dev_ctx), + nullptr, + y, + *scale, + *bias, + *saved_mean, + *saved_variance, + *d_y, + epsilon, + data_layout, + d_x, + scale_grad, + bias_grad); } else { - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - - auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt; paddle::optional mean_opt; paddle::optional variance_opt; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu deleted file mode 100644 index 637064398e177..0000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" - -namespace paddle { -namespace operators { - -template -class SyncBatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - double epsilon = static_cast(ctx.Attr("epsilon")); - const float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = framework::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. 
Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - - // moving mean/variance - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_inv_variance = ctx.Output("SavedVariance"); - - bool test_mode = is_test && (!trainable_stats); - SyncBatchNormFunctor(ctx, - layout, - x, - y, - est_mean, - est_var, - mean_out, - variance_out, - saved_mean, - saved_inv_variance, - epsilon, - momentum, - test_mode, - use_global_stats); - } -}; - -template -class SyncBatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string layout_str = ctx.Attr("data_layout"); - - const DataLayout layout = framework::StringToDataLayout(layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_var = ctx.Input("SavedVariance"); - - SyncBatchNormGradFunctor(ctx, - layout, - scale, - bias, - d_x, - d_y, - d_scale, - d_bias, - saved_mean, - saved_inv_var, - epsilon); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel); -#endif - -// clang-format on diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h deleted file mode 100644 index 47de27e876922..0000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ /dev/null @@ -1,637 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -__global__ void KeLocalStats( - const T *x, int N, int M, int C, BatchNormParamType *mean_var) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int k = blockIdx.x; k < C; k += gridDim.x) { - BatchNormParamType x_sum = 0.; - BatchNormParamType x2_sum = 0.; - for (int i = threadIdx.x; i < N * M; i += BlockDim) { - int id = layout == framework::DataLayout::kNCHW - ? (i / M) * C * M + k * M + i % M - : i * C + k; - auto x_in = static_cast>(x[id]); - x_sum += x_in; - x2_sum += x_in * x_in; - } - __syncthreads(); - auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - mean_var[k] = out / (N * M); - } - out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - mean_var[k + C] = out / (N * M); - } - } - if (blockIdx.x == 0 && threadIdx.x == 0) { - mean_var[2 * C] = static_cast>(1.0); - } -} - -template -__global__ void KeSyncAndMovingStats(BatchNormParamType *means, - BatchNormParamType *variances, - BatchNormParamType *num_dev, - const int C, - const BatchNormParamType momentum, - const double epsilon, - BatchNormParamType *sv_mean_data, - BatchNormParamType *sv_inv_var_data, - BatchNormParamType *moving_means, - BatchNormParamType *moving_variances) { - // sync stats across multi-devices - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < C; i += stride) { - auto mean = means[i] / (*num_dev); - auto var = variances[i] / (*num_dev); - var = var - mean * mean; - - // sync stats - sv_mean_data[i] = mean; - sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); - variances[i] = var; - - // moving stats - moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); - moving_variances[i] = - moving_variances[i] * momentum + var * (1. - momentum); - } -} - -template -static __global__ void KeNormAffine(const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - const double epsilon, - const int C, - const int M, - const int num, - T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - auto x_i = static_cast>(x[i]); - auto y_i = - (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; - y[i] = static_cast(y_i); - } -} - -template -void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, - const DataLayout layout, - const framework::Tensor *x, - framework::Tensor *y, - const framework::Tensor *mean, - const framework::Tensor *variance, - framework::Tensor *mean_out, - framework::Tensor *variance_out, - framework::Tensor *saved_mean, - framework::Tensor *saved_variance, - double epsilon, - const float momentum, - const bool is_test, - const bool use_global_stats - -) { - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input dim size should be larger than 1.")); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "The Input dim size should be less than 6.")); - int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - int x_numel = x->numel(); - - const T *x_d = x->data(); - const auto *s_d = ctx.Input("Scale")->data>(); - const auto *b_d = ctx.Input("Bias")->data>(); - - T *y_d = y->mutable_data(ctx.GetPlace()); - - const BatchNormParamType *mean_data = nullptr; - const BatchNormParamType *var_data = nullptr; - - auto &dev_ctx = ctx.cuda_device_context(); - auto stream = dev_ctx.stream(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - - paddle::memory::AllocationPtr alloc_ptr{nullptr}; - - if (is_test) { - mean_data = mean->data>(); - var_data = variance->data>(); - } else { - // x, x^2, 1, here 1 is used to calc device num - // device num also can be got from platform::DeviceContextPool - const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - alloc_ptr = memory::Alloc(dev_ctx, bytes); - - auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); - const int threads = 256; - int grid = std::min(C, (max_threads + threads - 1) / threads); - if (layout == framework::DataLayout::kNCHW) { - KeLocalStats - <<>>(x_d, N, H * W * D, C, stats); - } else { - KeLocalStats - <<>>(x_d, N, H * W * D, C, stats); - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *comm = dev_ctx.nccl_comm(); - if (comm) { - int dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - // In-place operation - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(stats, - stats, - 2 * C + 1, - static_cast(dtype), - ncclSum, - comm, - stream)); - } -#endif - - auto *est_mean_data = - mean_out->mutable_data>(ctx.GetPlace()); - auto *est_var_data = - variance_out->mutable_data>(ctx.GetPlace()); - - auto *sv_mean_data = - saved_mean->mutable_data>(ctx.GetPlace()); - auto *sv_inv_var_data = - saved_variance->mutable_data>(ctx.GetPlace()); - - // Note, Input('Mean')/Input('Variance') share variable with - // Output('MeanOut')/Output('VarianceOut') - KeSyncAndMovingStats - <<<(C + block - 1) / block, block, 0, stream>>>(stats, - stats + C, - stats + 2 * C, - C, - momentum, - epsilon, - sv_mean_data, - sv_inv_var_data, - est_mean_data, - est_var_data); - - mean_data = sv_mean_data; - var_data = stats + C; - } - - int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; - if (layout == framework::DataLayout::kNCHW) { - KeNormAffine - <<>>(x_d, - s_d, - b_d, - mean_data, - var_data, - epsilon, - C, - H * W * D, - x_numel, - y_d); - } else { - KeNormAffine - <<>>(x_d, - s_d, - b_d, - mean_data, - var_data, - epsilon, - C, - H * W * D, - x_numel, - y_d); - } 
-} - -template -__global__ void KeBackwardLocalStats(const T *dy, - const T *x, - const BatchNormParamType *means, - int N, - int M, - int C, - BatchNormParamType *sum_dy_prod) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int k = blockIdx.x; k < C; k += gridDim.x) { - BatchNormParamType sum1 = 0.; - BatchNormParamType sum2 = 0.; - auto mean = means[k]; - for (int i = threadIdx.x; i < N * M; i += blockDim.x) { - int id = layout == framework::DataLayout::kNCHW - ? (i / M) * C * M + k * M + i % M - : i * C + k; - auto g = static_cast>(dy[id]); - sum1 += g; - auto x_i = static_cast>(x[id]); - sum2 += g * (x_i - mean); - } - - __syncthreads(); - auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - sum_dy_prod[k] = out; - } - out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - sum_dy_prod[k + C] = out; - } - } - if (blockIdx.x == 0 && threadIdx.x == 0) { - sum_dy_prod[2 * C] = 1.0; - } -} - -template -static __global__ void KeBNBackwardScaleBias( - const T *dy, - const T *x, - const BatchNormParamType *mean, - const BatchNormParamType *inv_variance, - const double epsilon, - const int N, - const int C, - const int HxW, - BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = 0.; - BatchNormParamType db_sum = 0.; - - auto inv_var_i = inv_variance[i]; - auto mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int id = layout == framework::DataLayout::kNCHW - ? ((j / HxW) * C + i) * HxW + (j % HxW) - : j * outer_size + i; - auto x_i = static_cast>(x[id]); - auto dy_i = static_cast>(dy[id]); - ds_sum += dy_i * (x_i - mean_i); - db_sum += dy_i; - } - __syncthreads(); - auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum()); - __syncthreads(); - auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - dscale[i] = os * inv_var_i; - dbias[i] = ob; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNRestoreData(T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *sv_inv, - const double epsilon, - int C, - int M, - int num, - const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / sv_inv[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -static __global__ void KeBNBackwardData( - const T *dy, - const T *x, - const BatchNormParamType *gamma, - const BatchNormParamType *mean, - const BatchNormParamType *inv_variance, - const BatchNormParamType *g_sum_dy, - const BatchNormParamType *g_sum_dy_prod, - const BatchNormParamType *num_dev, - const double epsilon, - const int C, - const int HxW, - const int num, - T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - auto scale = static_cast>(C) / num; - auto dev_num = num_dev[0]; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - auto inv_var = inv_variance[c]; - auto s_d = gamma[c]; - auto gvar = - -(g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var); - auto gmean = -(g_sum_dy[c] / dev_num) * s_d * inv_var; - - auto x_i = static_cast>(x[i]); - auto dy_i = static_cast>(dy[i]); - auto dx_i = - dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]); - dx[i] = static_cast(dx_i); - } -} - -template -void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx, - const DataLayout layout, - const framework::Tensor *scale, - const framework::Tensor *bias, - framework::Tensor *d_x, - const framework::Tensor *d_y, - framework::Tensor *d_scale, - framework::Tensor *d_bias, - const framework::Tensor *mean, - const framework::Tensor *variance, - const double epsilon) { - // sync_batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - } else { - x = ctx.Input("X"); - is_inplace = false; - } - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input X dim size should be larger than 1.")); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "The Input X dim size should be less than 6.")); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - PADDLE_ENFORCE_EQ(scale->dims()[0], - C, - platform::errors::InvalidArgument( - "Expected first dim for input parameter(scale) of " - "OP(sync_batch_norm) be (%d), but given (%d).", - C, - scale->dims()[0])); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ(scale->dims().size(), - 1UL, - platform::errors::InvalidArgument( - "Expected rank for input parameter(scale) of " - "OP(sync_batch_norm) be (1), but given (%d).", - scale->dims().size())); - - std::vector dims; - std::vector strides; - if (layout == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - const T *x_d = x->data(); - auto px = *x; - const T *dy_d = d_y->data(); - - auto &dev_ctx = ctx.cuda_device_context(); - auto stream = dev_ctx.stream(); - - const auto *saved_mean = mean->data>(); - const auto *saved_inv_var = variance->data>(); - const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - auto 
alloc_ptr = memory::Alloc(dev_ctx, bytes); - auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); - - const int block = 512; - const int threads = 256; - int x_numel = x->numel(); - int fsize = H * W * D; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - int grid = std::min(C, (max_threads + threads - 1) / threads); - int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; - - if (is_inplace) { - if (layout == framework::DataLayout::kNCHW) { - KeBNRestoreData - <<>>(px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), - saved_mean, - saved_inv_var, - epsilon, - C, - H * W * D, - x_numel, - x->data()); - } else { - KeBNRestoreData - <<>>(px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), - saved_mean, - saved_inv_var, - epsilon, - C, - H * W * D, - x_numel, - x->data()); - } - } - - if (layout == framework::DataLayout::kNCHW) { - KeBackwardLocalStats - <<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); - } else { - KeBackwardLocalStats - <<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *comm = dev_ctx.nccl_comm(); - if (comm) { - int dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - // In-place operation - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(stats, - stats, - 2 * C + 1, - static_cast(dtype), - ncclSum, - comm, - stream)); - } -#endif - - if (layout == framework::DataLayout::kNCHW) { - if (d_scale && d_bias) { - KeBNBackwardScaleBias - <<>>(dy_d, - x_d, - saved_mean, - saved_inv_var, - epsilon, - N, - C, - fsize, - d_scale->data>(), - d_bias->data>()); - } - if (d_x) { - KeBNBackwardData - <<>>(dy_d, - x_d, - scale->data>(), - saved_mean, - saved_inv_var, - stats, - stats + C, - stats + 2 * C, - epsilon, - C, - fsize, - x->numel(), - d_x->data()); - } - } else { - if (d_scale && d_bias) { - KeBNBackwardScaleBias - <<>>(dy_d, - x_d, - saved_mean, - saved_inv_var, - epsilon, - N, - C, - fsize, - d_scale->data>(), - d_bias->data>()); - } - if (d_x) { - KeBNBackwardData - <<>>(dy_d, - x_d, - scale->data>(), - saved_mean, - saved_inv_var, - stats, - stats + C, - stats + 2 * C, - epsilon, - C, - fsize, - x->numel(), - d_x->data()); - } - } -} - -template -class SyncBatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -// Deriving the Gradient for the Backward Pass of Batch Normalization -// https://kevinzakka.github.io/2016/09/14/batch_normalization/ -template -class SyncBatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 453a0d9c16690..ed08fe48ee849 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -2075,6 +2075,16 @@ func : swish backward : swish_grad +# sync_batch_norm +- api : sync_batch_norm + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + infer_meta : + func : BatchNormInferMeta + kernel : + func : sync_batch_norm + backward : sync_batch_norm_grad + # take_along_axis - 
api : take_along_axis args : (Tensor x, Tensor index, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 50aa57a3845cd..91464ac769f77 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -2085,6 +2085,18 @@ func : swish_grad inplace : (out_grad -> x_grad) +- backward_api : sync_batch_norm_grad + forward : sync_batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, bias] + kernel : + func : sync_batch_norm_grad + data_type : out_grad + optional : mean_out, variance_out, reserve_space + - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int axis) diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu new file mode 100644 index 0000000000000..ba5020d08bd0f --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" + +namespace phi { + +template +void SyncBatchNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon_f, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + SyncBatchNormGradFunctor(ctx, + &x, + nullptr, + scale, + bias, + saved_mean, + saved_variance, + y_grad, + epsilon_f, + data_layout_str, + x_grad, + scale_grad, + bias_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu new file mode 100644 index 0000000000000..a1d4b681ca053 --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -0,0 +1,190 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sync_batch_norm_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" + +namespace phi { + +template +void SyncBatchNormKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { + PADDLE_ENFORCE_EQ(use_global_stats, + false, + phi::errors::InvalidArgument( + "sync_batch_norm doesn't support " + "to set use_global_stats True. 
Please use batch_norm " + "in this case.")); + + double epsilon = epsilon_f; + const bool trainable_stats = trainable_statistics; + const DataLayout layout = + paddle::framework::StringToDataLayout(data_layout_str); + bool test_mode = is_test && (!trainable_statistics); + const auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The Input dim size should be less than 6.")); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + int x_numel = x.numel(); + + const T *x_d = x.template data(); + const auto *s_d = scale.template data>(); + const auto *b_d = bias.template data>(); + + T *y_d = ctx.template Alloc(y); + + const BatchNormParamType *mean_data = nullptr; + const BatchNormParamType *var_data = nullptr; + + auto stream = ctx.stream(); + const int block = 512; + int max_threads = ctx.GetMaxPhysicalThreadCount(); + + paddle::memory::AllocationPtr alloc_ptr{nullptr}; + + if (test_mode) { + mean_data = mean.template data>(); + var_data = variance.template data>(); + } else { + // x, x^2, 1, here 1 is used to calc device num + // device num also can be got from platform::DeviceContextPool + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + alloc_ptr = paddle::memory::Alloc(ctx, bytes); + + auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); + const int threads = 256; + int grid = std::min(C, (max_threads + threads - 1) / threads); + if (layout == paddle::framework::DataLayout::kNCHW) { + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); + } else { + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *comm = ctx.nccl_comm(); + if (comm) { + int dtype = paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(mean_out->dtype())); + // In-place operation + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + stats, + stats, + 2 * C + 1, + static_cast(dtype), + ncclSum, + comm, + stream)); + } +#endif + + auto *est_mean_data = ctx.template Alloc>(mean_out); + auto *est_var_data = + ctx.template Alloc>(variance_out); + + auto *sv_mean_data = ctx.template Alloc>(saved_mean); + auto *sv_inv_var_data = + ctx.template Alloc>(saved_variance); + + // Note, Input('Mean')/Input('Variance') share variable with + // Output('MeanOut')/Output('VarianceOut') + KeSyncAndMovingStats + <<<(C + block - 1) / block, block, 0, stream>>>(stats, + stats + C, + stats + 2 * C, + C, + momentum, + epsilon, + sv_mean_data, + sv_inv_var_data, + est_mean_data, + est_var_data); + + mean_data = sv_mean_data; + var_data = stats + C; + } + + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == paddle::framework::DataLayout::kNCHW) { + KeNormAffine + <<>>(x_d, + s_d, + b_d, + mean_data, + var_data, + epsilon, + C, + H * W * D, + x_numel, + y_d); + } else { + KeNormAffine + <<>>(x_d, + s_d, + b_d, + mean_data, + var_data, + epsilon, + C, + H * W * D, + x_numel, + y_d); + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h 
b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h new file mode 100644 index 0000000000000..37b9bca73a857 --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -0,0 +1,493 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +__global__ void KeLocalStats( + const T *x, int N, int M, int C, BatchNormParamType *mean_var) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + BatchNormParamType x_sum = 0.; + BatchNormParamType x2_sum = 0.; + for (int i = threadIdx.x; i < N * M; i += BlockDim) { + int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; + auto x_in = static_cast>(x[id]); + x_sum += x_in; + x2_sum += x_in * x_in; + } + __syncthreads(); + auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k] = out / (N * M); + } + out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k + C] = out / (N * M); + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + mean_var[2 * C] = static_cast>(1.0); + } +} + +template +__global__ void KeSyncAndMovingStats(BatchNormParamType *means, + BatchNormParamType *variances, + BatchNormParamType *num_dev, + const int C, + const BatchNormParamType momentum, + const double epsilon, + BatchNormParamType *sv_mean_data, + BatchNormParamType *sv_inv_var_data, + BatchNormParamType *moving_means, + BatchNormParamType *moving_variances) { + // sync stats across multi-devices + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < C; i += stride) { + auto mean = means[i] / (*num_dev); + auto var = variances[i] / (*num_dev); + var = var - mean * mean; + + // sync stats + sv_mean_data[i] = mean; + sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); + variances[i] = var; + + // moving stats + moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); + moving_variances[i] = + moving_variances[i] * momentum + var * (1. 
- momentum); + } +} + +template +static __global__ void KeNormAffine(const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const double epsilon, + const int C, + const int M, + const int num, + T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C; + auto x_i = static_cast>(x[i]); + auto y_i = + (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + y[i] = static_cast(y_i); + } +} + +template +__global__ void KeBackwardLocalStats(const T *dy, + const T *x, + const BatchNormParamType *means, + int N, + int M, + int C, + BatchNormParamType *sum_dy_prod) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + BatchNormParamType sum1 = 0.; + BatchNormParamType sum2 = 0.; + auto mean = means[k]; + for (int i = threadIdx.x; i < N * M; i += blockDim.x) { + int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; + auto g = static_cast>(dy[id]); + sum1 += g; + auto x_i = static_cast>(x[id]); + sum2 += g * (x_i - mean); + } + + __syncthreads(); + auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k] = out; + } + out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k + C] = out; + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + sum_dy_prod[2 * C] = 1.0; + } +} + +template +static __global__ void KeBNBackwardScaleBias( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = 0.; + BatchNormParamType db_sum = 0.; + + auto inv_var_i = inv_variance[i]; + auto mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int id = layout == DataLayout::kNCHW + ? ((j / HxW) * C + i) * HxW + (j % HxW) + : j * outer_size + i; + auto x_i = static_cast>(x[id]); + auto dy_i = static_cast>(dy[id]); + ds_sum += dy_i * (x_i - mean_i); + db_sum += dy_i; + } + __syncthreads(); + auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum()); + __syncthreads(); + auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + dscale[i] = os * inv_var_i; + dbias[i] = ob; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNRestoreData(T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *sv_inv, + const double epsilon, + int C, + int M, + int num, + const T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? 
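
KeBNRestoreData exists for the in-place path, where only Y survives the forward pass; it simply inverts KeNormAffine. A single-channel NumPy round-trip check with arbitrary parameter values:

import numpy as np

mean, var, eps = 0.3, 2.0, 1e-5     # illustrative per-channel statistics
scale, bias = 1.5, -0.2
inv_std = 1.0 / np.sqrt(var + eps)  # SavedVariance actually stores this inverse std

x = np.random.randn(1000)
y = (x - mean) * inv_std * scale + bias            # KeNormAffine
x_restored = (y - bias) / scale / inv_std + mean   # KeBNRestoreData
assert np.allclose(x, x_restored)
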
(i / M) % C : i % C; + auto y_i = static_cast>(y[i]); + auto x_i = (y_i - bias[c]) / scale[c] / sv_inv[c] + mean[c]; + x[i] = static_cast(x_i); + } +} + +template +static __global__ void KeBNBackwardData( + const T *dy, + const T *x, + const BatchNormParamType *gamma, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const BatchNormParamType *g_sum_dy, + const BatchNormParamType *g_sum_dy_prod, + const BatchNormParamType *num_dev, + const double epsilon, + const int C, + const int HxW, + const int num, + T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + auto scale = static_cast>(C) / num; + auto dev_num = num_dev[0]; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? i / HxW % C : i % C; + auto inv_var = inv_variance[c]; + auto s_d = gamma[c]; + auto gvar = + -(g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var); + auto gmean = -(g_sum_dy[c] / dev_num) * s_d * inv_var; + + auto x_i = static_cast>(x[i]); + auto dy_i = static_cast>(dy[i]); + auto dx_i = + dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]); + dx[i] = static_cast(dx_i); + } +} + +template +void SyncBatchNormGradFunctor( + const Context &ctx, + const DenseTensor *input_x, + const DenseTensor *input_y, + const DenseTensor &scale, + const DenseTensor &bias, + // const paddle::optional& mean, + // const paddle::optional& variance, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + // const paddle::optional& reserve_space, + const DenseTensor &y_grad, + // float momentum, + float epsilon_f, + const std::string &data_layout_str, + // bool is_test, + // bool use_global_stats, + // bool trainable_statistics, + // bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + double epsilon = static_cast(epsilon_f); + + const DataLayout layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto *d_y = &y_grad; + + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + const DenseTensor *x; + bool is_inplace = false; + if (input_y) { + is_inplace = true; + x = input_y; + } else { + x = input_x; + } + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input X dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The Input X dim size should be less than 6.")); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + PADDLE_ENFORCE_EQ(scale.dims()[0], + C, + phi::errors::InvalidArgument( + "Expected first dim for input parameter(scale) of " + "OP(sync_batch_norm) be (%d), but given (%d).", + C, + scale.dims()[0])); + + ctx.template Alloc(d_x); + if (d_scale && d_bias) { + ctx.template Alloc>(d_scale); + ctx.template Alloc>(d_bias); + } + PADDLE_ENFORCE_EQ(scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "Expected rank for input parameter(scale) of " + "OP(sync_batch_norm) be (1), but given (%d).", + scale.dims().size())); + + std::vector dims; + std::vector strides; + if (layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + const T *x_d = x->data(); + auto px = *x; + const T *dy_d = d_y->data(); + + auto stream = ctx.stream(); + + const auto *saved_mean_ptr = + 
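
A NumPy sketch of the gradients these backward kernels produce, with the cross-device sums collapsed into a single flattened array for one channel; the gmean/gvar decomposition used by KeBNBackwardData reduces to the familiar batch-norm input gradient:

import numpy as np

np.random.seed(0)
x = np.random.randn(4096)    # one channel, all devices flattened together
dy = np.random.randn(4096)
gamma, eps = 1.3, 1e-5

mu = x.mean()
inv_std = 1.0 / np.sqrt(x.var() + eps)

# KeBNBackwardScaleBias
dscale = np.sum(dy * (x - mu)) * inv_std
dbias = np.sum(dy)

# KeBNBackwardData: per-element term plus the gmean / gvar corrections
gmean = -np.sum(dy) * gamma * inv_std
gvar = -np.sum(dy * (x - mu)) * gamma * inv_std ** 3
dx = dy * gamma * inv_std + (gmean + gvar * (x - mu)) / x.size

# Equivalent textbook form of the input gradient
dx_ref = gamma * inv_std * (dy - dy.mean() - (x - mu) * inv_std ** 2 * np.mean(dy * (x - mu)))
assert np.allclose(dx, dx_ref)
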
saved_mean.template data>(); + const auto *saved_inv_var = + saved_variance.template data>(); + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + auto alloc_ptr = paddle::memory::Alloc(ctx, bytes); + auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); + + const int block = 512; + const int threads = 256; + int x_numel = x->numel(); + int fsize = H * W * D; + int max_threads = ctx.GetMaxPhysicalThreadCount(); + int grid = std::min(C, (max_threads + threads - 1) / threads); + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + + if (is_inplace) { + if (layout == DataLayout::kNCHW) { + KeBNRestoreData<<>>( + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + saved_mean_ptr, + saved_inv_var, + epsilon, + C, + H * W * D, + x_numel, + x->data()); + } else { + KeBNRestoreData<<>>( + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + saved_mean_ptr, + saved_inv_var, + epsilon, + C, + H * W * D, + x_numel, + x->data()); + } + } + + if (layout == DataLayout::kNCHW) { + KeBackwardLocalStats + <<>>( + dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + } else { + KeBackwardLocalStats + <<>>( + dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *comm = ctx.nccl_comm(); + if (comm) { + int dtype = paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(scale.dtype())); + // In-place operation + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + stats, + stats, + 2 * C + 1, + static_cast(dtype), + ncclSum, + comm, + stream)); + } +#endif + + if (layout == DataLayout::kNCHW) { + if (d_scale && d_bias) { + KeBNBackwardScaleBias + <<>>(dy_d, + x_d, + saved_mean_ptr, + saved_inv_var, + epsilon, + N, + C, + fsize, + d_scale->data>(), + d_bias->data>()); + } + if (d_x) { + KeBNBackwardData<<>>( + dy_d, + x_d, + scale.template data>(), + saved_mean_ptr, + saved_inv_var, + stats, + stats + C, + stats + 2 * C, + epsilon, + C, + fsize, + x->numel(), + d_x->data()); + } + } else { + if (d_scale && d_bias) { + KeBNBackwardScaleBias + <<>>(dy_d, + x_d, + saved_mean_ptr, + saved_inv_var, + epsilon, + N, + C, + fsize, + d_scale->data>(), + d_bias->data>()); + } + if (d_x) { + KeBNBackwardData<<>>( + dy_d, + x_d, + scale.template data>(), + saved_mean_ptr, + saved_inv_var, + stats, + stats + C, + stats + 2 * C, + epsilon, + C, + fsize, + x->numel(), + d_x->data()); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h new file mode 100644 index 0000000000000..395bec23f1091 --- /dev/null +++ b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SyncBatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/sync_batch_norm_kernel.h b/paddle/phi/kernels/sync_batch_norm_kernel.h new file mode 100644 index 0000000000000..5071eaabf8653 --- /dev/null +++ b/paddle/phi/kernels/sync_batch_norm_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SyncBatchNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); +} // namespace phi diff --git a/paddle/phi/ops/compat/sync_batch_norm_sig.cc b/paddle/phi/ops/compat/sync_batch_norm_sig.cc new file mode 100644 index 0000000000000..2595f241ff233 --- /dev/null +++ b/paddle/phi/ops/compat/sync_batch_norm_sig.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SyncBatchNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sync_batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature SyncBatchNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sync_batch_norm_grad", + { + "X", + "Scale", + "Bias", + "Mean", + "Variance", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Y@GRAD", + }, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm, + phi::SyncBatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm_grad, + phi::SyncBatchNormGradOpArgumentMapping); diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index e549859fe626d..b9081d0c8e682 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -49,6 +49,7 @@ from paddle import _C_ops from .. import Layer from paddle import in_dynamic_mode +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -1100,7 +1101,14 @@ def forward(self, x): ### train mode: use mini-batch stats, eval mode: use global stats ### use_global_stats only support False in sync_batch_norm - if in_dynamic_mode(): + if in_dygraph_mode(): + sync_batch_norm_out, _, _, _, _, _ = _C_ops.final_state_sync_batch_norm( + x, self.weight, self.bias, self._mean, self._variance, + self._momentum, self._epsilon, self._data_format, + not self.training, False, False, False) + return sync_batch_norm_out + + elif in_dynamic_mode(): attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_format, "use_mkldnn", False, "fuse_with_relu", @@ -1109,7 +1117,6 @@ def forward(self, x): sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm( x, self.weight, self.bias, self._mean, self._variance, mean_out, variance_out, *attrs) - return sync_batch_norm_out check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], From 99bf7007d0d04a80382f1b2d47ae41f7644340ea Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 20 Jul 2022 16:46:11 +0800 Subject: [PATCH 03/12] [GPUPS]Fix psgpuwrapper initialization (#44468) * Update ps_gpu_wrapper.h * Update ps_gpu_wrapper.h * Update ps_gpu_wrapper.cc --- paddle/fluid/framework/fleet/ps_gpu_wrapper.cc | 1 + paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index d9bb6e946f42d..622793653dcab 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -95,6 +95,7 @@ int AfsWrapper::mv(const std::string& old_path, const std::string& dest_path) { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; +std::mutex PSGPUWrapper::ins_mutex; #ifdef PADDLE_WITH_PSLIB void PSGPUWrapper::InitAfsApi(const std::string& fs_name, const std::string& fs_user, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h 
b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0d1669a42b1e9..cce120bcef747 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -429,8 +430,11 @@ class PSGPUWrapper { // PSGPUWrapper singleton static std::shared_ptr GetInstance() { - if (NULL == s_instance_) { - s_instance_.reset(new paddle::framework::PSGPUWrapper()); + { + std::lock_guard lk(ins_mutex); + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::PSGPUWrapper()); + } } return s_instance_; } @@ -537,6 +541,7 @@ class PSGPUWrapper { private: static std::shared_ptr s_instance_; + static std::mutex ins_mutex; Dataset* dataset_; #ifdef PADDLE_WITH_PSLIB paddle::ps::AfsApiWrapper afs_handler_; From 889bdde3a6e7515cb07a4b00531fccc0ee31bc2a Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 20 Jul 2022 16:52:18 +0800 Subject: [PATCH 04/12] [Phi] migrate exponential kernel to phi (#44376) * [Phi] migrate exponential kernel to phi * fix comment * fix CI --- paddle/fluid/operators/exponential_op.cc | 86 +++---------------- paddle/fluid/operators/exponential_op.cu | 48 ----------- paddle/fluid/operators/exponential_op.h | 42 --------- .../yaml/generator/wrapped_infermeta_gen.py | 3 +- paddle/phi/api/yaml/legacy_api.yaml | 11 +++ paddle/phi/api/yaml/legacy_backward.yaml | 9 ++ paddle/phi/kernels/cpu/exponential_kernel.cc | 45 ++++++++++ paddle/phi/kernels/exponential_kernel.h | 27 ++++++ paddle/phi/kernels/gpu/exponential_kernel.cu | 36 ++++++++ paddle/phi/ops/compat/exponential_sig.cc | 26 ++++++ .../tests/unittests/test_exponential_op.py | 12 ++- python/paddle/tensor/random.py | 4 +- 12 files changed, 181 insertions(+), 168 deletions(-) delete mode 100644 paddle/fluid/operators/exponential_op.cu delete mode 100644 paddle/fluid/operators/exponential_op.h create mode 100644 paddle/phi/kernels/cpu/exponential_kernel.cc create mode 100644 paddle/phi/kernels/exponential_kernel.h create mode 100644 paddle/phi/kernels/gpu/exponential_kernel.cu create mode 100644 paddle/phi/ops/compat/exponential_sig.cc diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index 5a75063fba7c1..26e06e50a7784 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/exponential_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,13 +23,6 @@ class ExponentialOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExponentialOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExponentialOp"); - auto dim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -51,52 +46,6 @@ exponential distribution. 
} }; -class ExponentialOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map &GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Out"}}; - return m; - } -}; - -template -class ExponentialKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out = ctx.Output("Out"); - T *out_data = out->mutable_data(ctx.GetPlace()); - - T lambda = static_cast(ctx.Attr("lambda")); - int64_t size = out->numel(); - - auto gen = framework::DefaultCPUGenerator(); - auto engine = gen->GetCPUEngine(); - - std::uniform_real_distribution uniform(0.0, 1.0); - phi::funcs::exponential_transform trans(lambda); - for (int64_t i = 0; i < size; ++i) { - out_data[i] = trans(uniform(*engine)); - } - } -}; - -class ExponentialGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out_Grad", - "ExponentialGradOp"); - - auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); - ctx->SetOutputDim(framework::GradVarName("X"), dout_dim); - } -}; - template class ExponentialGradOpMaker : public framework::SingleGradOpMaker { public: @@ -104,10 +53,10 @@ class ExponentialGradOpMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("exponential_grad"); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - retv->SetAttrMap(this->Attrs()); + retv->SetType("fill_any_like"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttr("value", 0.0f); + retv->SetOutput("Out", this->InputGrad("X")); } }; @@ -118,24 +67,15 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; DECLARE_INPLACE_OP_INFERER(ExponentialInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(ExponentialGradInferer, - {paddle::framework::GradVarName("Out"), - paddle::framework::GradVarName("X")}); + +DECLARE_INFER_SHAPE_FUNCTOR(exponential, + ExponentialInfershapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(exponential, ops::ExponentialOp, ops::ExponentialOpMaker, - ops::ExponentialOpInferVarType, ops::ExponentialGradOpMaker, ops::ExponentialGradOpMaker, - ExponentialInferer); -REGISTER_OPERATOR(exponential_grad, - ops::ExponentialGradOp, - ExponentialGradInferer); - -REGISTER_OP_CPU_KERNEL(exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CPU_KERNEL(exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); + ExponentialInferer, + ExponentialInfershapeFunctor); diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu deleted file mode 100644 index 58d6fa674baf6..0000000000000 --- a/paddle/fluid/operators/exponential_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/exponential_op.h" - -namespace paddle { -namespace operators { - -template -class ExponentialKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* out = ctx.Output("Out"); - auto& dev_cxt = ctx.template device_context(); - T lambda = static_cast(ctx.Attr("lambda")); - - phi::funcs::uniform_distribution dist; - phi::funcs::exponential_transform trans(lambda); - phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CUDA_KERNEL( - exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h deleted file mode 100644 index 7ded174a9f47e..0000000000000 --- a/paddle/fluid/operators/exponential_op.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/distribution_helper.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class ExponentialKernel; - -template -class ExponentialGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant functor; - auto& dev_ctx = ctx.template device_context(); - functor(dev_ctx, dx, static_cast(0)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index 99da6ce3d955f..dfa6a7f93cbcb 100644 --- a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -46,7 +46,8 @@ def gene_wrapped_infermeta_and_register(api): 'const paddle::optional&': 'const MetaTensor&' } - wrapped_infermeta_name = get_wrapped_infermeta_name(api.api) + wrapped_infermeta_name = get_wrapped_infermeta_name( + api.kernel['func'][0]) args = [] for input_name in api.inputs['names']: if input_name in kernel_params: diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index ed08fe48ee849..f60309985a6f4 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -689,6 +689,17 @@ func : expm1 backward : expm1_grad +- api : exponential_ + args : (Tensor x, float lambda) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : exponential + inplace : (x -> out) + backward : exponential__grad + - api : eye args : (int64_t num_rows, int64_t num_columns, DataType dtype=DataType::FLOAT32, Place place={}) output : Tensor(out) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 91464ac769f77..6df4883145620 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -720,6 +720,15 @@ func : expm1_grad inplace : (out_grad -> x_grad) +- backward_api : exponential__grad + forward : exponential_ (Tensor x, float lambda) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + invoke : zeros_like(out_grad, DataType::UNDEFINED, {}) + inplace : (out_grad -> x_grad) + - backward_api : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) diff --git a/paddle/phi/kernels/cpu/exponential_kernel.cc b/paddle/phi/kernels/cpu/exponential_kernel.cc new file mode 100644 index 0000000000000..a4a07fc7a65e8 --- /dev/null +++ b/paddle/phi/kernels/cpu/exponential_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/exponential_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/generator.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" + +namespace phi { + +template +void ExponentialKernel(const Context& dev_ctx, + const DenseTensor& x, + float lambda, + DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + auto engine = dev_ctx.GetGenerator()->GetCPUEngine(); + + std::uniform_real_distribution uniform(0.0, 1.0); + phi::funcs::exponential_transform trans(lambda); + + for (int64_t i = 0; i < out->numel(); ++i) { + out_data[i] = trans(uniform(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + exponential, CPU, ALL_LAYOUT, phi::ExponentialKernel, float, double) {} diff --git a/paddle/phi/kernels/exponential_kernel.h b/paddle/phi/kernels/exponential_kernel.h new file mode 100644 index 0000000000000..736baacca4cc9 --- /dev/null +++ b/paddle/phi/kernels/exponential_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ExponentialKernel(const Context &dev_ctx, + const DenseTensor &x, + float lambda, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu new file mode 100644 index 0000000000000..fc1730dde64a7 --- /dev/null +++ b/paddle/phi/kernels/gpu/exponential_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/exponential_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" + +namespace phi { + +template +void ExponentialKernel(const Context &dev_ctx, + const DenseTensor &x, + float lambda, + DenseTensor *out) { + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_ctx, out, dist, trans); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + exponential, GPU, ALL_LAYOUT, phi::ExponentialKernel, float, double) {} diff --git a/paddle/phi/ops/compat/exponential_sig.cc b/paddle/phi/ops/compat/exponential_sig.cc new file mode 100644 index 0000000000000..2d70a4200ab3c --- /dev/null +++ b/paddle/phi/ops/compat/exponential_sig.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ExponentialOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("exponential", {"X"}, {"lambda"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(exponential, phi::ExponentialOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 57c4fb02d858a..72b4d8990446d 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -18,13 +18,13 @@ from op_test import OpTest import os -paddle.enable_static() paddle.seed(100) class TestExponentialOp1(OpTest): def setUp(self): + paddle.enable_static() self.op_type = "exponential" self.config() @@ -87,8 +87,14 @@ def test_static(self): def test_dygraph(self): paddle.disable_static() x = paddle.full([10, 10], -1., dtype='float32') - x.exponential_(0.5) - self.assertTrue(np.min(x.numpy()) >= 0) + x.stop_gradient = False + y = 2 * x + y.exponential_(0.5) + print(y) + self.assertTrue(np.min(y.numpy()) >= 0) + + y.backward() + self.assertTrue(np.array_equal(x.grad.numpy(), np.zeros([10, 10]))) paddle.enable_static() def test_fixed_random_number(self): diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 990b20a26772c..e25366df753ed 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1052,7 +1052,9 @@ def exponential_(x, lam=1.0, name=None): # [0.72520673, 0.45208144, 0.30234432]] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_exponential_(x, lam) + elif paddle.in_dynamic_mode(): return _C_ops.exponential_(x, "lambda", lam) check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") From 41f11d29526b2a3827a1a5224bc00ebe540e34d4 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 20 Jul 2022 17:15:04 +0800 Subject: [PATCH 05/12] [PHI] move diag_embed op to phi. (#44408) * move diag_embed to phi. 
--- paddle/fluid/operators/diag_embed_op.cc | 93 ++----------- paddle/fluid/operators/diag_embed_op.cu | 30 ---- paddle/fluid/operators/diag_embed_op.h | 130 ------------------ paddle/phi/api/yaml/legacy_api.yaml | 8 ++ paddle/phi/infermeta/unary.cc | 63 +++++++++ paddle/phi/infermeta/unary.h | 3 + paddle/phi/kernels/cpu/diag_embed_kernel.cc | 28 ++++ paddle/phi/kernels/diag_embed_kernel.h | 29 ++++ paddle/phi/kernels/gpu/diag_embed_kernel.cu | 28 ++++ paddle/phi/kernels/impl/diag_embed_impl.h | 129 +++++++++++++++++ .../fluid/tests/unittests/test_diag_embed.py | 3 +- python/paddle/nn/functional/extension.py | 15 +- 12 files changed, 310 insertions(+), 249 deletions(-) delete mode 100644 paddle/fluid/operators/diag_embed_op.cu delete mode 100644 paddle/fluid/operators/diag_embed_op.h create mode 100644 paddle/phi/kernels/cpu/diag_embed_kernel.cc create mode 100644 paddle/phi/kernels/diag_embed_kernel.h create mode 100644 paddle/phi/kernels/gpu/diag_embed_kernel.cu create mode 100644 paddle/phi/kernels/impl/diag_embed_impl.h diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 531d6f92d8830..0dc5d024ec4a8 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/diag_embed_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,81 +23,6 @@ namespace operators { class DiagEmbedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Input"), - true, - platform::errors::NotFound("Input of DiagEmbedOp is not found.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::NotFound("Output of DiagEmbedOp is not found.")); - - int offset = ctx->Attrs().Get("offset"); - int dim1 = ctx->Attrs().Get("dim1"); - int dim2 = ctx->Attrs().Get("dim2"); - - auto x_dims = ctx->GetInputDim("Input"); - - PADDLE_ENFORCE_GE( - dim1, - -(x_dims.size() + 1), - platform::errors::OutOfRange( - "Dim1 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim1)); - PADDLE_ENFORCE_LE( - dim1, - x_dims.size(), - platform::errors::OutOfRange( - "Dim1 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim1)); - - PADDLE_ENFORCE_GE( - dim2, - -(x_dims.size() + 1), - platform::errors::OutOfRange( - "Dim2 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim2)); - PADDLE_ENFORCE_LE( - dim2, - x_dims.size(), - platform::errors::OutOfRange( - "Dim2 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim2)); - - int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; - int dim2_ = dim2 < 0 ? 
x_dims.size() + dim2 + 1 : dim2; - int offset_ = std::abs(offset); - - PADDLE_ENFORCE_NE(dim1_, - dim2_, - platform::errors::InvalidArgument( - "diagonal dimensions should not be identical " - "%ld vs %ld.", - dim1, - dim2)); - - int new_dim_len = offset_ + x_dims[x_dims.size() - 1]; - auto sizes = vectorize(x_dims); - sizes.pop_back(); - sizes.insert(sizes.begin() + std::min(dim1_, dim2_), new_dim_len); - sizes.insert(sizes.begin() + std::max(dim1_, dim2_), new_dim_len); - ctx->SetOutputDim("Out", phi::make_ddim(sizes)); - } }; class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,15 +59,14 @@ class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -namespace platform = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(diag_embed, + DiagEmbedInferShapeFunctor, + PD_INFER_META(phi::DiagEmbedInferMeta)); + REGISTER_OPERATOR( diag_embed, ops::DiagEmbedOp, ops::DiagEmbedOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); + paddle::framework::EmptyGradOpMaker, + DiagEmbedInferShapeFunctor); diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu deleted file mode 100644 index e0f8c16731ff7..0000000000000 --- a/paddle/fluid/operators/diag_embed_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/diag_embed_op.h" - -namespace ops = paddle::operators; -namespace platform = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h deleted file mode 100644 index 94c479bb452b9..0000000000000 --- a/paddle/fluid/operators/diag_embed_op.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct DiagEmbedFunctor { - DiagEmbedFunctor(const T* input, - int64_t numel, - const int64_t* dim, - int64_t offset, - int64_t dims_size, - T* output, - const int64_t* strides) - : input_(input), - numel_(numel), - dim_(dim), - offset_(offset), - dims_size_(dims_size), - output_(output), - strides_(strides) {} - - HOSTDEVICE void operator()(size_t idx) const { - int64_t position = 0; - auto numel = numel_; - int64_t num = idx; - for (int64_t i = 0; i < dims_size_; i++) { - numel = numel / dim_[i]; - position += num / numel * strides_[i]; - num = num % numel; - } - output_[position + offset_] = input_[idx]; - } - - const T* input_; - int64_t numel_; - const int64_t* dim_; - int64_t offset_; - int64_t dims_size_; - T* output_; - const int64_t* strides_; -}; - -template -class DiagEmbedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto* out = context.Output("Out"); - - const int64_t offset = context.Attr("offset"); - const int64_t dim1 = context.Attr("dim1"); - const int64_t dim2 = context.Attr("dim2"); - auto* input_data = input->data(); - - T* out_data = out->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out, static_cast(0.0)); - - auto out_dims = out->dims(); - int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; - auto stride = phi::stride(out_dims); - int64_t diag_size; - int64_t storage_offset = 0; - if (offset >= 0) { - int64_t dim = out_dims[dim2_] - offset; - diag_size = std::max(std::min(out_dims[dim1_], dim), 0); - } else { - int64_t dim = out_dims[dim1_] + offset; - diag_size = std::max(std::min(dim, out_dims[dim2_]), 0); - } - if (diag_size == 0) { - // skip - } else if (offset >= 0) { - storage_offset += offset * stride[dim2_]; - } else { - storage_offset -= offset * stride[dim1_]; - } - auto strides = vectorize(stride); - strides.erase(strides.begin() + std::max(dim1_, dim2_)); - strides.erase(strides.begin() + std::min(dim1_, dim2_)); - strides.push_back(stride[dim1_] + stride[dim2_]); - const auto dims = vectorize(input->dims()); - -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector dims_vec(dims); - const int64_t* dims_arr = thrust::raw_pointer_cast(dims_vec.data()); - thrust::device_vector strides_vec(strides); - const int64_t* strides_arr = thrust::raw_pointer_cast(strides_vec.data()); -#else - const int64_t* dims_arr = dims.data(); - const int64_t* strides_arr = strides.data(); -#endif - - platform::ForRange for_range(dev_ctx, input->numel()); - DiagEmbedFunctor functor(input_data, - input->numel(), - dims_arr, - storage_offset, - dims.size(), - out_data, - strides_arr); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index f60309985a6f4..40fbdc9a9170d 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -524,6 +524,14 @@ func : determinant backward : det_grad +- api : diag_embed + args : (Tensor x, int offset, int dim1, int dim2) + output : Tensor + 
infer_meta : + func : DiagEmbedInferMeta + kernel : + func : diag_embed + - api : divide args : (Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c39fb96430f45..7b1c6dfe65a04 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -288,6 +288,69 @@ void CumInferMeta(const MetaTensor& x, out->share_lod(x); } +void DiagEmbedInferMeta( + const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) { + auto x_dims = x.dims(); + + PADDLE_ENFORCE_GE( + dim1, + -(x_dims.size() + 1), + phi::errors::OutOfRange( + "Dim1 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim1)); + PADDLE_ENFORCE_LE( + dim1, + x_dims.size(), + phi::errors::OutOfRange( + "Dim1 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim1)); + + PADDLE_ENFORCE_GE( + dim2, + -(x_dims.size() + 1), + phi::errors::OutOfRange( + "Dim2 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim2)); + PADDLE_ENFORCE_LE( + dim2, + x_dims.size(), + phi::errors::OutOfRange( + "Dim2 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim2)); + + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; + int dim2_ = dim2 < 0 ? x_dims.size() + dim2 + 1 : dim2; + int offset_ = std::abs(offset); + + PADDLE_ENFORCE_NE(dim1_, + dim2_, + phi::errors::InvalidArgument( + "diagonal dimensions should not be identical " + "%ld vs %ld.", + dim1, + dim2)); + + int new_dim_len = offset_ + x_dims[x_dims.size() - 1]; + auto sizes = vectorize(x_dims); + sizes.pop_back(); + sizes.insert(sizes.begin() + std::min(dim1_, dim2_), new_dim_len); + sizes.insert(sizes.begin() + std::max(dim1_, dim2_), new_dim_len); + out->set_dims(phi::make_ddim(sizes)); + out->set_dtype(x.dtype()); +} + void DiagInferMeta(const MetaTensor& x, int offset, float padding_value, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 691fc8ff41ca6..e825ba98f44e3 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -71,6 +71,9 @@ void CumInferMeta(const MetaTensor& x, bool reverse, MetaTensor* out); +void DiagEmbedInferMeta( + const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out); + void DiagInferMeta(const MetaTensor& x, int offset, float padding_value, diff --git a/paddle/phi/kernels/cpu/diag_embed_kernel.cc b/paddle/phi/kernels/cpu/diag_embed_kernel.cc new file mode 100644 index 0000000000000..714b53c6919aa --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_embed_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/diag_embed_impl.h" + +PD_REGISTER_KERNEL(diag_embed, + CPU, + ALL_LAYOUT, + phi::DiagEmbedKernel, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/diag_embed_kernel.h b/paddle/phi/kernels/diag_embed_kernel.h new file mode 100644 index 0000000000000..e47eab82474fb --- /dev/null +++ b/paddle/phi/kernels/diag_embed_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagEmbedKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + int dim1, + int dim2, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/diag_embed_kernel.cu b/paddle/phi/kernels/gpu/diag_embed_kernel.cu new file mode 100644 index 0000000000000..ece0f012e620e --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_embed_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/diag_embed_impl.h" + +PD_REGISTER_KERNEL(diag_embed, + GPU, + ALL_LAYOUT, + phi::DiagEmbedKernel, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/impl/diag_embed_impl.h b/paddle/phi/kernels/impl/diag_embed_impl.h new file mode 100644 index 0000000000000..a4430fde92343 --- /dev/null +++ b/paddle/phi/kernels/impl/diag_embed_impl.h @@ -0,0 +1,129 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#include +#endif + +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct DiagEmbedFunctor { + DiagEmbedFunctor(const T* input, + int64_t numel, + const int64_t* dim, + int64_t offset, + int64_t dims_size, + T* output, + const int64_t* strides) + : input_(input), + numel_(numel), + dim_(dim), + offset_(offset), + dims_size_(dims_size), + output_(output), + strides_(strides) {} + + HOSTDEVICE void operator()(size_t idx) const { + int64_t position = 0; + auto numel = numel_; + int64_t num = idx; + for (int64_t i = 0; i < dims_size_; i++) { + numel = numel / dim_[i]; + position += num / numel * strides_[i]; + num = num % numel; + } + output_[position + offset_] = input_[idx]; + } + + const T* input_; + int64_t numel_; + const int64_t* dim_; + int64_t offset_; + int64_t dims_size_; + T* output_; + const int64_t* strides_; +}; + +template +void DiagEmbedKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + int dim1, + int dim2, + DenseTensor* out) { + auto* input_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + phi::funcs::SetConstant set_zero; + + set_zero(dev_ctx, out, static_cast(0.0)); + + auto out_dims = out->dims(); + int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; + int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; + auto stride = phi::stride(out_dims); + int64_t diag_size; + int64_t storage_offset = 0; + if (offset >= 0) { + int64_t dim = out_dims[dim2_] - offset; + diag_size = std::max(std::min(out_dims[dim1_], dim), 0); + } else { + int64_t dim = out_dims[dim1_] + offset; + diag_size = std::max(std::min(dim, out_dims[dim2_]), 0); + } + if (diag_size == 0) { + // skip + } else if (offset >= 0) { + storage_offset += offset * stride[dim2_]; + } else { + storage_offset -= offset * stride[dim1_]; + } + auto strides = vectorize(stride); + strides.erase(strides.begin() + std::max(dim1_, dim2_)); + strides.erase(strides.begin() + std::min(dim1_, dim2_)); + strides.push_back(stride[dim1_] + stride[dim2_]); + const auto dims = vectorize(x.dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector dims_vec(dims); + const int64_t* dims_arr = thrust::raw_pointer_cast(dims_vec.data()); + thrust::device_vector strides_vec(strides); + const int64_t* strides_arr = thrust::raw_pointer_cast(strides_vec.data()); +#else + const int64_t* dims_arr = dims.data(); + const int64_t* strides_arr = strides.data(); +#endif + + phi::funcs::ForRange for_range(dev_ctx, x.numel()); + DiagEmbedFunctor functor(input_data, + x.numel(), + dims_arr, + storage_offset, + dims.size(), + out_data, + strides_arr); + for_range(functor); +} + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_diag_embed.py b/python/paddle/fluid/tests/unittests/test_diag_embed.py index c7f933d23ea21..546247167b8d0 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_embed.py +++ b/python/paddle/fluid/tests/unittests/test_diag_embed.py @@ -27,11 +27,12 @@ class TestDiagEmbedOp(OpTest): def setUp(self): self.op_type = "diag_embed" + self.python_api = F.diag_embed self.init_config() self.outputs = {'Out': self.target} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def init_config(self): self.case = np.random.randn(2, 3).astype('float32') diff --git a/python/paddle/nn/functional/extension.py 
b/python/paddle/nn/functional/extension.py index 27bc2ef70bcee..1bfa7f148838a 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -98,12 +98,18 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): # [[ 0. , 0. , 0. , 0. ], # [ 0. , 0. , 0. , 0. ]]] """ - inputs = {'Input': [input]} - attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} - if not isinstance(input, Variable): input = assign(input) + if in_dygraph_mode(): + return _C_ops.final_state_diag_embed(input, offset, dim1, dim2) + elif in_dynamic_mode(): + return _C_ops.diag_embed(input, "offset", offset, "dim1", dim1, "dim2", + dim2) + + inputs = {'Input': [input]} + attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} + def __check_input(input, offset, dim1, dim2): check_dtype(input.dtype, 'Input', ['int32', 'int64', 'float16', 'float32', 'float64'], @@ -129,8 +135,7 @@ def __check_input(input, offset, dim1, dim2): "dim1 and dim2 cannot be the same dimension." \ "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2) - if not in_dynamic_mode(): - __check_input(input, offset, dim1, dim2) + __check_input(input, offset, dim1, dim2) helper = LayerHelper("diag_embed", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) From 1e1a4b9bf399808c07ce5678c4b1234c873b9dca Mon Sep 17 00:00:00 2001 From: fuyou765 <64373205+fuyou765@users.noreply.github.com> Date: Wed, 20 Jul 2022 18:49:07 +0800 Subject: [PATCH 06/12] [MLU] set_value performance optimizing (#44390) --- paddle/fluid/operators/set_value_op_mlu.cc | 97 +++++++++++-------- .../unittests/mlu/test_set_value_op_mlu.py | 14 +++ 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/set_value_op_mlu.cc b/paddle/fluid/operators/set_value_op_mlu.cc index 44422994f60da..9a6277dfa2312 100644 --- a/paddle/fluid/operators/set_value_op_mlu.cc +++ b/paddle/fluid/operators/set_value_op_mlu.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/set_value_op.h" @@ -62,7 +63,6 @@ class SetValueMLUKernel : public framework::OpKernel { auto slice_dims_for_assign = decrease_slice_dims; if (!none_axes.empty()) { std::vector slice_dims_with_none; - size_t none_axes_cur = 0, decrease_axes_cur = 0; for (int i = 0; i < slice_dims.size(); ++i) { while (none_axes_cur < none_axes.size() && @@ -84,51 +84,22 @@ class SetValueMLUKernel : public framework::OpKernel { slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); } - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); + int in_size = in_dims.size(); + int starts_indices[in_size] = {0}; + int ends_indices[in_size] = {0}; + int strides_indices[in_size] = {0}; for (int i = 0; i < in_dims.size(); ++i) { starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; + ends_indices[i] = static_cast(slice_dims[i]); strides_indices[i] = 1; } for (size_t i = 0; i < axes.size(); i++) { int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } + starts_indices[axis_index] = static_cast(starts[i]); + ends_indices[axis_index] = static_cast(ends[i]); + strides_indices[axis_index] = static_cast(steps[i]); } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error index indices and value update not match ")); - Tensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); @@ -160,29 +131,71 @@ class SetValueMLUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t value_numel = phi::product(value_temp.dims()); - Tensor in_temp, out_temp, val_temp; + Tensor in_temp, out_temp, val_temp, index_out; + int64_t stride_step = phi::product(in_dims); + std::vector index_indices(stride_step); + std::iota(index_indices.begin(), index_indices.end(), 0); framework::Tensor index_temp; in_temp.ShareDataWith(*in); val_temp.ShareDataWith(value_temp); paddle::framework::TensorFromVector( index_indices, ctx.device_context(), &index_temp); + index_temp.Resize(in_dims); + auto index_dims = in_dims; + for (int i = 0; i < in_dims.size(); ++i) { + if (starts_indices[i] < 0 || ends_indices[i] < 0) { + starts_indices[i] -= in_dims[i]; + ends_indices[i] -= in_dims[i]; + } + if (strides_indices[i] > 0) + index_dims[i] = + static_cast((ends_indices[i] - starts_indices[i] - 1) / + strides_indices[i]) + + 1; + else + index_dims[i] = + static_cast((ends_indices[i] - starts_indices[i] + 1) / + strides_indices[i]) + + 1; + } auto 
new_in_dims = phi::make_ddim({input_numel}); auto new_val_dims = phi::make_ddim({value_numel}); in_temp.Resize(new_in_dims); val_temp.Resize(new_val_dims); + index_out.Resize(index_dims); + index_out.mutable_data(ctx.GetPlace()); cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE; MLUCnnlTensorDesc x_desc(in_temp); MLUCnnlTensorDesc indices_desc(index_temp); + MLUCnnlTensorDesc indices_out_desc(index_out); MLUCnnlTensorDesc updates_desc(val_temp); MLUCnnlTensorDesc out_desc(*out); - + MLUCnnl::StridedSlice(ctx, + starts_indices, + ends_indices, + strides_indices, + indices_desc.get(), + GetBasePtr(&index_temp), + indices_out_desc.get(), + GetBasePtr(&index_out)); + PADDLE_ENFORCE_EQ( + static_cast(phi::product(index_out.dims())), + phi::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); + Tensor index_final; + index_final.ShareDataWith(index_out); + int64_t indices_numel = phi::product(index_dims); + auto new_index_dims = phi::make_ddim({indices_numel}); + index_final.Resize(new_index_dims); + MLUCnnlTensorDesc indices_final_desc(index_final); MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(&in_temp), updates_desc.get(), GetBasePtr(&val_temp), - indices_desc.get(), - GetBasePtr(&index_temp), + indices_final_desc.get(), + GetBasePtr(&index_final), mode); in_temp.Resize(in_dims); paddle::framework::TensorCopy(in_temp, ctx.GetPlace(), out); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py index f6183687f6a47..1842f9a2f632c 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py @@ -127,6 +127,18 @@ def _get_answer(self): self.data[0:, 1:2, :] = self.value +class TestSetValueItemSlice5(TestSetValueApi): + + def set_shape(self): + self.shape = [100, 426, 640] + + def _call_setitem(self, x): + x[0:-1] = self.value + + def _get_answer(self): + self.data[0:-1] = self.value + + #TODO: Fix this after MLU support while_loop #class TestSetValueItemSliceInWhile(TestSetValueApi): # def _call_setitem(self, x): @@ -517,6 +529,7 @@ def set_dtype(self): create_test_value_int32(TestSetValueItemSlice2) create_test_value_int32(TestSetValueItemSlice3) create_test_value_int32(TestSetValueItemSlice4) +create_test_value_int32(TestSetValueItemSlice5) def create_test_value_tensor_fp32(parent): @@ -543,6 +556,7 @@ def _get_answer(self): create_test_value_tensor_fp32(TestSetValueItemSlice2) create_test_value_tensor_fp32(TestSetValueItemSlice3) create_test_value_tensor_fp32(TestSetValueItemSlice4) +create_test_value_tensor_fp32(TestSetValueItemSlice5) # 3. 
Test different shape of value From e0b4efa8f96c40207ae2198dcd8da55897970206 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 20 Jul 2022 05:57:34 -0500 Subject: [PATCH 07/12] Update api changing approve members (#44463) * update api approve members, test=document_fix * add qingqnig into list, test=document_fix --- tools/check_api_approvals.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 87edff50ef85e..49d614fa99107 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -41,13 +41,13 @@ function add_failed(){ api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec` api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then - echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" + echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" - check_approval 1 46782768 47554610 328693 + check_approval 1 46782768 8555991 47554610 7845005 check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi From dafe855e6110ebde453b3b53aed1cdf21137d6b0 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 20 Jul 2022 19:17:02 +0800 Subject: [PATCH 08/12] fix bug,test=document_fix (#44478) --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ea1c55af46ea0..056df18ee42f1 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -200,6 +200,7 @@ function cmake_base() { if [ "$CMD" != "assert_file_approvals" ];then which python python -V + python -m pip install distro python ${PADDLE_ROOT}/tools/summary_env.py bash ${PADDLE_ROOT}/tools/get_cpu_info.sh fi From 2883e4b21be2c3bb34da0b648719d2eb0da26685 Mon Sep 17 00:00:00 2001 From: lyq <30404405+affectionlu@users.noreply.github.com> Date: Wed, 20 Jul 2022 20:24:46 +0800 Subject: [PATCH 09/12] [Phi] migrate clip_by_norm to phi (#44458) --- paddle/fluid/operators/clip_by_norm_op.cc | 14 +- paddle/fluid/operators/clip_by_norm_op.cu | 122 ------------------ paddle/fluid/operators/clip_by_norm_op.h | 70 ---------- paddle/fluid/operators/dgc_clip_by_norm_op.h | 37 +++++- paddle/phi/api/yaml/legacy_api.yaml | 8 ++ paddle/phi/infermeta/unary.cc | 12 ++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/clip_by_norm_kernel.h | 27 ++++ paddle/phi/kernels/cpu/clip_by_norm_kernel.cc | 34 +++++ paddle/phi/kernels/gpu/clip_by_norm_kernel.cu | 89 +++++++++++++ .../kernels/impl/clip_by_norm_kernel_impl.h | 55 ++++++++ .../selected_rows/clip_by_norm_kernel.h | 29 +++++ .../selected_rows/cpu/clip_by_norm_kernel.cc | 22 ++++ .../selected_rows/gpu/clip_by_norm_kernel.cu | 27 ++++ .../impl/clip_by_norm_kernel_impl.h | 45 +++++++ paddle/phi/ops/compat/clip_by_norm_sig.cc | 30 +++++ python/paddle/fluid/layers/nn.py | 2 + 
.../tests/unittests/test_clip_by_norm_op.py | 7 +- 18 files changed, 429 insertions(+), 203 deletions(-) delete mode 100644 paddle/fluid/operators/clip_by_norm_op.cu create mode 100644 paddle/phi/kernels/clip_by_norm_kernel.h create mode 100644 paddle/phi/kernels/cpu/clip_by_norm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/clip_by_norm_kernel.cu create mode 100644 paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h create mode 100644 paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h create mode 100644 paddle/phi/ops/compat/clip_by_norm_sig.cc diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index cfb56a4b2a6b1..3805e11d752e3 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/clip_by_norm_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(clip_by_norm, + ClipByNormInferShapeFunctor, + PD_INFER_META(phi::ClipByNormInferMeta)); + REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, - ops::ClipByNormOpMaker); - -REGISTER_OP_CPU_KERNEL(clip_by_norm, - ops::ClipByNormKernel); + ops::ClipByNormOpMaker, + ClipByNormInferShapeFunctor); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu deleted file mode 100644 index b747682716b3f..0000000000000 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template <> -class ClipByNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - auto& dev_ctx = - context.template device_context(); - - Tensor* output = nullptr; - const Tensor* input = nullptr; - if (in_var->IsType()) { - input = context.Input("X"); - - output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); - - // merge ids in selected rows first - math::scatter::MergeAdd - merge_func; - phi::SelectedRows* merged_input = - const_cast(context.scope()) - .Var() - ->GetMutable(); - merge_func(context.template device_context(), - *x, - merged_input); - input = &(merged_input->value()); - - phi::SelectedRows* output_selected_rows = - context.Output("Out"); - output_selected_rows->set_rows(merged_input->rows()); - output_selected_rows->set_height(merged_input->height()); - output = output_selected_rows->mutable_value(); - output->Resize(merged_input->value().dims()); - output->mutable_data(context.GetPlace()); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor and " - "SelectedRows types, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - std::vector reduce_dims; - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - Tensor tmp = context.AllocateTmpTensor( - {1}, dev_ctx); - TensorReduceImpl>( - dev_ctx, - *input, - &tmp, - kps::SquareFunctor(), - reduce_dims, - dev_ctx.stream()); - auto tmp_eigen = EigenVector::Flatten(tmp); - auto x_norm = tmp_eigen.sqrt(); - - auto x = EigenVector::Flatten(*input); - auto out = EigenVector::Flatten(*output); - - auto& place = - *context.template device_context() - .eigen_device(); - - auto temp = (x_norm <= max_norm).template cast(); - auto epsilon = - ((x_norm <= static_cast(1e-30)).all().template cast()) * - static_cast(1e-6); - - auto scaling = - (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) - .template cast(); - Eigen::array one_dim{{1}}; - Eigen::DSizes m_dsize(input->numel()); - - out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - clip_by_norm, - ops::ClipByNormKernel, - ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 7387821338cd9..6fde5106f10a4 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -30,76 +30,6 @@ template using EigenVector = framework::EigenVector; -template -class ClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - Tensor* output = nullptr; - const Tensor* input = nullptr; - if 
(in_var->IsType()) { - input = context.Input("X"); - - output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); - - // merge ids in selected rows first - math::scatter::MergeAdd merge_func; - phi::SelectedRows* merged_input = - const_cast(context.scope()) - .Var() - ->GetMutable(); - merge_func( - context.template device_context(), *x, merged_input); - input = &(merged_input->value()); - - phi::SelectedRows* output_selected_rows = - context.Output("Out"); - output_selected_rows->set_rows(merged_input->rows()); - output_selected_rows->set_height(merged_input->height()); - output = output_selected_rows->mutable_value(); - output->Resize(merged_input->value().dims()); - output->mutable_data(context.GetPlace()); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor and " - "SelectedRows types, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - - auto x = EigenVector::Flatten(*input); - auto out = EigenVector::Flatten(*output); - auto x_norm = x.square().sum().sqrt(); - auto& place = - *context.template device_context().eigen_device(); - - auto temp = (x_norm <= max_norm).template cast(); - auto epsilon = - ((x_norm <= static_cast(1e-30)).all().template cast()) * - static_cast(1e-6); - - auto scaling = - temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon); - Eigen::array one_dim{{1}}; - Eigen::DSizes m_dsize(input->numel()); - if (context.GetPlace() == platform::CPUPlace()) { - out.device(place) = - x * scaling.reshape(one_dim).eval().broadcast(m_dsize); - } else { - out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); - } - } -}; - class ClipByNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index 197bf59b2a470..27c30a8997b2c 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -15,20 +15,24 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/clip_by_norm_op.h" +#include "paddle/phi/kernels/clip_by_norm_kernel.h" +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template -class DGCClipByNormKernel : public ClipByNormKernel { +class DGCClipByNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto rampup_begin_step = context.Attr("rampup_begin_step"); + void Compute(const framework::ExecutionContext& ctx) const override { + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); if (static_cast(rampup_begin_step) < 0) { return; } - auto current_step_tensor = context.Input("current_step"); + auto current_step_tensor = ctx.Input("current_step"); auto* current_step = current_step_tensor->data(); VLOG(10) << "current_step:" << *current_step @@ -41,7 +45,30 @@ class DGCClipByNormKernel : public ClipByNormKernel { return; } - return ClipByNormKernel::Compute(context); + auto in_var = ctx.InputVar("X"); + auto max_norm = ctx.Attr("max_norm"); + auto& dev_ctx = ctx.device_context(); + + if (in_var->IsType()) { + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + return phi::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + y); + } else if (in_var->IsType()) { + auto* x = ctx.Input("X"); + phi::SelectedRows* output_selected_rows = + ctx.Output("Out"); + return phi::sr::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + output_selected_rows); + } }; }; diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 40fbdc9a9170d..a562db94745c9 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -368,6 +368,14 @@ func : clip backward : clip_grad +- api : clip_by_norm + args : (Tensor x, float max_norm) + output : Tensor(out) + infer_meta : + func : ClipByNormInferMeta + kernel : + func : clip_by_norm + - api : complex args : (Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 7b1c6dfe65a04..35cada2c325e5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -264,6 +264,18 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { out->set_dtype(x.dtype()); } +void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out) { + PADDLE_ENFORCE_GT( + max_norm, + 0, + phi::errors::InvalidArgument("max_norm should be greater than 0. " + "Received max_norm is %f.", + max_norm)); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(dtype == DataType::UNDEFINED ? 
x.dtype() : dtype); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index e825ba98f44e3..1a0da23600339 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -62,6 +62,8 @@ void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); +void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out); + void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); void CumInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/clip_by_norm_kernel.h b/paddle/phi/kernels/clip_by_norm_kernel.h new file mode 100644 index 0000000000000..debff5d08b646 --- /dev/null +++ b/paddle/phi/kernels/clip_by_norm_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& x, + float max_norm, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc b/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc new file mode 100644 index 0000000000000..8d8e27dda32b4 --- /dev/null +++ b/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + return ClipByNormFunctor(dev_ctx, in, max_norm, output); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + clip_by_norm, CPU, ALL_LAYOUT, phi::ClipByNormKernel, float) {} diff --git a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu new file mode 100644 index 0000000000000..6c3abf843f998 --- /dev/null +++ b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/clip_by_norm_kernel.h" + +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + if (typeid(T) == typeid(float)) { + return ClipByNormFunctor(dev_ctx, in, max_norm, output); + } + auto input = ∈ + dev_ctx.template Alloc(output); + + PADDLE_ENFORCE_NOT_NULL(input, + phi::errors::InvalidArgument( + "Input(X) of ClipByNormOp should not be null. " + "Please check if it is created correctly.")); + std::vector reduce_dims; + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + DenseTensor tmp_tensor; + auto* tmp = &tmp_tensor; + tmp->Resize({1}); + dev_ctx.template Alloc(tmp); + phi::funcs::ReduceKernel>( + dev_ctx, + *input, + tmp, + kps::SquareFunctor(), + reduce_dims); + auto tmp_eigen = phi::EigenVector::Flatten(*tmp); + auto x_norm = tmp_eigen.sqrt(); + + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); + auto* place = dev_ctx.eigen_device(); + + auto temp = (x_norm <= max_norm).template cast(); + auto epsilon = + ((x_norm <= static_cast(1e-30)).all().template cast()) * + static_cast(1e-6); + + auto scaling = + (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) + .template cast(); + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + + out.device(*place) = x * scaling.reshape(one_dim).broadcast(m_dsize); +} + +} // namespace phi + +PD_REGISTER_KERNEL(clip_by_norm, + GPU, + ALL_LAYOUT, + phi::ClipByNormKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h b/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h new file mode 100644 index 0000000000000..079254bb8284c --- /dev/null +++ b/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
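For reference, the CUDA kernel above and the shared ClipByNormFunctor introduced just below in this header implement the same scaling rule. A minimal NumPy sketch of that rule, written only against the expressions visible in the diff (clip_by_norm_reference is a hypothetical helper name, not a Paddle symbol):

    import numpy as np

    def clip_by_norm_reference(x, max_norm):
        # Leave x unchanged when ||x||_2 <= max_norm, otherwise rescale it so
        # its L2 norm equals max_norm; the 1e-6 epsilon only applies when the
        # norm underflows (<= 1e-30), mirroring the Eigen expressions above.
        x = np.asarray(x, dtype=np.float32)
        x_norm = np.sqrt(np.square(x).sum())
        keep = np.float32(x_norm <= max_norm)
        epsilon = np.float32(1e-6) if x_norm <= np.float32(1e-30) else np.float32(0.0)
        scaling = keep + (np.float32(1.0) - keep) * max_norm / (x_norm + epsilon)
        return x * scaling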
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ClipByNormFunctor(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + auto input = ∈ + dev_ctx.template Alloc(output); + + PADDLE_ENFORCE_NOT_NULL(input, + phi::errors::InvalidArgument( + "Input(X) of ClipByNormOp should not be null. " + "Please check if it is created correctly.")); + + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); + auto* place = dev_ctx.eigen_device(); + + auto temp = (x_norm <= max_norm).template cast(); + auto epsilon = ((x_norm <= static_cast(1e-30)).all().template cast()) * + static_cast(1e-6); + + auto scaling = + temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon); + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + if (dev_ctx.GetPlace() == phi::CPUPlace()) { + out.device(*place) = x * scaling.reshape(one_dim).eval().broadcast(m_dsize); + } else { + out.device(*place) = x * scaling.reshape(one_dim).broadcast(m_dsize); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h b/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h new file mode 100644 index 0000000000000..975aac23ff3ac --- /dev/null +++ b/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void ClipByNormKernel(const Context& dev_ctx, + const SelectedRows& x, + float max_norm, + SelectedRows* out); +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc new file mode 100644 index 0000000000000..ecefe8f74bb72 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h" + +PD_REGISTER_KERNEL( + clip_by_norm_sr, CPU, ALL_LAYOUT, phi::sr::ClipByNormKernel, float) {} diff --git a/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu new file mode 100644 index 0000000000000..4245aa35b3918 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h" + +PD_REGISTER_KERNEL(clip_by_norm_sr, + GPU, + ALL_LAYOUT, + phi::sr::ClipByNormKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h new file mode 100644 index 0000000000000..5d79393a32d66 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/clip_by_norm_kernel.h" +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +namespace phi { +namespace sr { + +template +void ClipByNormKernel(const Context& dev_ctx, + const SelectedRows& x, + float max_norm, + SelectedRows* out) { + phi::SelectedRows merged_input; + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, x, &merged_input); + auto input = &(merged_input.value()); + out->set_rows(merged_input.rows()); + out->set_height(merged_input.height()); + auto out_tensor = out->mutable_value(); + out_tensor->Resize(merged_input.value().dims()); + return phi::ClipByNormKernel( + dev_ctx, *input, max_norm, out_tensor); +} + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/ops/compat/clip_by_norm_sig.cc b/paddle/phi/ops/compat/clip_by_norm_sig.cc new file mode 100644 index 0000000000000..8a2cecc0293d3 --- /dev/null +++ b/paddle/phi/ops/compat/clip_by_norm_sig.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 050d6bfcb6bbb..e68b70107c109 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13043,6 +13043,8 @@ def clip_by_norm(x, max_norm, name=None): # [[0.5, 0.5], [0.5, 0.5]] """ + if in_dygraph_mode(): + return _C_ops.final_state_clip_by_norm(x, max_norm) if _non_static_mode(): return _C_ops.clip_by_norm(x, 'max_norm', max_norm) diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 8eb4c7a8be965..04b9c5b8b8bc7 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -27,6 +27,7 @@ class TestClipByNormOp(OpTest): def setUp(self): self.max_relative_error = 0.006 + self.python_api = fluid.layers.clip_by_norm self.init_dtype() self.initTestCase() input = np.random.random(self.shape).astype(self.dtype) @@ -45,7 +46,7 @@ def setUp(self): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def initTestCase(self): self.shape = (100, ) @@ -85,7 +86,9 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=0.001) + self.check_output_with_place(place, + atol=0.001, + check_eager=True) class TestClipByNormOpFp16Case1(TestClipByNormOpFp16): From fbfdea5148897561d3b668dcdfd29adaee1038e4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 20 Jul 2022 07:49:26 -0500 Subject: [PATCH 10/12] add eigen3 dependency for phi_backends (#44479) --- paddle/phi/backends/CMakeLists.txt | 3 ++- paddle/phi/backends/custom/CMakeLists.txt | 18 ------------------ 2 files changed, 2 insertions(+), 19 deletions(-) delete mode 100644 paddle/phi/backends/custom/CMakeLists.txt diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index a2d4b1deef6e6..de4d82b46133c 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(dynload) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc) -set(BACKENDS_DEPS enforce place flags) +set(BACKENDS_DEPS enforce place flags eigen3) if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc @@ -40,6 +40,7 @@ endif() add_library(phi_backends "${BACKENDS_SRCS}") target_link_libraries(phi_backends ${BACKENDS_DEPS}) +add_dependencies(phi_backends eigen3) # for inference library get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt deleted file mode 100644 index ceff429f8e596..0000000000000 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if(WITH_CUSTOM_DEVICE) - cc_library( - custom_context - SRCS custom_context.cc - DEPS phi_device_context 
device_manager) - cc_library( - custom_device - SRCS custom_device.cc - DEPS device_base device_context) - cc_test( - custom_device_test - SRCS custom_device_test.cc - DEPS device_manager device_context) - cc_test( - capi_test - SRCS capi_test.cc - DEPS phi_capi) -endif() From 15dd94abf25ae5d91b8f8890c361ac8ffe9dc41b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Jul 2022 21:20:04 +0800 Subject: [PATCH 11/12] remove fleet_13 ut in parallel_UT_rule.py; test=develop (#44477) --- tools/parallel_UT_rule.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 53ab93f57ce56..559f2d95b915f 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -671,8 +671,7 @@ 'test_trt_convert_reduce_sum', 'save_quant2_model_lstm', 'test_trt_convert_slice', - 'test_quant2_int8_lstm_mkldnn', - 'test_dist_fleet_ps13' + 'test_quant2_int8_lstm_mkldnn' ] # mem=0 but always timeout or failed : It run 15 job each time in Single cases; From 98e9685394858bf834fdedd0a70d33d9f06226a8 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 20 Jul 2022 21:31:31 +0800 Subject: [PATCH 12/12] [PHI]Seperate xshape kernel from normal kernel (#44315) * seperate xshape kernel from normal kernel * fix bugs in infermeta * fix compile bugs * fix compile bugs --- paddle/fluid/operators/einsum_op.cc | 2 +- paddle/fluid/operators/squeeze_op.cc | 2 +- paddle/fluid/operators/unsqueeze_op.cc | 2 +- paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/yaml/legacy_api.yaml | 12 ++-- paddle/phi/infermeta/unary.cc | 56 +++++++++++++------ paddle/phi/infermeta/unary.h | 25 +++++++-- paddle/phi/kernels/cpu/einsum_kernel.cc | 11 +++- paddle/phi/kernels/cpu/squeeze_kernel.cc | 15 +++++ paddle/phi/kernels/cpu/unsqueeze_kernel.cc | 16 ++++++ paddle/phi/kernels/gpu/einsum_kernel.cu | 13 ++++- paddle/phi/kernels/gpu/squeeze_kernel.cu | 16 ++++++ paddle/phi/kernels/gpu/unsqueeze_kernel.cu | 17 ++++++ paddle/phi/kernels/impl/solve_kernel_impl.h | 6 +- paddle/phi/kernels/impl/squeeze_kernel_impl.h | 13 ++++- .../phi/kernels/impl/unsqueeze_kernel_impl.h | 12 +++- paddle/phi/kernels/squeeze_kernel.h | 11 +++- paddle/phi/kernels/unsqueeze_kernel.h | 14 +++-- paddle/phi/ops/compat/einsum_sig.cc | 10 +++- paddle/phi/ops/compat/squeeze_sig.cc | 7 ++- paddle/phi/ops/compat/unsqueeze_sig.cc | 36 ++++++++---- 21 files changed, 239 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 0e33efab90a85..5f169e20e3dc3 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -106,7 +106,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(einsum, EinsumInferShapeFunctor, - PD_INFER_META(phi::EinsumInferMeta)); + PD_INFER_META(phi::EinsumRawInferMeta)); REGISTER_OPERATOR(einsum, ops::EinsumOp, diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index f532a429b49e2..b3c70e2fe9988 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -347,7 +347,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, SqueezeInferShapeFunctor, - PD_INFER_META(phi::SqueezeInferMeta)); + PD_INFER_META(phi::SqueezeWithXShapeInferMeta)); REGISTER_OPERATOR(squeeze, ops::SqueezeOp, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 53de6440f1f61..f01ae5f142d28 100644 --- 
a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -347,7 +347,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor, - PD_INFER_META(phi::UnsqueezeInferMeta)); + PD_INFER_META(phi::UnsqueezeWithXShapeInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(unsqueeze, diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 05d27571b8795..cb7f439690619 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -325,8 +325,8 @@ add_custom_command( ${dygraph_api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp} ${dygraph_api_source_file} - DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file} - ${api_gen_base} ${api_gen_file} + DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${sparse_api_yaml_file} + ${im_api_gen_file} ${api_gen_base} ${api_gen_file} VERBATIM) # generate wrapped infermeta diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index a562db94745c9..0d0fd74c17aa7 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -582,10 +582,10 @@ args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} infer_meta : - func : EinsumInferMeta + func : EinsumRawInferMeta param : [x, equation] kernel : - func : einsum + func : einsum_raw backward : einsum_grad - api : elementwise_pow @@ -2047,9 +2047,9 @@ args : (Tensor x, int[] axes) output : Tensor(out), Tensor(xshape) infer_meta : - func : SqueezeInferMeta + func : SqueezeWithXShapeInferMeta kernel : - func : squeeze + func : squeeze_with_xshape view: (x -> out) intermediate : xshape backward : squeeze_grad @@ -2290,9 +2290,9 @@ args : (Tensor x, IntArray axis) output : Tensor(out), Tensor(xshape) infer_meta : - func : UnsqueezeInferMeta + func : UnsqueezeWithXShapeInferMeta kernel : - func : unsqueeze + func : unsqueeze_with_xshape view: (x -> out) intermediate : xshape backward : unsqueeze_grad diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 35cada2c325e5..c7699c34cc546 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -570,9 +570,7 @@ void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out, - std::vector inner_cache, - std::vector xshape) { + MetaTensor* out) { // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -609,6 +607,14 @@ void EinsumInferMeta(const std::vector& inputs, VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); out->set_dims(make_ddim(output_dims)); out->set_dtype(inputs[0]->dtype()); +} + +void EinsumRawInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out, + std::vector inner_cache, + std::vector xshape) { + EinsumInferMeta(inputs, equation, out); for (size_t i = 0; i < xshape.size(); ++i) { if (xshape[i] != nullptr) { xshape[i]->set_dims(inputs[i]->dims()); @@ -2448,8 +2454,7 @@ void SplitInferMeta(const MetaTensor& x, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* out, - MetaTensor* xshape) { + MetaTensor* out) { const auto& x_dims = x.dims(); // Check input tensor dims (<6) Eigen limit. 
PADDLE_ENFORCE_LE(x_dims.size(), @@ -2469,15 +2474,25 @@ void SqueezeInferMeta(const MetaTensor& x, out->share_lod(x); } + out->set_dtype(x.dtype()); +} + +void SqueezeWithXShapeInferMeta(const MetaTensor& x, + const std::vector& axes, + MetaTensor* out, + MetaTensor* xshape) { + SqueezeInferMeta(x, axes, out); + const auto& x_dims = x.dims(); std::vector xshape_dims(x_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < x_dims.size(); ++i) { xshape_dims[i + 1] = x_dims[i]; } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - xshape->set_dtype(x.dtype()); - out->set_dtype(x.dtype()); + if (xshape) { + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + xshape->set_dtype(x.dtype()); + } } void StridedSliceRawInferMeta(const MetaTensor& x, @@ -3310,7 +3325,6 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* out, - MetaTensor* xshape, MetaConfig config) { const auto& x_dims = x.dims(); // Validity Check: input tensor dims (<6). @@ -3339,14 +3353,22 @@ void UnsqueezeInferMeta(const MetaTensor& x, } out->set_dtype(x.dtype()); } - if (xshape) { - // set xshape dims. - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; - } +} +void UnsqueezeWithXShapeInferMeta(const MetaTensor& x, + const IntArray& axes, + MetaTensor* out, + MetaTensor* xshape, + MetaConfig config) { + const auto& x_dims = x.dims(); + UnsqueezeInferMeta(x, axes, out, config); + // set xshape dims. + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + if (xshape) { xshape->set_dims(phi::make_ddim(xshape_dims)); xshape->share_lod(x); xshape->set_dtype(x.dtype()); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 1a0da23600339..ea7364e643960 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -97,9 +97,13 @@ void EigvalsInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out, - std::vector inner_cache, - std::vector xshape); + MetaTensor* out); + +void EinsumRawInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out, + std::vector inner_cache, + std::vector xshape); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, @@ -341,8 +345,12 @@ void SplitInferMeta(const MetaTensor& x_meta, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* out, - MetaTensor* xshape); + MetaTensor* out); + +void SqueezeWithXShapeInferMeta(const MetaTensor& x, + const std::vector& axes, + MetaTensor* out, + MetaTensor* xshape); void StridedSliceRawInferMeta(const MetaTensor& x, const std::vector& axes, @@ -470,9 +478,14 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* out, - MetaTensor* xshape, MetaConfig config = MetaConfig()); +void UnsqueezeWithXShapeInferMeta(const MetaTensor& x, + const IntArray& axes, + MetaTensor* out, + MetaTensor* xshape, + MetaConfig config = MetaConfig()); + void UnStackInferMeta(const MetaTensor& x, int axis, int num, diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 901c1fed628d3..7ef85a942e435 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc 
@@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, +PD_REGISTER_KERNEL(einsum_raw, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, @@ -26,3 +26,12 @@ PD_REGISTER_KERNEL(einsum, double, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(einsum, + CPU, + ALL_LAYOUT, + phi::EinsumKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/squeeze_kernel.cc b/paddle/phi/kernels/cpu/squeeze_kernel.cc index 7d5a6ca4e884e..d22efdf969440 100644 --- a/paddle/phi/kernels/cpu/squeeze_kernel.cc +++ b/paddle/phi/kernels/cpu/squeeze_kernel.cc @@ -32,3 +32,18 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(squeeze_with_xshape, + CPU, + ALL_LAYOUT, + phi::SqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + bool, + int, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc index 0152a31f80ba8..612e1a78cc5bb 100644 --- a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc @@ -33,3 +33,19 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(unsqueeze_with_xshape, + CPU, + ALL_LAYOUT, + phi::UnsqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + bool, + int, + int16_t, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index b3706710c40e3..99a9c58995c1f 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, +PD_REGISTER_KERNEL(einsum_raw, GPU, ALL_LAYOUT, phi::EinsumKernelRaw, @@ -28,3 +28,14 @@ PD_REGISTER_KERNEL(einsum, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(einsum, + GPU, + ALL_LAYOUT, + phi::EinsumKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/squeeze_kernel.cu b/paddle/phi/kernels/gpu/squeeze_kernel.cu index ae15e210a02e7..06ddba2ef1c2b 100644 --- a/paddle/phi/kernels/gpu/squeeze_kernel.cu +++ b/paddle/phi/kernels/gpu/squeeze_kernel.cu @@ -33,3 +33,19 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(squeeze_with_xshape, + GPU, + ALL_LAYOUT, + phi::SqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16, + bool, + int, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu index 86b4462254637..2e7bae8666d24 100644 --- a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu +++ b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu @@ -34,3 +34,20 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(unsqueeze_with_xshape, + GPU, + ALL_LAYOUT, + phi::UnsqueezeWithXShapeKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + bool, + int, + int16_t, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + 
phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h index 09c9e74dd207a..4120823a9d2e9 100644 --- a/paddle/phi/kernels/impl/solve_kernel_impl.h +++ b/paddle/phi/kernels/impl/solve_kernel_impl.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/expand_as_kernel.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" @@ -77,7 +79,7 @@ static std::vector get_broadcast_batch_portion( static inline std::vector convert_to_int_vec(std::vector a) { std::vector ret; for (size_t i = 0; i < a.size(); i++) { - ret.emplace_back(int(a[i])); + ret.emplace_back(static_cast(a[i])); } return ret; @@ -167,7 +169,7 @@ static void linalg_solve(const Context& dev_ctx, out_tmp.Resize(out->dims()); out_tmp = *out; - phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out, nullptr); + phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out); } else { PADDLE_ENFORCE_EQ( x_dim[x_dim_size - 1], diff --git a/paddle/phi/kernels/impl/squeeze_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_kernel_impl.h index b4c94d619cc2a..156a71973a794 100644 --- a/paddle/phi/kernels/impl/squeeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/squeeze_kernel_impl.h @@ -22,8 +22,7 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true); @@ -31,4 +30,14 @@ void SqueezeKernel(const Context& dev_ctx, phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(out_dims); } + +template +void SqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + DenseTensor* out, + DenseTensor* xshape) { + SqueezeKernel(dev_ctx, x, axes, out); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h index 4f81fa6c42341..5bef856d19b72 100644 --- a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h @@ -22,8 +22,7 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = out->dims(); if (axes.FromTensor()) { @@ -39,4 +38,13 @@ void UnsqueezeKernel(const Context& dev_ctx, phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(out_dims); // copy will reset the dims. 
} + +template +void UnsqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape) { + UnsqueezeKernel(dev_ctx, x, axes, out); +} } // namespace phi diff --git a/paddle/phi/kernels/squeeze_kernel.h b/paddle/phi/kernels/squeeze_kernel.h index bd8f508cbb1db..1c6aeedbe5161 100644 --- a/paddle/phi/kernels/squeeze_kernel.h +++ b/paddle/phi/kernels/squeeze_kernel.h @@ -23,6 +23,13 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* out, - DenseTensor* xshape); + DenseTensor* out); + +template +void SqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + DenseTensor* out, + DenseTensor* xshape); + } // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index 62ba878c056cb..35a0515c92da3 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -25,8 +25,14 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* out, - DenseTensor* xshape); + DenseTensor* out); + +template +void UnsqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape); template void Unsqueeze(const Context& dev_ctx, @@ -35,8 +41,8 @@ void Unsqueeze(const Context& dev_ctx, DenseTensor* out, DenseTensor* xshape) { MetaTensor meta_out(out); - UnsqueezeInferMeta(x, axes, &meta_out, nullptr, MetaConfig()); - UnsqueezeKernel(dev_ctx, x, axes, out, nullptr); + UnsqueezeInferMeta(x, axes, &meta_out); + UnsqueezeKernel(dev_ctx, x, axes, out); } } // namespace phi diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 4fd31c1a2d842..e5aa570985596 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,8 +17,14 @@ limitations under the License. 
*/ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); + if (ctx.OutputSize("XShape") > 0 && ctx.OutputSize("InnerCache") > 0) { + return KernelSignature("einsum_raw", + {"Operands"}, + {"equation"}, + {"Out", "InnerCache", "XShape"}); + } else { + return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); + } } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc index cd6d5fc7253df..a251b9f537ccf 100644 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ b/paddle/phi/ops/compat/squeeze_sig.cc @@ -18,7 +18,12 @@ namespace phi { KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out", "XShape"}); + if (ctx.HasOutput("XShape")) { + return KernelSignature( + "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); + } else { + return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out"}); + } } KernelSignature SqueezeGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc index aee83933e5b97..a2f184e7150b8 100644 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ b/paddle/phi/ops/compat/unsqueeze_sig.cc @@ -18,17 +18,33 @@ namespace phi { KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); + if (ctx.HasOutput("XShape")) { + if (ctx.InputSize("AxesTensorList") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensorList"; + return KernelSignature("unsqueeze_with_xshape", + {"X"}, + {"AxesTensorList"}, + {"Out", "XShape"}); + } else if (ctx.InputSize("AxesTensor") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensor"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); + } else { + VLOG(2) << "unsqueeze2 in axes"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); + } } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out", "XShape"}); + if (ctx.InputSize("AxesTensorList") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensorList"; + return KernelSignature("unsqueeze", {"X"}, {"AxesTensorList"}, {"Out"}); + } else if (ctx.InputSize("AxesTensor") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensor"; + return KernelSignature("unsqueeze", {"X"}, {"AxesTensor"}, {"Out"}); + } else { + VLOG(2) << "unsqueeze2 in axes"; + return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out"}); + } } }
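Taken together, this last patch splits squeeze, unsqueeze and einsum into a plain kernel plus a *_with_xshape / einsum_raw variant, and the compat signatures above choose between them based on whether the op carries XShape (and, for einsum, InnerCache) outputs. The observable Python behavior is unchanged by the split; a minimal usage sketch, assuming the standard paddle 2.x dygraph API:

    import paddle

    x = paddle.rand([3, 1, 4])
    y = paddle.squeeze(x, axis=1)      # drops the size-1 dim -> shape [3, 4]
    z = paddle.unsqueeze(y, axis=[0])  # inserts a leading dim -> shape [1, 3, 4]
    print(y.shape, z.shape)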