From b8d106e1c57ff6a06d91b5b5c1232cb54b6e47b7 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:02:37 +0800 Subject: [PATCH 01/12] =?UTF-8?q?=E3=80=90GPUPS=E3=80=91Adam=20accessor=20?= =?UTF-8?q?(#43919)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add adam/sharedadam optimzier for gpups;edit optimizer struct;test=develop --- .../distributed/ps/table/ctr_dymf_accessor.cc | 28 +- .../distributed/ps/table/ctr_dymf_accessor.h | 19 +- .../distributed/ps/table/sparse_sgd_rule.cc | 84 ++- .../distributed/ps/table/sparse_sgd_rule.h | 23 + paddle/fluid/distributed/ps/table/table.cc | 1 + .../distributed/ps/wrapper/CMakeLists.txt | 1 + paddle/fluid/distributed/ps/wrapper/fleet.cc | 45 +- .../framework/distributed_strategy.proto | 6 +- paddle/fluid/framework/fleet/CMakeLists.txt | 15 +- paddle/fluid/framework/fleet/heter_context.h | 2 - .../framework/fleet/heter_ps/CMakeLists.txt | 8 +- .../framework/fleet/heter_ps/feature_value.cu | 192 +++++ .../framework/fleet/heter_ps/feature_value.h | 705 ++++++++++++++++++ .../fleet/heter_ps/graph_gpu_ps_table.h | 6 +- .../framework/fleet/heter_ps/hashtable.h | 12 +- .../fleet/heter_ps/hashtable_kernel.cu | 162 ++-- .../framework/fleet/heter_ps/heter_comm.h | 27 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 433 ++++++----- .../fleet/heter_ps/heter_comm_kernel.cu | 173 +++-- .../fleet/heter_ps/heter_comm_kernel.h | 52 +- .../framework/fleet/heter_ps/heter_ps.cc | 43 +- .../framework/fleet/heter_ps/heter_ps.cu | 145 ++-- .../fluid/framework/fleet/heter_ps/heter_ps.h | 25 +- .../framework/fleet/heter_ps/heter_ps_base.h | 20 +- .../fluid/framework/fleet/heter_ps/mem_pool.h | 14 - .../framework/fleet/heter_ps/optimizer.cuh.h | 472 ++++++++++-- .../framework/fleet/heter_ps/optimizer_conf.h | 28 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 310 +++----- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 326 +------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 211 ++++-- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 179 ++--- .../fleet/base/distributed_strategy.py | 15 + python/paddle/distributed/ps/the_one_ps.py | 2 +- .../tests/unittests/test_dist_fleet_ps13.py | 201 +++++ .../test_fleet_distributed_strategy.py | 8 + tools/parallel_UT_rule.py | 3 +- 36 files changed, 2714 insertions(+), 1282 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/feature_value.cu create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index c65eac99acc03..4feee70fed751 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() { _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), _config.embedx_dim()); + common_feature_value.optimizer_name = name; common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim(); common_feature_value.embedx_dim = _config.embedx_dim(); @@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() { if (_config.ctr_accessor_param().show_scale()) { _show_scale = true; } - VLOG(0) << " INTO CtrDymfAccessor::Initialize()"; + VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:" + << common_feature_value.embed_sgd_dim + << " embedx_dim:" << common_feature_value.embedx_dim + << " 
embedx_sgd_dim:" << common_feature_value.embedx_sgd_dim; InitAccessorInfo(); return 0; } @@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() { auto embedx_dim = _config.embedx_dim(); VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim; - _accessor_info.select_dim = 3 + embedx_dim; + _accessor_info.select_dim = 4 + embedx_dim; _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); - _accessor_info.update_dim = 4 + embedx_dim; + _accessor_info.update_dim = 5 + embedx_dim; _accessor_info.update_size = _accessor_info.update_dim * sizeof(float); _accessor_info.mf_size = (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float); @@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) { value[common_feature_value.ClickIndex()] = 0; value[common_feature_value.SlotIndex()] = -1; value[common_feature_value.MfDimIndex()] = -1; - _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(), - value + common_feature_value.EmbedG2SumIndex()); + _embed_sgd_rule->InitValue( + value + common_feature_value.EmbedWIndex(), + value + common_feature_value.EmbedG2SumIndex(), + false); // adam embed init not zero, adagrad embed init zero _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(), value + common_feature_value.EmbedxG2SumIndex(), false); @@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { i++) { os << " " << v[i]; } - // os << " " << common_feature_value.Slot(const_cast(v)) << " " - // << common_feature_value.MfDim(const_cast(v)); auto show = common_feature_value.Show(const_cast(v)); auto click = common_feature_value.Click(const_cast(v)); auto score = ShowClickScore(show, click); + auto mf_dim = int(common_feature_value.MfDim(const_cast(v))); if (score >= _config.embedx_threshold() && param > common_feature_value.EmbedxG2SumIndex()) { - // VLOG(1) << "common_feature_value.EmbedxG2SumIndex():" - // << common_feature_value.EmbedxG2SumIndex(); - // VLOG(1) << "common_feature_value.EmbedxWIndex():" - // << common_feature_value.EmbedxWIndex(); - // VLOG(1) << "common_feature_value.MfDim():" - // << common_feature_value.MfDim(const_cast(v)); for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.EmbedxWIndex() + - common_feature_value.MfDim(const_cast(v)); + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index a360030cb7d3d..b820d617d06ae 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor { int ClickIndex() { return ShowIndex() + 1; } int EmbedWIndex() { return ClickIndex() + 1; } int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - int SlotIndex() { return EmbedG2SumIndex() + 1; } + int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } int MfDimIndex() { return SlotIndex() + 1; } int EmbedxG2SumIndex() { return MfDimIndex() + 1; } - int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; } + int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } + + // 根据mf_dim计算的总长度 + int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_name == "SparseAdamSGDRule") { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 7 + embed_sgd_dim + 
tmp_embedx_sgd_dim + mf_dim; + } + + // 根据mf_dim计算的总byte数 + int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); } float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } @@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor { int embed_sgd_dim; int embedx_dim; int embedx_sgd_dim; + std::string optimizer_name; }; struct CtrDymfPushValue { diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 07562f566d326..014d6e450ab4a 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, float beta1_pow_ = *beta1_pow; float beta2_pow_ = *beta2_pow; - // lr not change in one update lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); for (size_t i = 0; i < _embedding_dim; i++) { // Calculation @@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value, *(sgd + Beta1PowIndex()) = _beta1_decay_rate; *(sgd + Beta2PowIndex()) = _beta2_decay_rate; } + +void SparseSharedAdamSGDRule::LoadConfig( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseSharedAdamSGDRule::UpdateValueWork(float* w, + float* sgd, + const float* grad, + float scale) { + float* gsum = sgd + GSumIndex(); + float* g2sum = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float gsum_ = *gsum; + float g2sum_ = *g2sum; + + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + double sum_gsum = 0.0; + double sum_g2sum = 0.0; + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + double new_gsum = + _beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i]; + double new_g2sum = + _beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon)); + BoundValue(w[i]); + sum_gsum += new_gsum; + sum_g2sum += new_g2sum; + } + // update beta_pow_decay + (*gsum) = sum_gsum / _embedding_dim; + (*g2sum) = sum_g2sum / _embedding_dim; + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseSharedAdamSGDRule::InitValueWork(float* value, + float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + BoundValue(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + BoundValue(value[i]); + } + } + // init rule gsum and g2sum + for (int i = GSumIndex(); i < Beta1PowIndex(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + 
Beta1PowIndex()) = _beta1_decay_rate; + *(sgd + Beta2PowIndex()) = _beta2_decay_rate; +} } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index f62cffdf232e7..4fed331ba93ec 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule { float _beta2_decay_rate; float _ada_epsilon; }; + +class SparseSharedAdamSGDRule : public SparseValueSGDRule { + public: + virtual void LoadConfig(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void UpdateValueWork(float* w, + float* sgd, + const float* push_value, + float scale); + virtual void InitValueWork(float* value, float* sgd, bool zero_init); + virtual size_t Dim() { return 4; } + size_t GSumIndex() { return 0; } + size_t G2SumIndex() { return GSumIndex() + 1; } + size_t Beta1PowIndex() { return G2SumIndex() + 1; } + size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + + protected: + float learning_rate_; + float _beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index cfa286f1c3f7f..3e6d5a9941206 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule); int32_t TableManager::Initialize() { static bool initialized = false; diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt index 8b5457ef9eea5..c9cd883dabb69 100644 --- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library( op_registry fs shell + ps_gpu_wrapper ${RPC_DEPS}) target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index bbefeba559916..3d7190cf55336 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#endif namespace paddle { namespace distributed { @@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, worker_ptr_ = std::shared_ptr( paddle::distributed::PSClientFactory::Create(ps_param)); worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index); +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE + VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer"; + auto* accessor = worker_ptr_->GetTableAccessor(0); + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(ps_param); + ps_gpu_wrapper->SetTableAccessor(accessor); +#endif } } else { VLOG(3) << "Client can be initialized only once"; @@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync( int batch_size = -1; bool batch_size_consist = true; for (auto* input : *inputs) { - int cur_batch_size = + size_t cur_batch_size = input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; if (batch_size == -1) { - batch_size = cur_batch_size; - } else if (batch_size != cur_batch_size) { + batch_size = int(cur_batch_size); + } else if (batch_size != int(cur_batch_size)) { // CHECK(batch_size == cur_batch_size); // NOLINT batch_size_consist = false; break; @@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync( } CHECK(batch_size > 0); // NOLINT - int show_size = + size_t show_size = shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0]; - CHECK(show_size == batch_size || show_size == 1); - int clk_size = + CHECK(show_size == size_t(batch_size) || show_size == 1); + size_t clk_size = clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0]; - CHECK(clk_size == batch_size || clk_size == 1); + CHECK(clk_size == size_t(batch_size) || clk_size == 1); CHECK(outputs->size() == inputs->size()); std::vector push_keys; @@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } @@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // slot show clk grad... consistent with CtrCommonPushValue defined in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 
0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b3a01ae169e4e..45758389c5413 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -197,14 +197,14 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; - optional SGDParameter embed_sgd_param = 2; - optional SGDParameter embedx_sgd_param = 3; optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size optional uint32 embedx_threshold = 6 [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SGDParameter embed_sgd_param = 10; + optional SGDParameter embedx_sgd_param = 11; } message SGDParameter { @@ -228,7 +228,7 @@ message repeated float weight_bounds = 4; } -message SparseAdamSGDParameter { // SparseAdamSGDRule +message SparseAdamSGDParameter { // SparseAdamSGDRule | SparseSharedAdamSGDRule optional double learning_rate = 1 [ default = 0.001 ]; optional double initial_range = 2 [ default = 0.0001 ]; optional double beta1_decay_rate = 3 [ default = 0.9 ]; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 42235b7c484e3..4cf3ab8dc1a67 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -25,10 +25,17 @@ endif() if(WITH_HETERPS) if(WITH_NCCL AND WITH_GPU) - nv_library( - ps_gpu_wrapper - SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + if(WITH_PSCORE) + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + else() + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + endif() add_subdirectory(heter_ps) elseif(WITH_XPU_KP) xpu_library( diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3955502c8b808..ef2e73d6dd5b5 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -81,7 +81,6 @@ class HeterContext { std::vector> device_values_; std::vector> device_keys_; std::vector>> device_dim_keys_; - std::vector>> device_dim_values_; std::vector mutex_; std::vector> dim_mutex_; int multi_mf_dim_ = 0; @@ -114,7 +113,6 @@ class HeterContext { value_dim_ptr_[i].resize(dim_num); } device_values_.resize(device_num); - device_dim_values_.resize(device_num); device_keys_.resize(device_num); device_dim_keys_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 7540c6147f4b7..9631502f4f05e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -9,16 +9,16 @@ if(WITH_GPU) endif() nv_library( heter_comm_kernel - SRCS heter_comm_kernel.cu feature_value.h + SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) nv_library( hashtable_kernel - SRCS hashtable_kernel.cu feature_value.h + SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS 
${HETERPS_DEPS}) nv_library( heter_comm - SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h - mem_pool.h + SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc + heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) nv_test( test_heter_comm diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu new file mode 100644 index 0000000000000..560ce33b9af78 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -0,0 +1,192 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + +namespace paddle { +namespace framework { + +template +__global__ void PullCopy(float** dest, + const float* src, + const int64_t* len, + int slot_num, + int total_len, + uint64_t** keys, + uint64_t max_val_size, + int* gpu_dim, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index cb7f3a40d6720..ef4533d64eac2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -17,12 +17,547 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + +class FeatureValueAccessor { + public: + __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ ~FeatureValueAccessor() {} + + __host__ __device__ virtual int Configure( + std::unordered_map config) { + _config = config; + Initialize(); + return 0; + } + __host__ __device__ virtual int Initialize() = 0; + + protected: + std::unordered_map _config; +}; + +// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n +// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n +// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n +class CommonFeatureValueAccessor : public FeatureValueAccessor { + public: + struct CommonFeatureValue { + /* + uint64_t cpu_ptr; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } + __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } + __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } + __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } + __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 
0) + __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } + + // 根据mf_dim计算的总长度 + __host__ __device__ int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; + } + + // 根据mf_dim 计算的总byte数 + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float + } + + // 根据mf_dim 计算的 mf_size byte数 + __host__ __device__ size_t MFSize(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); + } + + __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } + __host__ __device__ int EmbedxWOffsetIndex(float* val) { + // has mf + int tmp_embedx_sgd_dim = 1; + if (int(MfSize(val)) > 0) { + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + } else { + // no mf + return 0; + } + } + + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } + __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } + __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; } + __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } + __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } + __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } + __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } + + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + int optimizer_type_; + }; + + struct CommonPushValue { + /* + float slot; + float show; + float click; + float mf_dim; + float embed_g; + std::vector embedx_g; + */ + + __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } + + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int SlotIndex() { return 0; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } + __host__ __device__ float& Slot(float* val) { + return val[CommonPushValue::SlotIndex()]; + } + __host__ __device__ float& Show(float* val) { + return val[CommonPushValue::ShowIndex()]; + } + __host__ 
__device__ float& Click(float* val) { + return val[CommonPushValue::ClickIndex()]; + } + __host__ __device__ float& MfDim(float* val) { + return val[CommonPushValue::MfDimIndex()]; + } + __host__ __device__ float& EmbedG(float* val) { + return val[CommonPushValue::EmbedGIndex()]; + } + __host__ __device__ float* EmbedxG(float* val) { + return val + CommonPushValue::EmbedxGIndex(); + } + }; + + struct CommonPullValue { + /* + float show; + float click; + float embed_w; + std::vector embedx_w; + */ + + __host__ __device__ static int Dim(int embedx_dim) { + return 3 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int EmbedxWIndex() { return 3; } + __host__ __device__ float& Show(float* val) { + return val[CommonPullValue::ShowIndex()]; + } + __host__ __device__ float& Click(float* val) { + return val[CommonPullValue::ClickIndex()]; + } + __host__ __device__ float& EmbedW(float* val) { + return val[CommonPullValue::EmbedWIndex()]; + } + __host__ __device__ float* EmbedxW(float* val) { + return val + CommonPullValue::EmbedxWIndex(); + } + }; + + __host__ __device__ CommonFeatureValueAccessor() {} + __host__ __device__ ~CommonFeatureValueAccessor() {} + + __host__ __device__ virtual int Initialize() { + int optimizer_type = (_config.find("optimizer_type") == _config.end()) + ? 1 + : int(_config["optimizer_type"]); + int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) + ? 8 + : int(_config["embedx_dim"]); + if (optimizer_type == 3) { // adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; + } else if (optimizer_type == 4) { // shared_adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = 4; + } else { + common_feature_value.embed_sgd_dim = 1; + common_feature_value.embedx_sgd_dim = 1; + } + common_feature_value.optimizer_type_ = optimizer_type; + common_feature_value.embedx_dim = sparse_embedx_dim; + + return 0; + } + + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, + void* cpu, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = 
(uint64_t)(cpu); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + gpu_val[x] = 0; + } + } +#endif + } + + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); + } + float* cpu_val = downpour_value->data(); + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = + gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +#endif + } + + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, + float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + 
dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + dest_val[x] = src_val[x]; + } + } + + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } + + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } + + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } + } + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + } + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, + float* src_val, + uint64_t* key, + int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_feature_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_feature_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_feature_value.EmbedWIndex()]; + } + + if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + 
*(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = + src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; + } + } + } + + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { + /* + uint64_t cpu_ptr; // 2float + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + std::stringstream os; + os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; + for (int i = common_feature_value.EmbedG2SumIndex(); + i < common_feature_value.SlotIndex(); + i++) { + os << " " << v[i]; + } + int mf_dim = int(common_feature_value.MfDim(const_cast(v))); + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; + if (param_size > common_feature_value.EmbedxG2SumIndex()) { + for (auto i = common_feature_value.EmbedxG2SumIndex(); + i < common_feature_value.Dim(mf_dim); + ++i) { + os << " " << v[i]; + } + } + return os.str(); + } + + public: + CommonFeatureValue common_feature_value; + CommonPushValue common_push_value; + CommonPullValue common_pull_value; +}; struct FeatureValue { float delta_score; @@ -95,6 +630,176 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.BuildFill( + (float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim); + } + + virtual void 
DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + CopyForPullImpl(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + slot_num, + hidden_size, + total_length, + gpu_dim, + feature_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector, + slot_mf_dim_vector); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 9a6581c2ae5e3..a4bee2c19bbda 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -25,10 +25,12 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index dbd6130c1461d..43192df0c71f0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -137,8 +137,12 @@ class HashTable { size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, + char* d_vals, + size_t len, + StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -150,9 +154,9 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) - template + template void update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index bb9998249048e..2f5d5697e7c38 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, - size_t pull_feature_value_size) { + size_t pull_feature_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; - FeatureValue* cur = (FeatureValue*)(vals + offset); - FeatureValue& input = *(FeatureValue*)(it->second); - cur->slot = input.slot; - cur->show = input.show; - cur->clk = input.clk; - cur->mf_dim = input.mf_dim; - cur->lr = input.lr; - cur->mf_size = input.mf_size; - cur->cpu_ptr = input.cpu_ptr; - cur->delta_score = input.delta_score; - cur->lr_g2sum = input.lr_g2sum; - for (int j = 0; j < cur->mf_dim + 1; ++j) { - cur->mf[j] = input.mf[j]; - } - } else { - if (keys[i] != 0) { - printf("warning::pull miss key: %llu", keys[i]); - } + float* cur = (float*)(vals + offset); + float* input = it->second; + int mf_dim = + int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, input, mf_dim); } } } @@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); + float* cur = (float*)(grads + i * grad_value_size); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur); } else { if (keys[i] != 0) { printf("warning::push miss key: %llu", keys[i]); @@ -212,17 +201,18 @@ void HashTable::get(const KeyType* d_keys, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, size_t len, - StreamType stream) { + StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_); + container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } template @@ -298,27 +288,6 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = 
downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[2] = gpu_val.delta_score; - cpu_val[3] = gpu_val.show; - cpu_val[4] = gpu_val.clk; - cpu_val[5] = gpu_val.lr; - cpu_val[6] = gpu_val.lr_g2sum; - cpu_val[0] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; - } - } #endif } }; @@ -336,9 +305,9 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { } template -template +template void HashTable::update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream) { @@ -371,8 +340,8 @@ void HashTable::update(const KeyType* d_keys, push_grad_value_size_); } -template class HashTable; -template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -382,15 +351,19 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + float* d_vals, + size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, + char* d_vals, + size_t len, + cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, @@ -399,6 +372,12 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + unsigned long* d_vals, + size_t len, + cudaStream_t stream); + template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get(const long* d_keys, @@ -414,19 +393,19 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const float* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, - size_t len, - char* pool, - size_t feature_value_size, - size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + size_t len, + char* pool, + size_t feature_value_size, + size_t start_index, + cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -460,30 +439,37 @@ template void HashTable::insert( size_t len, cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const unsigned long* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable::update< - paddle::framework::FeaturePushValue, - Optimizer, +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); + +template void +HashTable::update( + const unsigned long* d_keys, + const char* d_grads, + size_t len, + SparseAdagradOptimizer sgd, + cudaStream_t stream); +template void +HashTable::update( + const unsigned long* d_keys, + 
const char* d_grads, + size_t len, + SparseAdamOptimizer sgd, + cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeaturePushValue* d_grads, + const char* d_grads, size_t len, - Optimizer sgd, + SparseAdamSharedOptimizer sgd, cudaStream_t stream); -template void HashTable:: - update, - cudaStream_t>(const unsigned long* d_keys, - const char* d_grads, - size_t len, - Optimizer sgd, - cudaStream_t stream); - // template void HashTable::update< // Optimizer +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); @@ -65,12 +68,9 @@ class HeterComm { GradType* d_grads, size_t len, int& uniq_len); // NOLINT - void dynamic_merge_grad(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len); - void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len); + void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, @@ -92,7 +92,7 @@ class HeterComm { template void push_sparse(int num, KeyType* d_keys, - GradType* d_grads, + float* d_grads, size_t len, Sgd& sgd); // NOLINT #elif defined(PADDLE_WITH_XPU_KP) @@ -149,6 +149,13 @@ class HeterComm { multi_mf_dim_ = multi_mf_dim; max_mf_dim_ = max_mf_dim; } + + void set_accessor(FVAccessor& accessor) { + feature_value_accessor_ = accessor; + // for (auto& ptr_table: ptr_tables_) { + // ptr_table->set_accessor(feature_value_accessor_); + // } + } #endif bool need_transfer(int send_id, int receive_id) { @@ -282,9 +289,11 @@ class HeterComm { char* src_val, size_t val_size); + FVAccessor feature_value_accessor_; + protected: using Table = HashTable; - using PtrTable = HashTable; + using PtrTable = HashTable; std::vector tables_; std::vector ptr_tables_; std::shared_ptr resource_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index a7333cd01c6ec..f8657c8e895ad 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -24,8 +24,12 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -template -HeterComm::HeterComm( + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -42,10 +46,14 @@ HeterComm::HeterComm( tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - size_t val_type_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - size_t grad_type_size = TYPEALIGN( - 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size + << ", feature_value_push_size:" << grad_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); @@ -58,8 +66,11 @@ HeterComm::HeterComm( init_path(); } -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -111,14 +122,18 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy(DstPlace dst_place, - void* dst, - SrcPlace src_place, - const void* src, - size_t count, - StreamType stream) { +void HeterComm::memory_copy( + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); if (stream == 0) { @@ -129,11 +144,12 @@ void HeterComm::memory_copy(DstPlace dst_place, #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - int keylen, - int vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, int keylen, int vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -167,9 +183,12 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -184,13 +203,17 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, - int* h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -267,14 +290,18 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest(int start_index, - int gpu_num, - int* h_left, - int* h_right, - KeyType* src_key, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_dest( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -327,13 +354,17 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void 
HeterComm::walk_to_src(int start_index, - int gpu_num, - int* h_left, - int* h_right, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_src( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, + size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -383,8 +414,11 @@ void HeterComm::walk_to_src(int start_index, } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -402,15 +436,22 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; @@ -418,13 +459,20 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -436,8 +484,11 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -449,13 +500,17 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps(int dev_num, - KeyType* h_keys, - ValType* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -518,14 +573,18 @@ void HeterComm::build_ps(int dev_num, } } -template -void HeterComm::build_ps(int num, - KeyType* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -580,8 +639,11 @@ void HeterComm::build_ps(int num, } } -template -void HeterComm::merge_grad( +template +void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, @@ -654,13 +716,12 @@ void HeterComm::merge_grad( sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -668,16 +729,15 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; - // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = 
memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = @@ -772,7 +832,8 @@ void HeterComm::dynamic_merge_grad( uniq_len, grad_value_size, merger_, - stream); + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, @@ -782,8 +843,11 @@ void HeterComm::dynamic_merge_grad( PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( +template +void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, @@ -843,11 +907,12 @@ void HeterComm::split_input_to_shard( sync_stream(stream); } -template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - ValType* d_vals, - size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -893,12 +958,15 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = memory::Alloc(place, len * val_type_size); - ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + float* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -944,7 +1012,8 @@ void HeterComm::pull_sparse(int num, ptr_tables_[i]->get(reinterpret_cast(node.key_storage), node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + resource_->remote_stream(i, num), + feature_value_accessor_); } for (int i = 0; i < total_device; ++i) { @@ -964,10 +1033,16 @@ void HeterComm::pull_sparse(int num, auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_vals, + d_idx_ptr, + len, + val_type_size, + stream, + feature_value_accessor_); sync_stream(stream); + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -977,13 +1052,17 @@ void HeterComm::pull_sparse(int num, } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -991,8 +1070,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t 
grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -1037,8 +1117,7 @@ void HeterComm::push_sparse(int dev_num, KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + float* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); @@ -1048,24 +1127,15 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard( d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - if (!multi_mf_dim_) { - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - stream); - } else { - heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - grad_value_size, - stream); - } + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); @@ -1089,33 +1159,17 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - if (!multi_mf_dim_) { - create_storage(dev_num, - i, - shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); - } else { - create_storage( - dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); - } + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - if (!multi_mf_dim_) { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - d_shard_grads_ptr); - } else { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), - grad_value_size); - } + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -1125,21 +1179,12 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - if (!multi_mf_dim_) { - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } else { - ptr_tables_[i]->rwlock_->WRLock(); - ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, + resource_->remote_stream(i, dev_num)); } for (int i = 0; i < total_device; ++i) { @@ -1162,11 +1207,12 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, 
size_t len) { if (len == 0) { return; } @@ -1302,9 +1348,12 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( +void HeterComm::update_one_table( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1323,9 +1372,12 @@ void HeterComm::update_one_table( cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( +void HeterComm::push_sparse_multi_node( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1352,8 +1404,11 @@ void HeterComm::push_sparse_multi_node( sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1454,8 +1509,11 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1525,8 +1583,11 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1547,8 +1608,10 @@ void HeterComm::end_pass() { } } -// template -// void HeterComm::dump_to_cpu(int index) { +// template +// void HeterComm::dump_to_cpu(int +// index) { // auto stream = resource_->local_stream(index, 0); // int dev_id = resource_->dev_id(index); // platform::CUDADeviceGuard guard(dev_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index fd0dd1a72cca1..ebf7e76527af0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, } } -template -__global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - size_t len, - size_t grad_value_size) { +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + size_t len, + size_t grad_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_shard_keys[i] = d_keys[idx[i]]; - *(GradType*)((char*)d_shard_grads + i * grad_value_size) = - *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); + float* shard_val = + (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + + feature_value_accessor.PushValueFill(cur, shard_val); } } +template __global__ void merge_gradients_kernel(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, char* output, int n, size_t grad_value_size, - DynamicGradMerger& merger_) { + DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - FeaturePushValue& out = 
*(FeaturePushValue*)(output + i * grad_value_size); - FeaturePushValue& in = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.update_one(out, in); + float* out = (float*)(output + i * grad_value_size); + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.update_one(out, in, feature_value_accessor); for (int j = 1; j < num; ++j) { ori_index = index[start + j]; - FeaturePushValue& rhs = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.merge_one(out, rhs); + in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.merge_one(out, in, feature_value_accessor); } } } -template -__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, - ValType* d_vals, +template +__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, + float* d_vals, T* idx, size_t len, - size_t val_size) { + size_t val_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { uint64_t new_offset = uint64_t(idx[i]) * val_size; - *(ValType*)((char*)d_vals + new_offset) = - *(ValType*)((char*)d_shard_vals + i * val_size); + float* cur = (float*)((char*)d_vals + new_offset); + float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); + int mf_dim = int( + shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); } } @@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template -void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - long long len, - size_t grad_value_size, - const StreamType& stream) { +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + long long len, + size_t grad_value_size, + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_shard_grads_kernel<<>>( @@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, d_grads, idx, c_len, - grad_value_size); + grad_value_size, + feature_value_accessor); } -template +template void HeterCommKernel::merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (n - 1) / block_size_ + 1; merge_gradients_kernel<<>>( - offset, fea_num, index, input, output, n, grad_value_size, merger_); + offset, + fea_num, + index, + input, + output, + n, + grad_value_size, + merger_, + feature_value_accessor); } -template -void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, +template +void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_dvals_kernel<<>>( - d_shard_vals, d_vals, idx, c_len, val_size); + d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); } template void HeterCommKernel::fill_idx( @@ 
-402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key( long long len, const cudaStream_t& stream); -template void HeterCommKernel::fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - const cudaStream_t& stream); +template void +HeterCommKernel::fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + const cudaStream_t& stream); template void HeterCommKernel::fill_dvals( @@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key< cudaStream_t stream, bool debug_synchronous); -template void HeterCommKernel::dy_mf_fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - size_t grad_value_size, - const cudaStream_t& stream); - -template void HeterCommKernel::merge_gradient( +template void +HeterCommKernel::dy_mf_fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + size_t grad_value_size, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void +HeterCommKernel::merge_gradient( const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient( int n, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel:: - dy_mf_fill_dvals( - paddle::framework::FeatureValue* d_shard_vals, - paddle::framework::FeatureValue* d_vals, + dy_mf_fill_dvals( + float* d_shard_vals, + float* d_vals, int* idx, long long len, size_t val_size, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index d1555dc2e0919..57f0aff4b6e56 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,25 +41,16 @@ struct DynamicGradMerger { return out; } - template - __device__ __forceinline__ void update_one(T& output, const T& input) { - output.slot = input.slot; - output.show = input.show; - output.clk = input.clk; - output.mf_dim = input.mf_dim; - output.lr_g = input.lr_g; - for (int i = 0; i < output.mf_dim; ++i) { - output.mf_g[i] = input.mf_g[i]; - } + template + __device__ __forceinline__ void update_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + feature_value_accessor.PushValueFill(output, input); } - template - __device__ __forceinline__ void merge_one(T& output, const T& input) { - output.show += input.show; - output.clk += input.clk; - output.lr_g += input.lr_g; - for (int i = 0; i < input.mf_dim; ++i) { - output.mf_g[i] += input.mf_g[i]; - } + + template + __device__ __forceinline__ void merge_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + 
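+    // Assumed behaviour of the accessor's MergePushValue (defined elsewhere
+    // in this patch): fold one push record into the running result by
+    // accumulating show/click, the lr gradient and the mf_dim embedx
+    // gradients of `input` into `output`, i.e. the same work as the removed
+    // field-wise merge_one, but driven by the accessor's dynamic float
+    // layout instead of the fixed FeaturePushValue struct.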
feature_value_accessor.MergePushValue(output, input); } }; @@ -146,19 +137,20 @@ class HeterCommKernel { bool debug_synchronous = false); template + typename StreamType, + typename FVAccessor> void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, + float* d_shard_grads, + float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template + template void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -167,15 +159,17 @@ class HeterCommKernel { int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template - void dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, + template + void dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); private: int block_size_{256}; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 82f5393c3660b..4eff4a8ad55b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -22,34 +22,43 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type_; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + optimizer_type_ = optimizer_type; } HeterPs::~HeterPs() {} void HeterPs::pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } @@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 005cbd401223d..b059690990370 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -22,80 +22,139 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); - opt_ = Optimizer(); +template +HeterPs::HeterPs( + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; + optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, - FeatureKey* d_keys, - FeatureValue* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, + size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, + FeatureKey* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { comm_->build_ps( num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { +template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, - FeatureKey* d_keys, - FeaturePushValue* d_grads, - size_t len) { - comm_->push_sparse(num, d_keys, d_grads, len, opt_); - // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); +template +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, + size_t len) { + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); 
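+      // optimizer_type_ mirrors the sparse table config: 1 = adagrad,
+      // 3 = adam, 4 = shared adam (one moment pair averaged across the
+      // embedding). Each branch hands a different Sgd type to the templated
+      // push_sparse, so dy_mf_update_value is resolved at compile time.
+      // For SparseAdamOptimizer, EmbedDim() is the optimizer state kept
+      // beside the lr embedding: 2 * dim plus the two beta_pow counters.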
+ VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad { + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), bug get optimizer type:" + << optimizer_type_; + } + } else { + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; + } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } +template +void HeterPs::set_accessor(FVAccessor& accessor) { + comm_->set_accessor(accessor); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7fee229738830..439f5d6c81854 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -26,24 +26,23 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} - HeterPs(size_t capacity, std::shared_ptr resource); + HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) override; - void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) override; void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase { void show_one_table(int gpu_num) override; void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) override; private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - Optimizer opt_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; + int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index acc984f14adaa..e45d1db71ccae 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -34,14 +34,8 @@ class HeterPsBase { virtual void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) = 0; - virtual void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) = 0; virtual void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,19 +50,25 @@ class HeterPsBase { const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; + #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; - static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource); + static HeterPsBase* get_instance( + size_t capacity, + std::shared_ptr resource, + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 88c3136dd77d1..05e252b2afe44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -82,20 +82,6 @@ class HBMMemoryPool : public managed { cudaMemset(mem_, 0, block_size_ * capacity); } - friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { - for (size_t k = 0; k < 5; k++) { - auto x = (FeatureValue*)(p.mem() + k * p.capacity()); - out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot - << " lr: " << x->lr << " 
mf_dim: " << x->mf_size - << " mf_size: " << x->mf_size << " mf:"; - for (int i = 0; i < x->mf_size + 1; ++i) { - out << " " << x->mf[i]; - } - out << "\n"; - } - return out; - } - char* mem() { return mem_; } size_t capacity() { return capacity_; } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 74a4f1ca16c2b..3a6f60fef858b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -27,134 +27,460 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) -template + class Optimizer { public: - Optimizer() {} + __host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) { + feature_value_accessor_ = feature_value_accessor; + } + __host__ ~Optimizer() {} + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. Please use dy_mf_update_value\n"); + } + + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) {} - ~Optimizer() {} + CommonFeatureValueAccessor feature_value_accessor_; - void initialize() {} + size_t _embedding_dim; + size_t _lr_embedding_dim; +}; + +class SparseAdagradOptimizer : public Optimizer { + public: + __host__ SparseAdagradOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, // NOLINT + const float* g, + float scale) { + float& g2sum = sgd[G2SumIndex()]; + double add_g2sum = 0; + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + w[i] += scaled_grad * ratio; + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / n; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = + (curand_uniform(&state)) * optimizer_config.mf_initial_range; + } + } + } else { + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim; } + __host__ __device__ size_t G2SumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { return 0; } +}; + +class SparseAdamOptimizer : public Optimizer { + public: + __host__ SparseAdamOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } __device__ void update_lr(const OptimizerConfig& optimizer_config, - float& w, // NOLINT - float& g2sum, - float g, // NOLINT + int n, + float* w, + float* sgd, + const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.learning_rate * - sqrt(optimizer_config.initial_g2sum / - (optimizer_config.initial_g2sum + g2sum)); - double scaled_grad = g / scale; + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); - w += scaled_grad * ratio; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; - if (w < 
optimizer_config.min_bound) w = optimizer_config.min_bound; - if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; - add_g2sum += scaled_grad * scaled_grad; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; - g2sum += add_g2sum; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, float* w, - float& g2sum, // NOLINT + float* sgd, const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.mf_learning_rate * - sqrt(optimizer_config.mf_initial_g2sum / - (optimizer_config.mf_initial_g2sum + g2sum)); + float* moment1 = sgd + EmbedxGSumIndex(); + float* moment2 = sgd + EmbedxG2SumIndex(); + float* beta1_pow = sgd + EmbedxBeta1PowIndex(); + float* beta2_pow = sgd + EmbedxBeta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); if (w[i] < optimizer_config.mf_min_bound) w[i] = optimizer_config.mf_min_bound; if (w[i] > optimizer_config.mf_max_bound) w[i] = optimizer_config.mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_value(const OptimizerConfig& optimizer_config, - ValType& val, // NOLINT - const GradType& grad) { - val.slot = grad.slot; - val.show += grad.show; - val.clk += grad.clk; - val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; - update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; - if (val.mf_size == 0) { + update_lr( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (val.show - val.clk) + - optimizer_config.clk_coeff * val.clk) { - val.mf_size = MF_DIM + 1; - val.mf[0] = 0; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { - val.mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - MF_DIM, - &val.mf[1], - val.mf[0], - grad.mf_g, - grad.show); + update_mf( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + // printf("EmbedxGIndex: %f, mf_gsum: %f, ", + // feature_value_accessor_.common_push_value.EmbedxGIndex(), + // ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]); + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim * 2 + 2; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim * 2 + 2; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() { + return GSumIndex() + _lr_embedding_dim; + } + __host__ __device__ size_t Beta1PowIndex() { + return G2SumIndex() + _lr_embedding_dim; + } + __host__ 
__device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } +}; + +class SparseAdamSharedOptimizer : public Optimizer { + public: + __host__ SparseAdamSharedOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, + const float* g, + float scale) { + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float moment1_ = *moment1; + float moment2_ = *moment2; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + + double sum_mom1 = 0.0; + double sum_mom2 = 0.0; + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1_ + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2_ + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + + sum_mom1 += new_moment1; + sum_mom2 += new_moment2; } + + (*moment1) = sum_mom1 / n; + (*moment2) = sum_mom2 / n; + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); } __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, - ValType* ptr, - const GradType& grad) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; - - update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - // use MF_DIM temporarily - // ptr->mf_dim = grad.mf_dim; - - if (ptr->mf_size == 0) { + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config.clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); - // ptr->mf_size = MF_DIM + 1; - ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - ptr->mf_dim, - &(ptr->mf[1]), - ptr->mf[0], - grad.mf_g, - grad.show); // for local test + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); } } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return 4; } + __host__ __device__ size_t EmbedxDim() { return 4; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() 
{ return GSumIndex() + 1; } + __host__ __device__ size_t Beta1PowIndex() { return G2SumIndex() + 1; } + __host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 0db72992215a2..2db259941c873 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -27,13 +27,19 @@ class OptimizerConfig { float learning_rate = 0.05; float initial_g2sum = 3.0; float initial_range = 0; + float beta1_decay_rate = 0.9; // adam + float beta2_decay_rate = 0.999; // adam + float ada_epsilon = 1e-8; float mf_create_thresholds = 10; float mf_learning_rate = 0.05; float mf_initial_g2sum = 3.0; float mf_initial_range = 1e-4; + float mf_beta1_decay_rate = 0.9; // adam + float mf_beta2_decay_rate = 0.999; // adam float mf_min_bound = -10; float mf_max_bound = 10; + float mf_ada_epsilon = 1e-8; void set_sparse_sgd(float nonclk_coeff, float clk_coeff, @@ -41,7 +47,10 @@ class OptimizerConfig { float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { this->nonclk_coeff = nonclk_coeff; this->clk_coeff = clk_coeff; this->min_bound = min_bound; @@ -49,6 +58,9 @@ class OptimizerConfig { this->learning_rate = learning_rate; this->initial_g2sum = initial_g2sum; this->initial_range = initial_range; + this->beta1_decay_rate = beta1_decay_rate; + this->beta2_decay_rate = beta2_decay_rate; + this->ada_epsilon = ada_epsilon; } void set_sparse_sgd(const OptimizerConfig& optimizer_config) { @@ -59,6 +71,9 @@ class OptimizerConfig { this->learning_rate = optimizer_config.learning_rate; this->initial_g2sum = optimizer_config.initial_g2sum; this->initial_range = optimizer_config.initial_range; + this->beta1_decay_rate = optimizer_config.beta1_decay_rate; + this->beta2_decay_rate = optimizer_config.beta2_decay_rate; + this->ada_epsilon = optimizer_config.ada_epsilon; } void set_embedx_sgd(float mf_create_thresholds, @@ -66,13 +81,19 @@ class OptimizerConfig { float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { this->mf_create_thresholds = mf_create_thresholds; this->mf_learning_rate = mf_learning_rate; this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_range = mf_initial_range; this->mf_min_bound = mf_min_bound; this->mf_max_bound = mf_max_bound; + this->mf_beta1_decay_rate = mf_beta1_decay_rate; + this->mf_beta2_decay_rate = mf_beta2_decay_rate; + this->mf_ada_epsilon = mf_ada_epsilon; } void set_embedx_sgd(const OptimizerConfig& optimizer_config) { @@ -82,6 +103,9 @@ class OptimizerConfig { this->mf_initial_range = optimizer_config.mf_initial_range; this->mf_min_bound = optimizer_config.mf_min_bound; this->mf_max_bound = optimizer_config.mf_max_bound; + this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate; + this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate; + this->mf_ada_epsilon = 
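Note on the optimizer state introduced above: the index helpers (GSumIndex, G2SumIndex, Beta1PowIndex, Beta2PowIndex) imply a four-float block per embedding weight holding the first moment, the second moment, and the running beta decay powers, and the fixed four-float embedx block suggests the shared variant keeps one such state for the whole embedx vector. The sketch below is a plain host-side illustration of a standard Adam step driven by the new OptimizerConfig fields (beta1_decay_rate, beta2_decay_rate, ada_epsilon, bounds); it is an assumption about the update, not the actual update_value_work kernel from this patch, which is defined elsewhere and also handles show-based scaling.

import math

def adam_step(w, state, grad, lr=0.05, beta1=0.9, beta2=0.999, eps=1e-8,
              min_bound=-10.0, max_bound=10.0):
    # state layout assumed from the index helpers above:
    # [gsum (1st moment), g2sum (2nd moment), beta1_pow, beta2_pow]
    gsum, g2sum, beta1_pow, beta2_pow = state
    gsum = beta1 * gsum + (1.0 - beta1) * grad
    g2sum = beta2 * g2sum + (1.0 - beta2) * grad * grad
    m_hat = gsum / (1.0 - beta1_pow)        # bias correction via stored powers
    v_hat = g2sum / (1.0 - beta2_pow)
    w -= lr * m_hat / (math.sqrt(v_hat) + eps)
    w = max(min_bound, min(max_bound, w))   # clip like min_bound / max_bound
    beta1_pow *= beta1                      # advance the decay powers
    beta2_pow *= beta2
    return w, [gsum, g2sum, beta1_pow, beta2_pow]

Consistent with the code above, the beta powers are seeded with beta1_decay_rate and beta2_decay_rate when the mf block is first created, which is what the first-step bias correction expects.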
optimizer_config.mf_ada_epsilon; } }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 94fa386aac488..d9bb6e946f42d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -33,9 +33,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) -#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif @@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); const std::deque& vec_data = input_channel->GetData(); @@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; - MultiSlotDataset* dataset = dynamic_cast(dataset_); + MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); const std::deque& vec_data = input_channel->GetData(); @@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { &device_vals, &device_task_keys, &device_task_ptrs](int dev, int shard_id) { - auto& task_keys = device_task_keys[shard_id]; + // auto& task_keys = device_task_keys[shard_id]; #ifdef PADDLE_WITH_PSLIB auto& task_ptrs = device_task_ptrs[shard_id]; #endif -#ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[shard_id]; -#endif + // #ifdef PADDLE_WITH_PSCORE + // auto& task_ptrs = device_task_ptrs[shard_id]; + // #endif - int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; - int cur = prefix_sum[dev][shard_id]; + // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + // int cur = prefix_sum[dev][shard_id]; #ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; @@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif -#ifdef PADDLE_WITH_PSCORE - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - val.delta_score = ptr_val[2]; - val.show = ptr_val[3]; - val.clk = ptr_val[4]; - val.slot = ptr_val[0]; - val.lr = ptr_val[5]; - val.lr_g2sum = ptr_val[6]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } - } - } -#endif VLOG(3) << "GpuPs build hbmps done"; }; @@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); - HeterPs_ = HeterPsBase::get_instance(size_max, resource_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + HeterPs_ = HeterPsBase::get_instance( + size_max, 
resource_, fleet_config_, accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); + HeterPs_->set_sparse_sgd(optimizer_config_); + HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { this->mem_pools_[i * this->multi_mf_dim_ + j] = new MemoryPool(len, feature_value_size); }; - auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; platform::CUDADeviceGuard guard(resource_->dev_id(i)); @@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { delete mem_pool; }; int thread_num = 16; - auto build_dynamic_mf_func = [this, &gpu_task, thread_num]( - int i, int j, int z) { + auto build_dynamic_mf_func = [this, + &gpu_task, + thread_num, + &accessor_wrapper_ptr](int i, int j, int z) { // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - // size_t feature_value_size = - // TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { // ============ add for multi-thread ================ for (size_t k = left; k < right; k++) { - FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); +#ifdef PADDLE_WITH_PSLIB + float* val = (float*)(mem_pool->mem_address(k)); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); -#ifdef PADDLE_WITH_PSLIB val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::delta_score_index()]; @@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; -#endif -#ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor accessor; - val->delta_score = - ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; - val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; - val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; - val->slot = 
int(ptr_val[accessor.common_feature_value.SlotIndex()]); - val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; - val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; - - val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); - - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor - ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); - val->mf_dim = mf_dim; -#endif if (dim > 8) { // CpuPS alreay expand as mf_dim val->mf_size = mf_dim + 1; for (int x = 0; x < val->mf_dim + 1; x++) { @@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { val->mf[x] = 0; } } +#endif +#ifdef PADDLE_WITH_PSCORE + void* val = mem_pool->mem_address(k); + accessor_wrapper_ptr->BuildFill( + val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); +#endif } }; @@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() { } } int thread_num = 8; - auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( + int i, int j, int z) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; auto& device_keys = this->current_task_->device_dim_keys_[i][j]; @@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() { } // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; char* test_build_values = (char*)malloc(feature_value_size * real_len); uint64_t offset = left * feature_value_size; cudaMemcpy(test_build_values, @@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() { continue; } size_t local_offset = (i - left) * feature_value_size; - FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); @@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() { embed_g2sum_index()] = gpu_val->lr_g2sum; cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: slot_index()] = gpu_val->slot; -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - - paddle::distributed::CtrDymfAccessor accessor; - cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = - gpu_val->delta_score; - cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; - cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; - cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; - cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = - gpu_val->lr_g2sum; - cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; -#endif if (gpu_val->mf_size > 0) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) { cpu_val[x + 8] = 
gpu_val->mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); +#endif } free(test_build_values); }; @@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - platform::Timer all_timer; - platform::Timer pull_gpups_timer; - all_timer.Start(); - int64_t total_length = - std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - VLOG(3) << "Begine Gpu/Xpu Ps PullSparse"; - auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); - if (platform::is_cpu_place(place)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Warning:: CPUPlace is not supported in GpuPs now.")); - } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA - VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; - uint64_t* total_keys = reinterpret_cast( - total_keys_tensor.mutable_data({total_length, 1}, place)); - - // construct slot_level lod info - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); - auto buf_length = - memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, - keys.data(), - keys.size() * sizeof(uint64_t*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - - this->CopyKeys(place, - gpu_keys, - total_keys, - gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index - << " len: " << total_length; - pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, - total_keys, - total_values_gpu, - static_cast(total_length)); - pull_gpups_timer.Pause(); - - VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length - << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); - } - all_timer.Pause(); - VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() - << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() - << " s"; - VLOG(3) << "End PullSparse"; + VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. 
This " + "PullSparse is not used."; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); size_t feature_value_size = 0; - feature_value_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ + << " feature_value_size:" << feature_value_size; #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * feature_value_size); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + float* total_values_gpu = reinterpret_cast(buf->ptr()); #endif #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begine Xpu Ps PullSparse"; @@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length, - gpu_dim); + accessor_wrapper_ptr->CopyForPull(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + gpu_dim, + val_type_size_); pull_gpups_timer.Pause(); -#endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; @@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - xpu_keys, - values, - total_values_gpu, - xpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull(place, + xpu_keys, + values, + total_values_gpu, + xpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + val_type_size_); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; - FeaturePushValue* total_grad_values_gpu = - reinterpret_cast(buf->ptr()); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + << "grad_value_size:" << grad_value_size; + float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - if (!multi_mf_dim_) { - this->CopyForPush(place, - 
grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); - } else { - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - total_length, - batch_size, - grad_value_size); - } + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector_, + slot_mf_dim_vector_); VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + hidden_size, + total_length, + batch_size, + slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 734765fa95423..f8624f48d08f3 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -26,90 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int hidden, - int slot_num, - int total_len, - uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int slot_num, - int total_len, - uint64_t** keys, - uint64_t max_val_size, - int* gpu_dim) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - FeatureValue* feature_value_ptr = - (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; - *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; - *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; - } - if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; - } - } - } -} - __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, @@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest, } } -__global__ void PushCopyWithPool(FeaturePushValue* dest, - float** src, - int64_t* len, - int slot_num, - uint64_t total_len, - int bs, - int* slot_vector, - int* mf_dim_vector, - size_t grad_value_size) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - FeaturePushValue* cur = - (FeaturePushValue*)((char*)dest + i * grad_value_size); - cur->slot = slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur->mf_dim = mf_dim; - cur->show = *(src[x] + y * (mf_dim + 3)); - cur->clk = *(src[x] + y * (mf_dim + 3) + 1); - cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < cur->mf_dim; j++) { - cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
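The copy kernels above (the removed PullCopy / PushCopyWithPool and, presumably, their accessor-wrapper replacements) all recover the owning slot of a flat element the same way: `len` holds inclusive prefix sums of the per-slot lengths built on the host as `slot_lengths_lod`, and the low/high loop is a binary search over it. A small sketch of that index math, assuming nothing beyond what the loop visibly does:

import bisect

def slot_and_offset(i, prefix_len):
    # first slot whose inclusive prefix sum exceeds i, as in the low/high loop
    x = bisect.bisect_right(prefix_len, i)
    y = i - (prefix_len[x - 1] if x else 0)  # offset of element i inside slot x
    return x, y

slot_lengths = [3, 0, 5]                     # example slot sizes
prefix = []
total = 0
for n in slot_lengths:
    total += n
    prefix.append(total)                     # -> [3, 3, 8]
assert slot_and_offset(0, prefix) == (0, 0)
assert slot_and_offset(3, prefix) == (2, 0)  # empty slot 1 is skipped
assert slot_and_offset(4, prefix) == (2, 1)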
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - hidden_size, - slot_num, - total_length, - gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - slot_num, - total_length, - gpu_keys, - val_type_size_, - gpu_dim); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, @@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - hidden_size, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& 
place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, - slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector, - d_mf_dim_vector, - grad_value_size); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { - OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, - clk_coeff, - min_bound, - max_bound, - learning_rate, - initial_g2sum, - initial_range); - HeterPs_->set_sparse_sgd(optimizer_config); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { + optimizer_config_.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, @@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { - OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, - mf_learning_rate, - mf_initial_g2sum, - mf_initial_range, - mf_min_bound, - mf_max_bound); - HeterPs_->set_embedx_sgd(optimizer_config); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { + optimizer_config_.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 65f86acce9151..0d1669a42b1e9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ 
-51,7 +51,10 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #endif #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" @@ -64,9 +67,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -98,7 +98,7 @@ class AfsWrapper { class PSGPUWrapper { public: - virtual ~PSGPUWrapper(); + ~PSGPUWrapper(); PSGPUWrapper() { HeterPs_ = NULL; @@ -139,37 +139,6 @@ class PSGPUWrapper { const int64_t* gpu_len, int slot_num, int total_len); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -274,13 +243,96 @@ class PSGPUWrapper { float max_bound, float learning_rate, float initial_g2sum, - float initial_range); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon); void SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon); + +#ifdef PADDLE_WITH_PSCORE + void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "SparseNaiveSGDRule") { + config[prefix + "optimizer_type"] = 0; + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 1; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = 
sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "StdAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 2; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdamSGDRule") { + config[prefix + "optimizer_type"] = 3; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { + config[prefix + "optimizer_type"] = 4; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } + + void InitializeGPUServer(paddle::distributed::PSParameter ps_param) { + auto sparse_table = + ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.ctr_accessor_param(); + accessor_class_ = sparse_table_accessor.accessor_class(); + + std::unordered_map config; + config["embedx_dim"] = sparse_table_accessor.embedx_dim(); + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + + if (accessor_class_ == "CtrDymfAccessor") { + // optimizer config for embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer( + config, sparse_table_accessor.embedx_sgd_param(), "mf_"); + } + + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); + InitializeGPUServer(config); + } +#endif + void InitializeGPUServer(std::unordered_map config) { float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) ? 1.0 @@ -288,54 +340,83 @@ class PSGPUWrapper { float clk_coeff = (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"]; float min_bound = (config.find("min_bound") == config.end()) - ? -10000.0 + ? -10.0 : config["min_bound"]; - float max_bound = (config.find("max_bound") == config.end()) - ? 10000.0 - : config["max_bound"]; + float max_bound = + (config.find("max_bound") == config.end()) ? 
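As a reading aid for add_sparse_optimizer above (a paraphrase, not code from the patch): each CPU-side SGD rule name selects a GPU optimizer_type code and forwards a specific set of hyperparameters into the config map, once without a prefix for embed_w and once with the "mf_" prefix for embedx.

RULE_TO_TYPE = {
    "SparseNaiveSGDRule": 0,
    "SparseAdaGradSGDRule": 1,
    "StdAdaGradSGDRule": 2,
    "SparseAdamSGDRule": 3,
    "SparseSharedAdamSGDRule": 4,
}

def forwarded_keys(rule_name, prefix=""):
    # every rule forwards learning_rate, initial_range and the weight bounds
    keys = ["learning_rate", "initial_range", "min_bound", "max_bound"]
    if rule_name in ("SparseAdaGradSGDRule", "StdAdaGradSGDRule"):
        keys.append("initial_g2sum")
    elif rule_name in ("SparseAdamSGDRule", "SparseSharedAdamSGDRule"):
        keys += ["beta1_decay_rate", "beta2_decay_rate", "ada_epsilon"]
    return RULE_TO_TYPE[rule_name], [prefix + k for k in keys]

# embed_w uses no prefix, embedx uses "mf_", mirroring the two calls in
# InitializeGPUServer(PSParameter).
print(forwarded_keys("SparseSharedAdamSGDRule", prefix="mf_"))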
10.0 : config["max_bound"]; float learning_rate = (config.find("learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["learning_rate"]; float initial_g2sum = (config.find("initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["initial_g2sum"]; float initial_range = (config.find("initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["initial_range"]; - + float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end()) + ? 0.9 + : config["beta1_decay_rate"]; + float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end()) + ? 0.999 + : config["beta2_decay_rate"]; + float ada_epsilon = (config.find("ada_epsilon") == config.end()) + ? 1e-8 + : config["ada_epsilon"]; // mf config settings float mf_create_thresholds = (config.find("mf_create_thresholds") == config.end()) ? static_cast(1.0) : config["mf_create_thresholds"]; float mf_learning_rate = (config.find("mf_learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["mf_learning_rate"]; float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["mf_initial_g2sum"]; float mf_initial_range = (config.find("mf_initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["mf_initial_range"]; float mf_min_bound = (config.find("mf_min_bound") == config.end()) - ? 1.0 + ? -10.0 : config["mf_min_bound"]; float mf_max_bound = (config.find("mf_max_bound") == config.end()) - ? 1.0 + ? 10.0 : config["mf_max_bound"]; + float mf_beta1_decay_rate = + (config.find("mf_beta1_decay_rate") == config.end()) + ? 0.9 + : config["mf_beta1_decay_rate"]; + float mf_beta2_decay_rate = + (config.find("mf_beta2_decay_rate") == config.end()) + ? 0.999 + : config["mf_beta2_decay_rate"]; + float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end()) + ? 1e-8 + : config["mf_ada_epsilon"]; this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, - initial_range); + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, mf_initial_range, mf_min_bound, - mf_max_bound); + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); + + // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam) + optimizer_type_ = (config.find("optimizer_type") == config.end()) + ? 
1 + : static_cast(config["optimizer_type"]); } void SetDate(int year, int month, int day) { @@ -380,7 +461,7 @@ class PSGPUWrapper { if (slot_info_initialized_) { return; } - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto slots_vec = dataset->GetSlots(); slot_offset_vector_.clear(); for (auto& slot : slot_vector_) { @@ -421,10 +502,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - grad_type_size_ = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + << " grad_type_size_:" << grad_type_size_; slot_info_initialized_ = true; } #endif @@ -445,6 +529,12 @@ class PSGPUWrapper { const std::string& conf); #endif +#ifdef PADDLE_WITH_PSCORE + void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { + cpu_table_accessor_ = accessor; + } +#endif + private: static std::shared_ptr s_instance_; Dataset* dataset_; @@ -497,6 +587,12 @@ class PSGPUWrapper { int day_; bool slot_info_initialized_ = false; int use_afs_api_ = 0; + int optimizer_type_ = 1; + std::string accessor_class_; + std::unordered_map fleet_config_; +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::ValueAccessor* cpu_table_accessor_; +#endif #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; @@ -521,6 +617,7 @@ class PSGPUWrapper { bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + OptimizerConfig optimizer_config_; protected: static bool is_initialized_; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index ef6c70e624d4c..3505bff72e90a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,13 @@ limitations under the License. 
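For illustration only, a hypothetical config map for the unordered_map overload of InitializeGPUServer above (values here are made up for the example); any key left out falls back to the defaults read in the code, e.g. learning_rate 0.05, initial_g2sum 3.0, initial_range 1e-4, beta decay rates 0.9 / 0.999, ada_epsilon 1e-8, bounds +/-10, and optimizer_type 1 (adagrad).

gpups_adam_config = {
    "nonclk_coeff": 0.1,            # illustrative value, not a default
    "clk_coeff": 1.0,
    "mf_create_thresholds": 10.0,
    "optimizer_type": 3,            # 3 = SparseAdamSGDRule per add_sparse_optimizer
    "learning_rate": 0.05,
    "beta1_decay_rate": 0.9,
    "beta2_decay_rate": 0.999,
    "ada_epsilon": 1e-8,
    "mf_learning_rate": 0.05,
    "mf_beta1_decay_rate": 0.9,
    "mf_beta2_decay_rate": 0.999,
    "mf_ada_epsilon": 1e-8,
}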
*/ namespace paddle { namespace framework { -__global__ void PullCopy(float* dest, const FeatureValue* src, - const long long* len, int hidden, int slot_num, - int total_len, unsigned long long* keys) { +__global__ void PullCopy(float* dest, + const FeatureValue* src, + const long long* len, + int hidden, + int slot_num, + int total_len, + unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, GM2LM(len, local_len, slot_num * sizeof(int64_t)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(keys, local_keys, - slot_num * sizeof(__global_ptr__ unsigned long long*)); + GM2LM( + keys, local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); __global_ptr__ float* local_dest[slot_num]; GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); @@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, // copy read_len (length) of slots' val to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src + dest_len + k, local_slot_vals, + GM2LM(src + dest_len + k, + local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(local_keys[i] + k, local_slot_keys, - real_read_len * sizeof(uint64_t)); + GM2LM( + local_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, local_dest[i] + k * hidden, + LM2GM(local_dest_vals, + local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } @@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, __global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, - const long long* len, int slot_num, + const long long* len, + int slot_num, int total_len) { int cid = core_id(); int ncores = core_num(); @@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, GM2LM(len, local_len, slot_num * sizeof(long long)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(src_keys, local_keys, + GM2LM(src_keys, + local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); for (int i = thread_id; i < slot_num; i += nthreads) { @@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_keys[i] + k, local_slot_keys, + GM2LM(local_keys[i] + k, + local_slot_keys, real_read_len * sizeof(unsigned long long)); - LM2GM(local_slot_keys, dest_total_keys + dest_len + k, + LM2GM(local_slot_keys, + dest_total_keys + dest_len + k, real_read_len * sizeof(unsigned long long)); } } } -__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, - int hidden, int slot_num, int total_len, int bs, +__global__ void PushCopy(FeaturePushValue* dest, + float* src, + long long* len, + int hidden, + int slot_num, + int total_len, + int bs, int* slot_vector) { int cid = core_id(); int ncores = core_num(); @@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_src[i] + k * hidden, 
local_slot_grads, + GM2LM(local_src[i] + k * hidden, + local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, local_slot_grads[j * hidden + 3 + m] * -1. * bs; } } - LM2GM(local_dest_grads, dest + dest_len + k, + LM2GM(local_dest_grads, + dest + dest_len + k, real_read_len * sizeof(FeaturePushValue)); } } @@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - // float* buf_value = nullptr; - // xpu_malloc(reinterpret_cast(&buf_value), - // values.size() * sizeof(float*)); - // float** gpu_values = reinterpret_cast(&buf_value); - float* gpu_values = nullptr; - xpu_malloc(reinterpret_cast(&gpu_values), - values.size() * sizeof(float*)); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); - - // unsigned long long** c_keys = (unsigned long long**)gpu_keys; - unsigned long long* c_keys = reinterpret_cast(gpu_keys); - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); - - xpu_wait(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, - uint64_t** origin_keys, uint64_t* total_keys, - const int64_t* gpu_len, int slot_num, + uint64_t** origin_keys, + uint64_t* total_keys, + const int64_t* gpu_len, + int slot_num, int total_len) { XPUStream stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - float* gpu_values = nullptr; - int64_t* gpu_len = nullptr; - int* d_slot_vector = nullptr; - - xpu_malloc(reinterpret_cast(&gpu_values), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&gpu_len), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&d_slot_vector), - slot_lengths_lod.size() * sizeof(int)); - - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, 
stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} - -void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, - float min_bound, float max_bound, - float learning_rate, float initial_g2sum, - float initial_range) { +void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, + float clk_coeff, + float min_bound, + float max_bound, + float learning_rate, + float initial_g2sum, + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, - learning_rate, initial_g2sum, initial_range); + optimizer_config.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, - float mf_learning_rate, float mf_initial_g2sum, - float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_learning_rate, + float mf_initial_g2sum, + float mf_initial_range, + float mf_min_bound, + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, - mf_initial_g2sum, mf_initial_range, - mf_min_bound, mf_max_bound); + optimizer_config.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 902854a7c7279..c58b539b6877d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -594,6 +594,21 @@ def sparse_optimizer_config(sgd, strategy, prefix): bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) sgd.adam.weight_bounds.extend(bounds) + elif optimizer_name == "shared_adam": + sgd.name = 'SparseSharedAdamSGDRule' + sgd.adam.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.001) + sgd.adam.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4) + sgd.adam.beta1_decay_rate = strategy.get( + prefix + 'sparse_beta1_decay_rate', 0.9) + sgd.adam.beta2_decay_rate = strategy.get( + prefix + 'sparse_beta2_decay_rate', 0.999) + sgd.adam.ada_epsilon = strategy.get( + prefix + 'sparse_ada_epsilon', 1e-8) + bounds = strategy.get(prefix + 'sparse_weight_bounds', + [-10, 10]) + sgd.adam.weight_bounds.extend(bounds) def set_sparse_table_config(table_data, config): for key in config: diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6ba48e5e32b5..7d240983a1c28 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -195,7 +195,7 @@ def _set(self, accessor_proto, varname, program_id, context): sgd_param.naive.initial_range = 0.0001 if len(sgd_param.naive.weight_bounds) == 0: sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": + if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == 
"SparseSharedAdamSGDRule": if not sgd_param.adam.HasField("learning_rate"): sgd_param.adam.learning_rate = 0.001 if not sgd_param.adam.HasField("initial_range"): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py new file mode 100644 index 0000000000000..c5ae2365b07cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -0,0 +1,201 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ["WITH_DISTRIBUTE"] = "ON" + +import unittest +import tempfile +import shutil + +import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 +batch_size = 4 + + +# this unittest is tested for SparseSharedAdamSGDRule +class TestPSPassWithBow(unittest.TestCase): + + def net(self): + + def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div(cond_3, + fluid.layers.fill_constant( + shape=[1], + value=batch_size * 1.0, + dtype='float64'), + name="simnet_acc") + return acc + + def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like(input=cos_q_pt, + shape=[-1, 1], + value=margin, + dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like(input=loss_op2, + shape=[-1, 1], + value=0.0, + dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + is_distributed = False + is_sparse = True + + # query + q = fluid.layers.data(name="query_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + q_emb = fluid.contrib.layers.sparse_embedding( + input=q, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + # vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + # fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data(name="pos_title_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + pt_emb = fluid.contrib.layers.sparse_embedding( + input=pt, + size=[dict_dim, emb_dim], + 
param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + # vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + # fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data(name="neg_title_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + nt_emb = fluid.contrib.layers.sparse_embedding( + input=nt, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + # vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + # fc layer + nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + def test(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001" + os.environ["TRAINING_ROLE"] = "PSERVER" + + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + loss, acc, _ = self.net() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True + + configs = {} + configs['__emb__'] = { + "table_parameters.__emb__.accessor.embed_sgd_param.name": + "SparseSharedAdamSGDRule", + "table_parameters.__emb__.accessor.embedx_sgd_param.name": + "SparseSharedAdamSGDRule", + } + strategy.sparse_table_configs = configs + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(loss) + + fleet.init_server() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 455a7a30cfd18..9ac88c802111f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -334,6 +334,14 @@ def test_fleet_desc_configs(self): strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad. initial_range, 0.0001) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = {"sparse_optimizer": "shared_adam"} + strategy.fleet_desc_configs = configs + self.assertEqual( + strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam. 
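
The rule can also be chosen per table through strategy.sparse_table_configs with fully qualified accessor keys, as test_dist_fleet_ps13 above does; a trimmed sketch (illustrative only, paths assume a sparse table named __emb__):

# Sketch: per-table accessor selection, mirroring the unit test above.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
strategy.sparse_table_configs = {
    '__emb__': {
        "table_parameters.__emb__.accessor.embed_sgd_param.name":
            "SparseSharedAdamSGDRule",
        "table_parameters.__emb__.accessor.embedx_sgd_param.name":
            "SparseSharedAdamSGDRule",
    }
}
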
+ beta1_decay_rate, 0.9) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = { diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 559f2d95b915f..53ab93f57ce56 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -671,7 +671,8 @@ 'test_trt_convert_reduce_sum', 'save_quant2_model_lstm', 'test_trt_convert_slice', - 'test_quant2_int8_lstm_mkldnn' + 'test_quant2_int8_lstm_mkldnn', + 'test_dist_fleet_ps13' ] # mem=0 but always timeout or failed : It run 15 job each time in Single cases; From c99c70cb2bbcf9deb0ab11d44ee2d98c9d378ae8 Mon Sep 17 00:00:00 2001 From: lyq <30404405+affectionlu@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:33:03 +0800 Subject: [PATCH 02/12] [Phi] migrate sync_batch_norm to phi (#44369) --- paddle/fluid/operators/inplace_abn_op.cu | 130 ++-- paddle/fluid/operators/sync_batch_norm_op.cu | 137 ---- .../fluid/operators/sync_batch_norm_op.cu.h | 637 ------------------ paddle/phi/api/yaml/legacy_api.yaml | 10 + paddle/phi/api/yaml/legacy_backward.yaml | 12 + .../gpu/sync_batch_norm_grad_kernel.cu | 75 +++ .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 190 ++++++ .../phi/kernels/gpu/sync_batch_norm_utils.h | 493 ++++++++++++++ .../phi/kernels/sync_batch_norm_grad_kernel.h | 45 ++ paddle/phi/kernels/sync_batch_norm_kernel.h | 43 ++ paddle/phi/ops/compat/sync_batch_norm_sig.cc | 67 ++ python/paddle/nn/layer/norm.py | 11 +- 12 files changed, 1027 insertions(+), 823 deletions(-) delete mode 100644 paddle/fluid/operators/sync_batch_norm_op.cu delete mode 100644 paddle/fluid/operators/sync_batch_norm_op.cu.h create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sync_batch_norm_utils.h create mode 100644 paddle/phi/kernels/sync_batch_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/sync_batch_norm_kernel.h create mode 100644 paddle/phi/ops/compat/sync_batch_norm_sig.cc diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 71e21a2edd47b..a74150a330672 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -13,17 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/inplace_abn_op.h" +#include #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" #include "paddle/phi/kernels/batch_norm_grad_kernel.h" #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" +#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/sync_batch_norm_kernel.h" namespace paddle { namespace operators { template -class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,29 +38,49 @@ class InplaceABNKernel GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + if (ctx.Attr("use_sync_bn")) { - SyncBatchNormKernel::Compute(ctx); + auto& dev_ctx = ctx.device_context(); + phi::SyncBatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + *scale, + *bias, + *mean, + *variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + y, + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); } else { - // BatchNormKernel::Compute(ctx); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); - auto& dev_ctx = ctx.device_context(); phi::BatchNormKernel( static_cast -class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -115,29 +136,44 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto 
data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + if (ctx.Attr("use_sync_bn")) { - SyncBatchNormGradKernel::Compute(ctx); + auto& dev_ctx = ctx.device_context(); + phi::SyncBatchNormGradFunctor( + static_cast::TYPE&>(dev_ctx), + nullptr, + y, + *scale, + *bias, + *saved_mean, + *saved_variance, + *d_y, + epsilon, + data_layout, + d_x, + scale_grad, + bias_grad); } else { - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); - - auto momentum = ctx.Attr("momentum"); - auto epsilon = ctx.Attr("epsilon"); - auto data_layout = ctx.Attr("data_layout"); - auto is_test = ctx.Attr("is_test"); - auto use_global_stats = ctx.Attr("use_global_stats"); - auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - - auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt; paddle::optional mean_opt; paddle::optional variance_opt; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu deleted file mode 100644 index 637064398e177..0000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" - -namespace paddle { -namespace operators { - -template -class SyncBatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - double epsilon = static_cast(ctx.Attr("epsilon")); - const float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = framework::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. 
Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - - // moving mean/variance - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_inv_variance = ctx.Output("SavedVariance"); - - bool test_mode = is_test && (!trainable_stats); - SyncBatchNormFunctor(ctx, - layout, - x, - y, - est_mean, - est_var, - mean_out, - variance_out, - saved_mean, - saved_inv_variance, - epsilon, - momentum, - test_mode, - use_global_stats); - } -}; - -template -class SyncBatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string layout_str = ctx.Attr("data_layout"); - - const DataLayout layout = framework::StringToDataLayout(layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_var = ctx.Input("SavedVariance"); - - SyncBatchNormGradFunctor(ctx, - layout, - scale, - bias, - d_x, - d_y, - d_scale, - d_bias, - saved_mean, - saved_inv_var, - epsilon); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel); -#endif - -// clang-format on diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h deleted file mode 100644 index 47de27e876922..0000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ /dev/null @@ -1,637 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -__global__ void KeLocalStats( - const T *x, int N, int M, int C, BatchNormParamType *mean_var) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int k = blockIdx.x; k < C; k += gridDim.x) { - BatchNormParamType x_sum = 0.; - BatchNormParamType x2_sum = 0.; - for (int i = threadIdx.x; i < N * M; i += BlockDim) { - int id = layout == framework::DataLayout::kNCHW - ? (i / M) * C * M + k * M + i % M - : i * C + k; - auto x_in = static_cast>(x[id]); - x_sum += x_in; - x2_sum += x_in * x_in; - } - __syncthreads(); - auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - mean_var[k] = out / (N * M); - } - out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - mean_var[k + C] = out / (N * M); - } - } - if (blockIdx.x == 0 && threadIdx.x == 0) { - mean_var[2 * C] = static_cast>(1.0); - } -} - -template -__global__ void KeSyncAndMovingStats(BatchNormParamType *means, - BatchNormParamType *variances, - BatchNormParamType *num_dev, - const int C, - const BatchNormParamType momentum, - const double epsilon, - BatchNormParamType *sv_mean_data, - BatchNormParamType *sv_inv_var_data, - BatchNormParamType *moving_means, - BatchNormParamType *moving_variances) { - // sync stats across multi-devices - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < C; i += stride) { - auto mean = means[i] / (*num_dev); - auto var = variances[i] / (*num_dev); - var = var - mean * mean; - - // sync stats - sv_mean_data[i] = mean; - sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); - variances[i] = var; - - // moving stats - moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); - moving_variances[i] = - moving_variances[i] * momentum + var * (1. - momentum); - } -} - -template -static __global__ void KeNormAffine(const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - const double epsilon, - const int C, - const int M, - const int num, - T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - auto x_i = static_cast>(x[i]); - auto y_i = - (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; - y[i] = static_cast(y_i); - } -} - -template -void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, - const DataLayout layout, - const framework::Tensor *x, - framework::Tensor *y, - const framework::Tensor *mean, - const framework::Tensor *variance, - framework::Tensor *mean_out, - framework::Tensor *variance_out, - framework::Tensor *saved_mean, - framework::Tensor *saved_variance, - double epsilon, - const float momentum, - const bool is_test, - const bool use_global_stats - -) { - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input dim size should be larger than 1.")); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "The Input dim size should be less than 6.")); - int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - int x_numel = x->numel(); - - const T *x_d = x->data(); - const auto *s_d = ctx.Input("Scale")->data>(); - const auto *b_d = ctx.Input("Bias")->data>(); - - T *y_d = y->mutable_data(ctx.GetPlace()); - - const BatchNormParamType *mean_data = nullptr; - const BatchNormParamType *var_data = nullptr; - - auto &dev_ctx = ctx.cuda_device_context(); - auto stream = dev_ctx.stream(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - - paddle::memory::AllocationPtr alloc_ptr{nullptr}; - - if (is_test) { - mean_data = mean->data>(); - var_data = variance->data>(); - } else { - // x, x^2, 1, here 1 is used to calc device num - // device num also can be got from platform::DeviceContextPool - const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - alloc_ptr = memory::Alloc(dev_ctx, bytes); - - auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); - const int threads = 256; - int grid = std::min(C, (max_threads + threads - 1) / threads); - if (layout == framework::DataLayout::kNCHW) { - KeLocalStats - <<>>(x_d, N, H * W * D, C, stats); - } else { - KeLocalStats - <<>>(x_d, N, H * W * D, C, stats); - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *comm = dev_ctx.nccl_comm(); - if (comm) { - int dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - // In-place operation - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(stats, - stats, - 2 * C + 1, - static_cast(dtype), - ncclSum, - comm, - stream)); - } -#endif - - auto *est_mean_data = - mean_out->mutable_data>(ctx.GetPlace()); - auto *est_var_data = - variance_out->mutable_data>(ctx.GetPlace()); - - auto *sv_mean_data = - saved_mean->mutable_data>(ctx.GetPlace()); - auto *sv_inv_var_data = - saved_variance->mutable_data>(ctx.GetPlace()); - - // Note, Input('Mean')/Input('Variance') share variable with - // Output('MeanOut')/Output('VarianceOut') - KeSyncAndMovingStats - <<<(C + block - 1) / block, block, 0, stream>>>(stats, - stats + C, - stats + 2 * C, - C, - momentum, - epsilon, - sv_mean_data, - sv_inv_var_data, - est_mean_data, - est_var_data); - - mean_data = sv_mean_data; - var_data = stats + C; - } - - int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; - if (layout == framework::DataLayout::kNCHW) { - KeNormAffine - <<>>(x_d, - s_d, - b_d, - mean_data, - var_data, - epsilon, - C, - H * W * D, - x_numel, - y_d); - } else { - KeNormAffine - <<>>(x_d, - s_d, - b_d, - mean_data, - var_data, - epsilon, - C, - H * W * D, - x_numel, - y_d); - } 
-} - -template -__global__ void KeBackwardLocalStats(const T *dy, - const T *x, - const BatchNormParamType *means, - int N, - int M, - int C, - BatchNormParamType *sum_dy_prod) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - for (int k = blockIdx.x; k < C; k += gridDim.x) { - BatchNormParamType sum1 = 0.; - BatchNormParamType sum2 = 0.; - auto mean = means[k]; - for (int i = threadIdx.x; i < N * M; i += blockDim.x) { - int id = layout == framework::DataLayout::kNCHW - ? (i / M) * C * M + k * M + i % M - : i * C + k; - auto g = static_cast>(dy[id]); - sum1 += g; - auto x_i = static_cast>(x[id]); - sum2 += g * (x_i - mean); - } - - __syncthreads(); - auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - sum_dy_prod[k] = out; - } - out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - sum_dy_prod[k + C] = out; - } - } - if (blockIdx.x == 0 && threadIdx.x == 0) { - sum_dy_prod[2 * C] = 1.0; - } -} - -template -static __global__ void KeBNBackwardScaleBias( - const T *dy, - const T *x, - const BatchNormParamType *mean, - const BatchNormParamType *inv_variance, - const double epsilon, - const int N, - const int C, - const int HxW, - BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = 0.; - BatchNormParamType db_sum = 0.; - - auto inv_var_i = inv_variance[i]; - auto mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int id = layout == framework::DataLayout::kNCHW - ? ((j / HxW) * C + i) * HxW + (j % HxW) - : j * outer_size + i; - auto x_i = static_cast>(x[id]); - auto dy_i = static_cast>(dy[id]); - ds_sum += dy_i * (x_i - mean_i); - db_sum += dy_i; - } - __syncthreads(); - auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum()); - __syncthreads(); - auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - dscale[i] = os * inv_var_i; - dbias[i] = ob; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNRestoreData(T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *sv_inv, - const double epsilon, - int C, - int M, - int num, - const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / sv_inv[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -static __global__ void KeBNBackwardData( - const T *dy, - const T *x, - const BatchNormParamType *gamma, - const BatchNormParamType *mean, - const BatchNormParamType *inv_variance, - const BatchNormParamType *g_sum_dy, - const BatchNormParamType *g_sum_dy_prod, - const BatchNormParamType *num_dev, - const double epsilon, - const int C, - const int HxW, - const int num, - T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - auto scale = static_cast>(C) / num; - auto dev_num = num_dev[0]; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - auto inv_var = inv_variance[c]; - auto s_d = gamma[c]; - auto gvar = - -(g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var); - auto gmean = -(g_sum_dy[c] / dev_num) * s_d * inv_var; - - auto x_i = static_cast>(x[i]); - auto dy_i = static_cast>(dy[i]); - auto dx_i = - dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]); - dx[i] = static_cast(dx_i); - } -} - -template -void SyncBatchNormGradFunctor(const framework::ExecutionContext &ctx, - const DataLayout layout, - const framework::Tensor *scale, - const framework::Tensor *bias, - framework::Tensor *d_x, - const framework::Tensor *d_y, - framework::Tensor *d_scale, - framework::Tensor *d_bias, - const framework::Tensor *mean, - const framework::Tensor *variance, - const double epsilon) { - // sync_batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - } else { - x = ctx.Input("X"); - is_inplace = false; - } - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The Input X dim size should be larger than 1.")); - PADDLE_ENFORCE_LE(x_dims.size(), - 5, - platform::errors::InvalidArgument( - "The Input X dim size should be less than 6.")); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - PADDLE_ENFORCE_EQ(scale->dims()[0], - C, - platform::errors::InvalidArgument( - "Expected first dim for input parameter(scale) of " - "OP(sync_batch_norm) be (%d), but given (%d).", - C, - scale->dims()[0])); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ(scale->dims().size(), - 1UL, - platform::errors::InvalidArgument( - "Expected rank for input parameter(scale) of " - "OP(sync_batch_norm) be (1), but given (%d).", - scale->dims().size())); - - std::vector dims; - std::vector strides; - if (layout == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - const T *x_d = x->data(); - auto px = *x; - const T *dy_d = d_y->data(); - - auto &dev_ctx = ctx.cuda_device_context(); - auto stream = dev_ctx.stream(); - - const auto *saved_mean = mean->data>(); - const auto *saved_inv_var = variance->data>(); - const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - auto 
alloc_ptr = memory::Alloc(dev_ctx, bytes); - auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); - - const int block = 512; - const int threads = 256; - int x_numel = x->numel(); - int fsize = H * W * D; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - int grid = std::min(C, (max_threads + threads - 1) / threads); - int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; - - if (is_inplace) { - if (layout == framework::DataLayout::kNCHW) { - KeBNRestoreData - <<>>(px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), - saved_mean, - saved_inv_var, - epsilon, - C, - H * W * D, - x_numel, - x->data()); - } else { - KeBNRestoreData - <<>>(px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), - saved_mean, - saved_inv_var, - epsilon, - C, - H * W * D, - x_numel, - x->data()); - } - } - - if (layout == framework::DataLayout::kNCHW) { - KeBackwardLocalStats - <<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); - } else { - KeBackwardLocalStats - <<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *comm = dev_ctx.nccl_comm(); - if (comm) { - int dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - // In-place operation - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(stats, - stats, - 2 * C + 1, - static_cast(dtype), - ncclSum, - comm, - stream)); - } -#endif - - if (layout == framework::DataLayout::kNCHW) { - if (d_scale && d_bias) { - KeBNBackwardScaleBias - <<>>(dy_d, - x_d, - saved_mean, - saved_inv_var, - epsilon, - N, - C, - fsize, - d_scale->data>(), - d_bias->data>()); - } - if (d_x) { - KeBNBackwardData - <<>>(dy_d, - x_d, - scale->data>(), - saved_mean, - saved_inv_var, - stats, - stats + C, - stats + 2 * C, - epsilon, - C, - fsize, - x->numel(), - d_x->data()); - } - } else { - if (d_scale && d_bias) { - KeBNBackwardScaleBias - <<>>(dy_d, - x_d, - saved_mean, - saved_inv_var, - epsilon, - N, - C, - fsize, - d_scale->data>(), - d_bias->data>()); - } - if (d_x) { - KeBNBackwardData - <<>>(dy_d, - x_d, - scale->data>(), - saved_mean, - saved_inv_var, - stats, - stats + C, - stats + 2 * C, - epsilon, - C, - fsize, - x->numel(), - d_x->data()); - } - } -} - -template -class SyncBatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -// Deriving the Gradient for the Backward Pass of Batch Normalization -// https://kevinzakka.github.io/2016/09/14/batch_normalization/ -template -class SyncBatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 453a0d9c16690..ed08fe48ee849 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -2075,6 +2075,16 @@ func : swish backward : swish_grad +# sync_batch_norm +- api : sync_batch_norm + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + infer_meta : + func : BatchNormInferMeta + kernel : + func : sync_batch_norm + backward : sync_batch_norm_grad + # take_along_axis - 
api : take_along_axis args : (Tensor x, Tensor index, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 50aa57a3845cd..91464ac769f77 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -2085,6 +2085,18 @@ func : swish_grad inplace : (out_grad -> x_grad) +- backward_api : sync_batch_norm_grad + forward : sync_batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, bias] + kernel : + func : sync_batch_norm_grad + data_type : out_grad + optional : mean_out, variance_out, reserve_space + - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int axis) diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu new file mode 100644 index 0000000000000..ba5020d08bd0f --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" + +namespace phi { + +template +void SyncBatchNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon_f, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + SyncBatchNormGradFunctor(ctx, + &x, + nullptr, + scale, + bias, + saved_mean, + saved_variance, + y_grad, + epsilon_f, + data_layout_str, + x_grad, + scale_grad, + bias_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu new file mode 100644 index 0000000000000..a1d4b681ca053 --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -0,0 +1,190 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sync_batch_norm_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" + +namespace phi { + +template +void SyncBatchNormKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { + PADDLE_ENFORCE_EQ(use_global_stats, + false, + phi::errors::InvalidArgument( + "sync_batch_norm doesn't support " + "to set use_global_stats True. 
Please use batch_norm " + "in this case.")); + + double epsilon = epsilon_f; + const bool trainable_stats = trainable_statistics; + const DataLayout layout = + paddle::framework::StringToDataLayout(data_layout_str); + bool test_mode = is_test && (!trainable_statistics); + const auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The Input dim size should be less than 6.")); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + int x_numel = x.numel(); + + const T *x_d = x.template data(); + const auto *s_d = scale.template data>(); + const auto *b_d = bias.template data>(); + + T *y_d = ctx.template Alloc(y); + + const BatchNormParamType *mean_data = nullptr; + const BatchNormParamType *var_data = nullptr; + + auto stream = ctx.stream(); + const int block = 512; + int max_threads = ctx.GetMaxPhysicalThreadCount(); + + paddle::memory::AllocationPtr alloc_ptr{nullptr}; + + if (test_mode) { + mean_data = mean.template data>(); + var_data = variance.template data>(); + } else { + // x, x^2, 1, here 1 is used to calc device num + // device num also can be got from platform::DeviceContextPool + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + alloc_ptr = paddle::memory::Alloc(ctx, bytes); + + auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); + const int threads = 256; + int grid = std::min(C, (max_threads + threads - 1) / threads); + if (layout == paddle::framework::DataLayout::kNCHW) { + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); + } else { + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *comm = ctx.nccl_comm(); + if (comm) { + int dtype = paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(mean_out->dtype())); + // In-place operation + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + stats, + stats, + 2 * C + 1, + static_cast(dtype), + ncclSum, + comm, + stream)); + } +#endif + + auto *est_mean_data = ctx.template Alloc>(mean_out); + auto *est_var_data = + ctx.template Alloc>(variance_out); + + auto *sv_mean_data = ctx.template Alloc>(saved_mean); + auto *sv_inv_var_data = + ctx.template Alloc>(saved_variance); + + // Note, Input('Mean')/Input('Variance') share variable with + // Output('MeanOut')/Output('VarianceOut') + KeSyncAndMovingStats + <<<(C + block - 1) / block, block, 0, stream>>>(stats, + stats + C, + stats + 2 * C, + C, + momentum, + epsilon, + sv_mean_data, + sv_inv_var_data, + est_mean_data, + est_var_data); + + mean_data = sv_mean_data; + var_data = stats + C; + } + + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == paddle::framework::DataLayout::kNCHW) { + KeNormAffine + <<>>(x_d, + s_d, + b_d, + mean_data, + var_data, + epsilon, + C, + H * W * D, + x_numel, + y_d); + } else { + KeNormAffine + <<>>(x_d, + s_d, + b_d, + mean_data, + var_data, + epsilon, + C, + H * W * D, + x_numel, + y_d); + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm, + GPU, + ALL_LAYOUT, + phi::SyncBatchNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h 
b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h new file mode 100644 index 0000000000000..37b9bca73a857 --- /dev/null +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -0,0 +1,493 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +__global__ void KeLocalStats( + const T *x, int N, int M, int C, BatchNormParamType *mean_var) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + BatchNormParamType x_sum = 0.; + BatchNormParamType x2_sum = 0.; + for (int i = threadIdx.x; i < N * M; i += BlockDim) { + int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; + auto x_in = static_cast>(x[id]); + x_sum += x_in; + x2_sum += x_in * x_in; + } + __syncthreads(); + auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k] = out / (N * M); + } + out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k + C] = out / (N * M); + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + mean_var[2 * C] = static_cast>(1.0); + } +} + +template +__global__ void KeSyncAndMovingStats(BatchNormParamType *means, + BatchNormParamType *variances, + BatchNormParamType *num_dev, + const int C, + const BatchNormParamType momentum, + const double epsilon, + BatchNormParamType *sv_mean_data, + BatchNormParamType *sv_inv_var_data, + BatchNormParamType *moving_means, + BatchNormParamType *moving_variances) { + // sync stats across multi-devices + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < C; i += stride) { + auto mean = means[i] / (*num_dev); + auto var = variances[i] / (*num_dev); + var = var - mean * mean; + + // sync stats + sv_mean_data[i] = mean; + sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); + variances[i] = var; + + // moving stats + moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); + moving_variances[i] = + moving_variances[i] * momentum + var * (1. 
- momentum); + } +} + +template +static __global__ void KeNormAffine(const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const double epsilon, + const int C, + const int M, + const int num, + T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? (i / M) % C : i % C; + auto x_i = static_cast>(x[i]); + auto y_i = + (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + y[i] = static_cast(y_i); + } +} + +template +__global__ void KeBackwardLocalStats(const T *dy, + const T *x, + const BatchNormParamType *means, + int N, + int M, + int C, + BatchNormParamType *sum_dy_prod) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + BatchNormParamType sum1 = 0.; + BatchNormParamType sum2 = 0.; + auto mean = means[k]; + for (int i = threadIdx.x; i < N * M; i += blockDim.x) { + int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; + auto g = static_cast>(dy[id]); + sum1 += g; + auto x_i = static_cast>(x[id]); + sum2 += g * (x_i - mean); + } + + __syncthreads(); + auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k] = out; + } + out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k + C] = out; + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + sum_dy_prod[2 * C] = 1.0; + } +} + +template +static __global__ void KeBNBackwardScaleBias( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = 0.; + BatchNormParamType db_sum = 0.; + + auto inv_var_i = inv_variance[i]; + auto mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int id = layout == DataLayout::kNCHW + ? ((j / HxW) * C + i) * HxW + (j % HxW) + : j * outer_size + i; + auto x_i = static_cast>(x[id]); + auto dy_i = static_cast>(dy[id]); + ds_sum += dy_i * (x_i - mean_i); + db_sum += dy_i; + } + __syncthreads(); + auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum()); + __syncthreads(); + auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + dscale[i] = os * inv_var_i; + dbias[i] = ob; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNRestoreData(T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *sv_inv, + const double epsilon, + int C, + int M, + int num, + const T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? 
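
KeBNRestoreData exists for the in-place path, where only Y survives the forward pass; it simply inverts KeNormAffine. A single-channel NumPy round-trip check with arbitrary parameter values:

import numpy as np

mean, var, eps = 0.3, 2.0, 1e-5     # illustrative per-channel statistics
scale, bias = 1.5, -0.2
inv_std = 1.0 / np.sqrt(var + eps)  # SavedVariance actually stores this inverse std

x = np.random.randn(1000)
y = (x - mean) * inv_std * scale + bias            # KeNormAffine
x_restored = (y - bias) / scale / inv_std + mean   # KeBNRestoreData
assert np.allclose(x, x_restored)
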
(i / M) % C : i % C; + auto y_i = static_cast>(y[i]); + auto x_i = (y_i - bias[c]) / scale[c] / sv_inv[c] + mean[c]; + x[i] = static_cast(x_i); + } +} + +template +static __global__ void KeBNBackwardData( + const T *dy, + const T *x, + const BatchNormParamType *gamma, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const BatchNormParamType *g_sum_dy, + const BatchNormParamType *g_sum_dy_prod, + const BatchNormParamType *num_dev, + const double epsilon, + const int C, + const int HxW, + const int num, + T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + auto scale = static_cast>(C) / num; + auto dev_num = num_dev[0]; + for (int i = gid; i < num; i += stride) { + const int c = layout == DataLayout::kNCHW ? i / HxW % C : i % C; + auto inv_var = inv_variance[c]; + auto s_d = gamma[c]; + auto gvar = + -(g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var); + auto gmean = -(g_sum_dy[c] / dev_num) * s_d * inv_var; + + auto x_i = static_cast>(x[i]); + auto dy_i = static_cast>(dy[i]); + auto dx_i = + dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]); + dx[i] = static_cast(dx_i); + } +} + +template +void SyncBatchNormGradFunctor( + const Context &ctx, + const DenseTensor *input_x, + const DenseTensor *input_y, + const DenseTensor &scale, + const DenseTensor &bias, + // const paddle::optional& mean, + // const paddle::optional& variance, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + // const paddle::optional& reserve_space, + const DenseTensor &y_grad, + // float momentum, + float epsilon_f, + const std::string &data_layout_str, + // bool is_test, + // bool use_global_stats, + // bool trainable_statistics, + // bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + double epsilon = static_cast(epsilon_f); + + const DataLayout layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto *d_y = &y_grad; + + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + const DenseTensor *x; + bool is_inplace = false; + if (input_y) { + is_inplace = true; + x = input_y; + } else { + x = input_x; + } + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input X dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The Input X dim size should be less than 6.")); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + PADDLE_ENFORCE_EQ(scale.dims()[0], + C, + phi::errors::InvalidArgument( + "Expected first dim for input parameter(scale) of " + "OP(sync_batch_norm) be (%d), but given (%d).", + C, + scale.dims()[0])); + + ctx.template Alloc(d_x); + if (d_scale && d_bias) { + ctx.template Alloc>(d_scale); + ctx.template Alloc>(d_bias); + } + PADDLE_ENFORCE_EQ(scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "Expected rank for input parameter(scale) of " + "OP(sync_batch_norm) be (1), but given (%d).", + scale.dims().size())); + + std::vector dims; + std::vector strides; + if (layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + const T *x_d = x->data(); + auto px = *x; + const T *dy_d = d_y->data(); + + auto stream = ctx.stream(); + + const auto *saved_mean_ptr = + 
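
A NumPy sketch of the gradients these backward kernels produce, with the cross-device sums collapsed into a single flattened array for one channel; the gmean/gvar decomposition used by KeBNBackwardData reduces to the familiar batch-norm input gradient:

import numpy as np

np.random.seed(0)
x = np.random.randn(4096)    # one channel, all devices flattened together
dy = np.random.randn(4096)
gamma, eps = 1.3, 1e-5

mu = x.mean()
inv_std = 1.0 / np.sqrt(x.var() + eps)

# KeBNBackwardScaleBias
dscale = np.sum(dy * (x - mu)) * inv_std
dbias = np.sum(dy)

# KeBNBackwardData: per-element term plus the gmean / gvar corrections
gmean = -np.sum(dy) * gamma * inv_std
gvar = -np.sum(dy * (x - mu)) * gamma * inv_std ** 3
dx = dy * gamma * inv_std + (gmean + gvar * (x - mu)) / x.size

# Equivalent textbook form of the input gradient
dx_ref = gamma * inv_std * (dy - dy.mean() - (x - mu) * inv_std ** 2 * np.mean(dy * (x - mu)))
assert np.allclose(dx, dx_ref)
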
saved_mean.template data>(); + const auto *saved_inv_var = + saved_variance.template data>(); + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + auto alloc_ptr = paddle::memory::Alloc(ctx, bytes); + auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); + + const int block = 512; + const int threads = 256; + int x_numel = x->numel(); + int fsize = H * W * D; + int max_threads = ctx.GetMaxPhysicalThreadCount(); + int grid = std::min(C, (max_threads + threads - 1) / threads); + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + + if (is_inplace) { + if (layout == DataLayout::kNCHW) { + KeBNRestoreData<<>>( + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + saved_mean_ptr, + saved_inv_var, + epsilon, + C, + H * W * D, + x_numel, + x->data()); + } else { + KeBNRestoreData<<>>( + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + saved_mean_ptr, + saved_inv_var, + epsilon, + C, + H * W * D, + x_numel, + x->data()); + } + } + + if (layout == DataLayout::kNCHW) { + KeBackwardLocalStats + <<>>( + dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + } else { + KeBackwardLocalStats + <<>>( + dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *comm = ctx.nccl_comm(); + if (comm) { + int dtype = paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(scale.dtype())); + // In-place operation + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + stats, + stats, + 2 * C + 1, + static_cast(dtype), + ncclSum, + comm, + stream)); + } +#endif + + if (layout == DataLayout::kNCHW) { + if (d_scale && d_bias) { + KeBNBackwardScaleBias + <<>>(dy_d, + x_d, + saved_mean_ptr, + saved_inv_var, + epsilon, + N, + C, + fsize, + d_scale->data>(), + d_bias->data>()); + } + if (d_x) { + KeBNBackwardData<<>>( + dy_d, + x_d, + scale.template data>(), + saved_mean_ptr, + saved_inv_var, + stats, + stats + C, + stats + 2 * C, + epsilon, + C, + fsize, + x->numel(), + d_x->data()); + } + } else { + if (d_scale && d_bias) { + KeBNBackwardScaleBias + <<>>(dy_d, + x_d, + saved_mean_ptr, + saved_inv_var, + epsilon, + N, + C, + fsize, + d_scale->data>(), + d_bias->data>()); + } + if (d_x) { + KeBNBackwardData<<>>( + dy_d, + x_d, + scale.template data>(), + saved_mean_ptr, + saved_inv_var, + stats, + stats + C, + stats + 2 * C, + epsilon, + C, + fsize, + x->numel(), + d_x->data()); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h new file mode 100644 index 0000000000000..395bec23f1091 --- /dev/null +++ b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SyncBatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/sync_batch_norm_kernel.h b/paddle/phi/kernels/sync_batch_norm_kernel.h new file mode 100644 index 0000000000000..5071eaabf8653 --- /dev/null +++ b/paddle/phi/kernels/sync_batch_norm_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SyncBatchNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); +} // namespace phi diff --git a/paddle/phi/ops/compat/sync_batch_norm_sig.cc b/paddle/phi/ops/compat/sync_batch_norm_sig.cc new file mode 100644 index 0000000000000..2595f241ff233 --- /dev/null +++ b/paddle/phi/ops/compat/sync_batch_norm_sig.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SyncBatchNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sync_batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature SyncBatchNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sync_batch_norm_grad", + { + "X", + "Scale", + "Bias", + "Mean", + "Variance", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Y@GRAD", + }, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm, + phi::SyncBatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sync_batch_norm_grad, + phi::SyncBatchNormGradOpArgumentMapping); diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index e549859fe626d..b9081d0c8e682 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -49,6 +49,7 @@ from paddle import _C_ops from .. import Layer from paddle import in_dynamic_mode +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -1100,7 +1101,14 @@ def forward(self, x): ### train mode: use mini-batch stats, eval mode: use global stats ### use_global_stats only support False in sync_batch_norm - if in_dynamic_mode(): + if in_dygraph_mode(): + sync_batch_norm_out, _, _, _, _, _ = _C_ops.final_state_sync_batch_norm( + x, self.weight, self.bias, self._mean, self._variance, + self._momentum, self._epsilon, self._data_format, + not self.training, False, False, False) + return sync_batch_norm_out + + elif in_dynamic_mode(): attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_format, "use_mkldnn", False, "fuse_with_relu", @@ -1109,7 +1117,6 @@ def forward(self, x): sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm( x, self.weight, self.bias, self._mean, self._variance, mean_out, variance_out, *attrs) - return sync_batch_norm_out check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], From 99bf7007d0d04a80382f1b2d47ae41f7644340ea Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 20 Jul 2022 16:46:11 +0800 Subject: [PATCH 03/12] [GPUPS]Fix psgpuwrapper initialization (#44468) * Update ps_gpu_wrapper.h * Update ps_gpu_wrapper.h * Update ps_gpu_wrapper.cc --- paddle/fluid/framework/fleet/ps_gpu_wrapper.cc | 1 + paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index d9bb6e946f42d..622793653dcab 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -95,6 +95,7 @@ int AfsWrapper::mv(const std::string& old_path, const std::string& dest_path) { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; +std::mutex PSGPUWrapper::ins_mutex; #ifdef PADDLE_WITH_PSLIB void PSGPUWrapper::InitAfsApi(const std::string& fs_name, const std::string& fs_user, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h 
b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0d1669a42b1e9..cce120bcef747 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -429,8 +430,11 @@ class PSGPUWrapper { // PSGPUWrapper singleton static std::shared_ptr GetInstance() { - if (NULL == s_instance_) { - s_instance_.reset(new paddle::framework::PSGPUWrapper()); + { + std::lock_guard lk(ins_mutex); + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::PSGPUWrapper()); + } } return s_instance_; } @@ -537,6 +541,7 @@ class PSGPUWrapper { private: static std::shared_ptr s_instance_; + static std::mutex ins_mutex; Dataset* dataset_; #ifdef PADDLE_WITH_PSLIB paddle::ps::AfsApiWrapper afs_handler_; From 889bdde3a6e7515cb07a4b00531fccc0ee31bc2a Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 20 Jul 2022 16:52:18 +0800 Subject: [PATCH 04/12] [Phi] migrate exponential kernel to phi (#44376) * [Phi] migrate exponential kernel to phi * fix comment * fix CI --- paddle/fluid/operators/exponential_op.cc | 86 +++---------------- paddle/fluid/operators/exponential_op.cu | 48 ----------- paddle/fluid/operators/exponential_op.h | 42 --------- .../yaml/generator/wrapped_infermeta_gen.py | 3 +- paddle/phi/api/yaml/legacy_api.yaml | 11 +++ paddle/phi/api/yaml/legacy_backward.yaml | 9 ++ paddle/phi/kernels/cpu/exponential_kernel.cc | 45 ++++++++++ paddle/phi/kernels/exponential_kernel.h | 27 ++++++ paddle/phi/kernels/gpu/exponential_kernel.cu | 36 ++++++++ paddle/phi/ops/compat/exponential_sig.cc | 26 ++++++ .../tests/unittests/test_exponential_op.py | 12 ++- python/paddle/tensor/random.py | 4 +- 12 files changed, 181 insertions(+), 168 deletions(-) delete mode 100644 paddle/fluid/operators/exponential_op.cu delete mode 100644 paddle/fluid/operators/exponential_op.h create mode 100644 paddle/phi/kernels/cpu/exponential_kernel.cc create mode 100644 paddle/phi/kernels/exponential_kernel.h create mode 100644 paddle/phi/kernels/gpu/exponential_kernel.cu create mode 100644 paddle/phi/ops/compat/exponential_sig.cc diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index 5a75063fba7c1..26e06e50a7784 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/exponential_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,13 +23,6 @@ class ExponentialOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExponentialOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExponentialOp"); - auto dim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -51,52 +46,6 @@ exponential distribution. 
} }; -class ExponentialOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map &GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Out"}}; - return m; - } -}; - -template -class ExponentialKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out = ctx.Output("Out"); - T *out_data = out->mutable_data(ctx.GetPlace()); - - T lambda = static_cast(ctx.Attr("lambda")); - int64_t size = out->numel(); - - auto gen = framework::DefaultCPUGenerator(); - auto engine = gen->GetCPUEngine(); - - std::uniform_real_distribution uniform(0.0, 1.0); - phi::funcs::exponential_transform trans(lambda); - for (int64_t i = 0; i < size; ++i) { - out_data[i] = trans(uniform(*engine)); - } - } -}; - -class ExponentialGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out_Grad", - "ExponentialGradOp"); - - auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); - ctx->SetOutputDim(framework::GradVarName("X"), dout_dim); - } -}; - template class ExponentialGradOpMaker : public framework::SingleGradOpMaker { public: @@ -104,10 +53,10 @@ class ExponentialGradOpMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("exponential_grad"); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - retv->SetAttrMap(this->Attrs()); + retv->SetType("fill_any_like"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttr("value", 0.0f); + retv->SetOutput("Out", this->InputGrad("X")); } }; @@ -118,24 +67,15 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; DECLARE_INPLACE_OP_INFERER(ExponentialInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(ExponentialGradInferer, - {paddle::framework::GradVarName("Out"), - paddle::framework::GradVarName("X")}); + +DECLARE_INFER_SHAPE_FUNCTOR(exponential, + ExponentialInfershapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(exponential, ops::ExponentialOp, ops::ExponentialOpMaker, - ops::ExponentialOpInferVarType, ops::ExponentialGradOpMaker, ops::ExponentialGradOpMaker, - ExponentialInferer); -REGISTER_OPERATOR(exponential_grad, - ops::ExponentialGradOp, - ExponentialGradInferer); - -REGISTER_OP_CPU_KERNEL(exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CPU_KERNEL(exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); + ExponentialInferer, + ExponentialInfershapeFunctor); diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu deleted file mode 100644 index 58d6fa674baf6..0000000000000 --- a/paddle/fluid/operators/exponential_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/exponential_op.h" - -namespace paddle { -namespace operators { - -template -class ExponentialKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* out = ctx.Output("Out"); - auto& dev_cxt = ctx.template device_context(); - T lambda = static_cast(ctx.Attr("lambda")); - - phi::funcs::uniform_distribution dist; - phi::funcs::exponential_transform trans(lambda); - phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CUDA_KERNEL( - exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h deleted file mode 100644 index 7ded174a9f47e..0000000000000 --- a/paddle/fluid/operators/exponential_op.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/distribution_helper.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class ExponentialKernel; - -template -class ExponentialGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant functor; - auto& dev_ctx = ctx.template device_context(); - functor(dev_ctx, dx, static_cast(0)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index 99da6ce3d955f..dfa6a7f93cbcb 100644 --- a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -46,7 +46,8 @@ def gene_wrapped_infermeta_and_register(api): 'const paddle::optional&': 'const MetaTensor&' } - wrapped_infermeta_name = get_wrapped_infermeta_name(api.api) + wrapped_infermeta_name = get_wrapped_infermeta_name( + api.kernel['func'][0]) args = [] for input_name in api.inputs['names']: if input_name in kernel_params: diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index ed08fe48ee849..f60309985a6f4 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -689,6 +689,17 @@ func : expm1 backward : expm1_grad +- api : exponential_ + args : (Tensor x, float lambda) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : exponential + inplace : (x -> out) + backward : exponential__grad + - api : eye args : (int64_t num_rows, int64_t num_columns, DataType dtype=DataType::FLOAT32, Place place={}) output : Tensor(out) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 91464ac769f77..6df4883145620 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -720,6 +720,15 @@ func : expm1_grad inplace : (out_grad -> x_grad) +- backward_api : exponential__grad + forward : exponential_ (Tensor x, float lambda) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + invoke : zeros_like(out_grad, DataType::UNDEFINED, {}) + inplace : (out_grad -> x_grad) + - backward_api : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) diff --git a/paddle/phi/kernels/cpu/exponential_kernel.cc b/paddle/phi/kernels/cpu/exponential_kernel.cc new file mode 100644 index 0000000000000..a4a07fc7a65e8 --- /dev/null +++ b/paddle/phi/kernels/cpu/exponential_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/exponential_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/generator.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" + +namespace phi { + +template +void ExponentialKernel(const Context& dev_ctx, + const DenseTensor& x, + float lambda, + DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + auto engine = dev_ctx.GetGenerator()->GetCPUEngine(); + + std::uniform_real_distribution uniform(0.0, 1.0); + phi::funcs::exponential_transform trans(lambda); + + for (int64_t i = 0; i < out->numel(); ++i) { + out_data[i] = trans(uniform(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + exponential, CPU, ALL_LAYOUT, phi::ExponentialKernel, float, double) {} diff --git a/paddle/phi/kernels/exponential_kernel.h b/paddle/phi/kernels/exponential_kernel.h new file mode 100644 index 0000000000000..736baacca4cc9 --- /dev/null +++ b/paddle/phi/kernels/exponential_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ExponentialKernel(const Context &dev_ctx, + const DenseTensor &x, + float lambda, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu new file mode 100644 index 0000000000000..fc1730dde64a7 --- /dev/null +++ b/paddle/phi/kernels/gpu/exponential_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/exponential_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" + +namespace phi { + +template +void ExponentialKernel(const Context &dev_ctx, + const DenseTensor &x, + float lambda, + DenseTensor *out) { + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_ctx, out, dist, trans); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + exponential, GPU, ALL_LAYOUT, phi::ExponentialKernel, float, double) {} diff --git a/paddle/phi/ops/compat/exponential_sig.cc b/paddle/phi/ops/compat/exponential_sig.cc new file mode 100644 index 0000000000000..2d70a4200ab3c --- /dev/null +++ b/paddle/phi/ops/compat/exponential_sig.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ExponentialOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("exponential", {"X"}, {"lambda"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(exponential, phi::ExponentialOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 57c4fb02d858a..72b4d8990446d 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -18,13 +18,13 @@ from op_test import OpTest import os -paddle.enable_static() paddle.seed(100) class TestExponentialOp1(OpTest): def setUp(self): + paddle.enable_static() self.op_type = "exponential" self.config() @@ -87,8 +87,14 @@ def test_static(self): def test_dygraph(self): paddle.disable_static() x = paddle.full([10, 10], -1., dtype='float32') - x.exponential_(0.5) - self.assertTrue(np.min(x.numpy()) >= 0) + x.stop_gradient = False + y = 2 * x + y.exponential_(0.5) + print(y) + self.assertTrue(np.min(y.numpy()) >= 0) + + y.backward() + self.assertTrue(np.array_equal(x.grad.numpy(), np.zeros([10, 10]))) paddle.enable_static() def test_fixed_random_number(self): diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 990b20a26772c..e25366df753ed 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1052,7 +1052,9 @@ def exponential_(x, lam=1.0, name=None): # [0.72520673, 0.45208144, 0.30234432]] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_exponential_(x, lam) + elif paddle.in_dynamic_mode(): return _C_ops.exponential_(x, "lambda", lam) check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") From 41f11d29526b2a3827a1a5224bc00ebe540e34d4 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 20 Jul 2022 17:15:04 +0800 Subject: [PATCH 05/12] [PHI] move diag_embed op to phi. (#44408) * move diag_embed to phi. 
--- paddle/fluid/operators/diag_embed_op.cc | 93 ++----------- paddle/fluid/operators/diag_embed_op.cu | 30 ---- paddle/fluid/operators/diag_embed_op.h | 130 ------------------ paddle/phi/api/yaml/legacy_api.yaml | 8 ++ paddle/phi/infermeta/unary.cc | 63 +++++++++ paddle/phi/infermeta/unary.h | 3 + paddle/phi/kernels/cpu/diag_embed_kernel.cc | 28 ++++ paddle/phi/kernels/diag_embed_kernel.h | 29 ++++ paddle/phi/kernels/gpu/diag_embed_kernel.cu | 28 ++++ paddle/phi/kernels/impl/diag_embed_impl.h | 129 +++++++++++++++++ .../fluid/tests/unittests/test_diag_embed.py | 3 +- python/paddle/nn/functional/extension.py | 15 +- 12 files changed, 310 insertions(+), 249 deletions(-) delete mode 100644 paddle/fluid/operators/diag_embed_op.cu delete mode 100644 paddle/fluid/operators/diag_embed_op.h create mode 100644 paddle/phi/kernels/cpu/diag_embed_kernel.cc create mode 100644 paddle/phi/kernels/diag_embed_kernel.h create mode 100644 paddle/phi/kernels/gpu/diag_embed_kernel.cu create mode 100644 paddle/phi/kernels/impl/diag_embed_impl.h diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 531d6f92d8830..0dc5d024ec4a8 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/diag_embed_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,81 +23,6 @@ namespace operators { class DiagEmbedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Input"), - true, - platform::errors::NotFound("Input of DiagEmbedOp is not found.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::NotFound("Output of DiagEmbedOp is not found.")); - - int offset = ctx->Attrs().Get("offset"); - int dim1 = ctx->Attrs().Get("dim1"); - int dim2 = ctx->Attrs().Get("dim2"); - - auto x_dims = ctx->GetInputDim("Input"); - - PADDLE_ENFORCE_GE( - dim1, - -(x_dims.size() + 1), - platform::errors::OutOfRange( - "Dim1 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim1)); - PADDLE_ENFORCE_LE( - dim1, - x_dims.size(), - platform::errors::OutOfRange( - "Dim1 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim1)); - - PADDLE_ENFORCE_GE( - dim2, - -(x_dims.size() + 1), - platform::errors::OutOfRange( - "Dim2 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim2)); - PADDLE_ENFORCE_LE( - dim2, - x_dims.size(), - platform::errors::OutOfRange( - "Dim2 is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size() + 1), - x_dims.size(), - dim2)); - - int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; - int dim2_ = dim2 < 0 ? 
x_dims.size() + dim2 + 1 : dim2; - int offset_ = std::abs(offset); - - PADDLE_ENFORCE_NE(dim1_, - dim2_, - platform::errors::InvalidArgument( - "diagonal dimensions should not be identical " - "%ld vs %ld.", - dim1, - dim2)); - - int new_dim_len = offset_ + x_dims[x_dims.size() - 1]; - auto sizes = vectorize(x_dims); - sizes.pop_back(); - sizes.insert(sizes.begin() + std::min(dim1_, dim2_), new_dim_len); - sizes.insert(sizes.begin() + std::max(dim1_, dim2_), new_dim_len); - ctx->SetOutputDim("Out", phi::make_ddim(sizes)); - } }; class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,15 +59,14 @@ class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -namespace platform = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(diag_embed, + DiagEmbedInferShapeFunctor, + PD_INFER_META(phi::DiagEmbedInferMeta)); + REGISTER_OPERATOR( diag_embed, ops::DiagEmbedOp, ops::DiagEmbedOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); + paddle::framework::EmptyGradOpMaker, + DiagEmbedInferShapeFunctor); diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu deleted file mode 100644 index e0f8c16731ff7..0000000000000 --- a/paddle/fluid/operators/diag_embed_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/diag_embed_op.h" - -namespace ops = paddle::operators; -namespace platform = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h deleted file mode 100644 index 94c479bb452b9..0000000000000 --- a/paddle/fluid/operators/diag_embed_op.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct DiagEmbedFunctor { - DiagEmbedFunctor(const T* input, - int64_t numel, - const int64_t* dim, - int64_t offset, - int64_t dims_size, - T* output, - const int64_t* strides) - : input_(input), - numel_(numel), - dim_(dim), - offset_(offset), - dims_size_(dims_size), - output_(output), - strides_(strides) {} - - HOSTDEVICE void operator()(size_t idx) const { - int64_t position = 0; - auto numel = numel_; - int64_t num = idx; - for (int64_t i = 0; i < dims_size_; i++) { - numel = numel / dim_[i]; - position += num / numel * strides_[i]; - num = num % numel; - } - output_[position + offset_] = input_[idx]; - } - - const T* input_; - int64_t numel_; - const int64_t* dim_; - int64_t offset_; - int64_t dims_size_; - T* output_; - const int64_t* strides_; -}; - -template -class DiagEmbedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto* out = context.Output("Out"); - - const int64_t offset = context.Attr("offset"); - const int64_t dim1 = context.Attr("dim1"); - const int64_t dim2 = context.Attr("dim2"); - auto* input_data = input->data(); - - T* out_data = out->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, out, static_cast(0.0)); - - auto out_dims = out->dims(); - int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; - auto stride = phi::stride(out_dims); - int64_t diag_size; - int64_t storage_offset = 0; - if (offset >= 0) { - int64_t dim = out_dims[dim2_] - offset; - diag_size = std::max(std::min(out_dims[dim1_], dim), 0); - } else { - int64_t dim = out_dims[dim1_] + offset; - diag_size = std::max(std::min(dim, out_dims[dim2_]), 0); - } - if (diag_size == 0) { - // skip - } else if (offset >= 0) { - storage_offset += offset * stride[dim2_]; - } else { - storage_offset -= offset * stride[dim1_]; - } - auto strides = vectorize(stride); - strides.erase(strides.begin() + std::max(dim1_, dim2_)); - strides.erase(strides.begin() + std::min(dim1_, dim2_)); - strides.push_back(stride[dim1_] + stride[dim2_]); - const auto dims = vectorize(input->dims()); - -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector dims_vec(dims); - const int64_t* dims_arr = thrust::raw_pointer_cast(dims_vec.data()); - thrust::device_vector strides_vec(strides); - const int64_t* strides_arr = thrust::raw_pointer_cast(strides_vec.data()); -#else - const int64_t* dims_arr = dims.data(); - const int64_t* strides_arr = strides.data(); -#endif - - platform::ForRange for_range(dev_ctx, input->numel()); - DiagEmbedFunctor functor(input_data, - input->numel(), - dims_arr, - storage_offset, - dims.size(), - out_data, - strides_arr); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index f60309985a6f4..40fbdc9a9170d 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -524,6 +524,14 @@ func : determinant backward : det_grad +- api : diag_embed + args : (Tensor x, int offset, int dim1, int dim2) + output : Tensor + 
infer_meta : + func : DiagEmbedInferMeta + kernel : + func : diag_embed + - api : divide args : (Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c39fb96430f45..7b1c6dfe65a04 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -288,6 +288,69 @@ void CumInferMeta(const MetaTensor& x, out->share_lod(x); } +void DiagEmbedInferMeta( + const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) { + auto x_dims = x.dims(); + + PADDLE_ENFORCE_GE( + dim1, + -(x_dims.size() + 1), + phi::errors::OutOfRange( + "Dim1 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim1)); + PADDLE_ENFORCE_LE( + dim1, + x_dims.size(), + phi::errors::OutOfRange( + "Dim1 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim1)); + + PADDLE_ENFORCE_GE( + dim2, + -(x_dims.size() + 1), + phi::errors::OutOfRange( + "Dim2 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim2)); + PADDLE_ENFORCE_LE( + dim2, + x_dims.size(), + phi::errors::OutOfRange( + "Dim2 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), + x_dims.size(), + dim2)); + + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; + int dim2_ = dim2 < 0 ? x_dims.size() + dim2 + 1 : dim2; + int offset_ = std::abs(offset); + + PADDLE_ENFORCE_NE(dim1_, + dim2_, + phi::errors::InvalidArgument( + "diagonal dimensions should not be identical " + "%ld vs %ld.", + dim1, + dim2)); + + int new_dim_len = offset_ + x_dims[x_dims.size() - 1]; + auto sizes = vectorize(x_dims); + sizes.pop_back(); + sizes.insert(sizes.begin() + std::min(dim1_, dim2_), new_dim_len); + sizes.insert(sizes.begin() + std::max(dim1_, dim2_), new_dim_len); + out->set_dims(phi::make_ddim(sizes)); + out->set_dtype(x.dtype()); +} + void DiagInferMeta(const MetaTensor& x, int offset, float padding_value, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 691fc8ff41ca6..e825ba98f44e3 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -71,6 +71,9 @@ void CumInferMeta(const MetaTensor& x, bool reverse, MetaTensor* out); +void DiagEmbedInferMeta( + const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out); + void DiagInferMeta(const MetaTensor& x, int offset, float padding_value, diff --git a/paddle/phi/kernels/cpu/diag_embed_kernel.cc b/paddle/phi/kernels/cpu/diag_embed_kernel.cc new file mode 100644 index 0000000000000..714b53c6919aa --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_embed_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/diag_embed_impl.h" + +PD_REGISTER_KERNEL(diag_embed, + CPU, + ALL_LAYOUT, + phi::DiagEmbedKernel, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/diag_embed_kernel.h b/paddle/phi/kernels/diag_embed_kernel.h new file mode 100644 index 0000000000000..e47eab82474fb --- /dev/null +++ b/paddle/phi/kernels/diag_embed_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagEmbedKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + int dim1, + int dim2, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/diag_embed_kernel.cu b/paddle/phi/kernels/gpu/diag_embed_kernel.cu new file mode 100644 index 0000000000000..ece0f012e620e --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_embed_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/diag_embed_impl.h" + +PD_REGISTER_KERNEL(diag_embed, + GPU, + ALL_LAYOUT, + phi::DiagEmbedKernel, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/impl/diag_embed_impl.h b/paddle/phi/kernels/impl/diag_embed_impl.h new file mode 100644 index 0000000000000..a4430fde92343 --- /dev/null +++ b/paddle/phi/kernels/impl/diag_embed_impl.h @@ -0,0 +1,129 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#include +#endif + +#include "paddle/phi/kernels/diag_embed_kernel.h" + +#include + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct DiagEmbedFunctor { + DiagEmbedFunctor(const T* input, + int64_t numel, + const int64_t* dim, + int64_t offset, + int64_t dims_size, + T* output, + const int64_t* strides) + : input_(input), + numel_(numel), + dim_(dim), + offset_(offset), + dims_size_(dims_size), + output_(output), + strides_(strides) {} + + HOSTDEVICE void operator()(size_t idx) const { + int64_t position = 0; + auto numel = numel_; + int64_t num = idx; + for (int64_t i = 0; i < dims_size_; i++) { + numel = numel / dim_[i]; + position += num / numel * strides_[i]; + num = num % numel; + } + output_[position + offset_] = input_[idx]; + } + + const T* input_; + int64_t numel_; + const int64_t* dim_; + int64_t offset_; + int64_t dims_size_; + T* output_; + const int64_t* strides_; +}; + +template +void DiagEmbedKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + int dim1, + int dim2, + DenseTensor* out) { + auto* input_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + phi::funcs::SetConstant set_zero; + + set_zero(dev_ctx, out, static_cast(0.0)); + + auto out_dims = out->dims(); + int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; + int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; + auto stride = phi::stride(out_dims); + int64_t diag_size; + int64_t storage_offset = 0; + if (offset >= 0) { + int64_t dim = out_dims[dim2_] - offset; + diag_size = std::max(std::min(out_dims[dim1_], dim), 0); + } else { + int64_t dim = out_dims[dim1_] + offset; + diag_size = std::max(std::min(dim, out_dims[dim2_]), 0); + } + if (diag_size == 0) { + // skip + } else if (offset >= 0) { + storage_offset += offset * stride[dim2_]; + } else { + storage_offset -= offset * stride[dim1_]; + } + auto strides = vectorize(stride); + strides.erase(strides.begin() + std::max(dim1_, dim2_)); + strides.erase(strides.begin() + std::min(dim1_, dim2_)); + strides.push_back(stride[dim1_] + stride[dim2_]); + const auto dims = vectorize(x.dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector dims_vec(dims); + const int64_t* dims_arr = thrust::raw_pointer_cast(dims_vec.data()); + thrust::device_vector strides_vec(strides); + const int64_t* strides_arr = thrust::raw_pointer_cast(strides_vec.data()); +#else + const int64_t* dims_arr = dims.data(); + const int64_t* strides_arr = strides.data(); +#endif + + phi::funcs::ForRange for_range(dev_ctx, x.numel()); + DiagEmbedFunctor functor(input_data, + x.numel(), + dims_arr, + storage_offset, + dims.size(), + out_data, + strides_arr); + for_range(functor); +} + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_diag_embed.py b/python/paddle/fluid/tests/unittests/test_diag_embed.py index c7f933d23ea21..546247167b8d0 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_embed.py +++ b/python/paddle/fluid/tests/unittests/test_diag_embed.py @@ -27,11 +27,12 @@ class TestDiagEmbedOp(OpTest): def setUp(self): self.op_type = "diag_embed" + self.python_api = F.diag_embed self.init_config() self.outputs = {'Out': self.target} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def init_config(self): self.case = np.random.randn(2, 3).astype('float32') diff --git a/python/paddle/nn/functional/extension.py 
b/python/paddle/nn/functional/extension.py index 27bc2ef70bcee..1bfa7f148838a 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -98,12 +98,18 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): # [[ 0. , 0. , 0. , 0. ], # [ 0. , 0. , 0. , 0. ]]] """ - inputs = {'Input': [input]} - attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} - if not isinstance(input, Variable): input = assign(input) + if in_dygraph_mode(): + return _C_ops.final_state_diag_embed(input, offset, dim1, dim2) + elif in_dynamic_mode(): + return _C_ops.diag_embed(input, "offset", offset, "dim1", dim1, "dim2", + dim2) + + inputs = {'Input': [input]} + attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} + def __check_input(input, offset, dim1, dim2): check_dtype(input.dtype, 'Input', ['int32', 'int64', 'float16', 'float32', 'float64'], @@ -129,8 +135,7 @@ def __check_input(input, offset, dim1, dim2): "dim1 and dim2 cannot be the same dimension." \ "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2) - if not in_dynamic_mode(): - __check_input(input, offset, dim1, dim2) + __check_input(input, offset, dim1, dim2) helper = LayerHelper("diag_embed", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) From 1e1a4b9bf399808c07ce5678c4b1234c873b9dca Mon Sep 17 00:00:00 2001 From: fuyou765 <64373205+fuyou765@users.noreply.github.com> Date: Wed, 20 Jul 2022 18:49:07 +0800 Subject: [PATCH 06/12] [MLU] set_value performance optimizing (#44390) --- paddle/fluid/operators/set_value_op_mlu.cc | 97 +++++++++++-------- .../unittests/mlu/test_set_value_op_mlu.py | 14 +++ 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/set_value_op_mlu.cc b/paddle/fluid/operators/set_value_op_mlu.cc index 44422994f60da..9a6277dfa2312 100644 --- a/paddle/fluid/operators/set_value_op_mlu.cc +++ b/paddle/fluid/operators/set_value_op_mlu.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/set_value_op.h" @@ -62,7 +63,6 @@ class SetValueMLUKernel : public framework::OpKernel { auto slice_dims_for_assign = decrease_slice_dims; if (!none_axes.empty()) { std::vector slice_dims_with_none; - size_t none_axes_cur = 0, decrease_axes_cur = 0; for (int i = 0; i < slice_dims.size(); ++i) { while (none_axes_cur < none_axes.size() && @@ -84,51 +84,22 @@ class SetValueMLUKernel : public framework::OpKernel { slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); } - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); + int in_size = in_dims.size(); + int starts_indices[in_size] = {0}; + int ends_indices[in_size] = {0}; + int strides_indices[in_size] = {0}; for (int i = 0; i < in_dims.size(); ++i) { starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; + ends_indices[i] = static_cast(slice_dims[i]); strides_indices[i] = 1; } for (size_t i = 0; i < axes.size(); i++) { int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } + starts_indices[axis_index] = static_cast(starts[i]); + ends_indices[axis_index] = static_cast(ends[i]); + strides_indices[axis_index] = static_cast(steps[i]); } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error index indices and value update not match ")); - Tensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); @@ -160,29 +131,71 @@ class SetValueMLUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t value_numel = phi::product(value_temp.dims()); - Tensor in_temp, out_temp, val_temp; + Tensor in_temp, out_temp, val_temp, index_out; + int64_t stride_step = phi::product(in_dims); + std::vector index_indices(stride_step); + std::iota(index_indices.begin(), index_indices.end(), 0); framework::Tensor index_temp; in_temp.ShareDataWith(*in); val_temp.ShareDataWith(value_temp); paddle::framework::TensorFromVector( index_indices, ctx.device_context(), &index_temp); + index_temp.Resize(in_dims); + auto index_dims = in_dims; + for (int i = 0; i < in_dims.size(); ++i) { + if (starts_indices[i] < 0 || ends_indices[i] < 0) { + starts_indices[i] -= in_dims[i]; + ends_indices[i] -= in_dims[i]; + } + if (strides_indices[i] > 0) + index_dims[i] = + static_cast((ends_indices[i] - starts_indices[i] - 1) / + strides_indices[i]) + + 1; + else + index_dims[i] = + static_cast((ends_indices[i] - starts_indices[i] + 1) / + strides_indices[i]) + + 1; + } auto 
new_in_dims = phi::make_ddim({input_numel}); auto new_val_dims = phi::make_ddim({value_numel}); in_temp.Resize(new_in_dims); val_temp.Resize(new_val_dims); + index_out.Resize(index_dims); + index_out.mutable_data(ctx.GetPlace()); cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE; MLUCnnlTensorDesc x_desc(in_temp); MLUCnnlTensorDesc indices_desc(index_temp); + MLUCnnlTensorDesc indices_out_desc(index_out); MLUCnnlTensorDesc updates_desc(val_temp); MLUCnnlTensorDesc out_desc(*out); - + MLUCnnl::StridedSlice(ctx, + starts_indices, + ends_indices, + strides_indices, + indices_desc.get(), + GetBasePtr(&index_temp), + indices_out_desc.get(), + GetBasePtr(&index_out)); + PADDLE_ENFORCE_EQ( + static_cast(phi::product(index_out.dims())), + phi::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); + Tensor index_final; + index_final.ShareDataWith(index_out); + int64_t indices_numel = phi::product(index_dims); + auto new_index_dims = phi::make_ddim({indices_numel}); + index_final.Resize(new_index_dims); + MLUCnnlTensorDesc indices_final_desc(index_final); MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(&in_temp), updates_desc.get(), GetBasePtr(&val_temp), - indices_desc.get(), - GetBasePtr(&index_temp), + indices_final_desc.get(), + GetBasePtr(&index_final), mode); in_temp.Resize(in_dims); paddle::framework::TensorCopy(in_temp, ctx.GetPlace(), out); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py index f6183687f6a47..1842f9a2f632c 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py @@ -127,6 +127,18 @@ def _get_answer(self): self.data[0:, 1:2, :] = self.value +class TestSetValueItemSlice5(TestSetValueApi): + + def set_shape(self): + self.shape = [100, 426, 640] + + def _call_setitem(self, x): + x[0:-1] = self.value + + def _get_answer(self): + self.data[0:-1] = self.value + + #TODO: Fix this after MLU support while_loop #class TestSetValueItemSliceInWhile(TestSetValueApi): # def _call_setitem(self, x): @@ -517,6 +529,7 @@ def set_dtype(self): create_test_value_int32(TestSetValueItemSlice2) create_test_value_int32(TestSetValueItemSlice3) create_test_value_int32(TestSetValueItemSlice4) +create_test_value_int32(TestSetValueItemSlice5) def create_test_value_tensor_fp32(parent): @@ -543,6 +556,7 @@ def _get_answer(self): create_test_value_tensor_fp32(TestSetValueItemSlice2) create_test_value_tensor_fp32(TestSetValueItemSlice3) create_test_value_tensor_fp32(TestSetValueItemSlice4) +create_test_value_tensor_fp32(TestSetValueItemSlice5) # 3. 
Test different shape of value From e0b4efa8f96c40207ae2198dcd8da55897970206 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 20 Jul 2022 05:57:34 -0500 Subject: [PATCH 07/12] Update api changing approve members (#44463) * update api approve members, test=document_fix * add qingqnig into list, test=document_fix --- tools/check_api_approvals.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 87edff50ef85e..49d614fa99107 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -41,13 +41,13 @@ function add_failed(){ api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec` api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then - echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" + echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" - check_approval 1 46782768 47554610 328693 + check_approval 1 46782768 8555991 47554610 7845005 check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi From dafe855e6110ebde453b3b53aed1cdf21137d6b0 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 20 Jul 2022 19:17:02 +0800 Subject: [PATCH 08/12] fix bug,test=document_fix (#44478) --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ea1c55af46ea0..056df18ee42f1 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -200,6 +200,7 @@ function cmake_base() { if [ "$CMD" != "assert_file_approvals" ];then which python python -V + python -m pip install distro python ${PADDLE_ROOT}/tools/summary_env.py bash ${PADDLE_ROOT}/tools/get_cpu_info.sh fi From 2883e4b21be2c3bb34da0b648719d2eb0da26685 Mon Sep 17 00:00:00 2001 From: lyq <30404405+affectionlu@users.noreply.github.com> Date: Wed, 20 Jul 2022 20:24:46 +0800 Subject: [PATCH 09/12] [Phi] migrate clip_by_norm to phi (#44458) --- paddle/fluid/operators/clip_by_norm_op.cc | 14 +- paddle/fluid/operators/clip_by_norm_op.cu | 122 ------------------ paddle/fluid/operators/clip_by_norm_op.h | 70 ---------- paddle/fluid/operators/dgc_clip_by_norm_op.h | 37 +++++- paddle/phi/api/yaml/legacy_api.yaml | 8 ++ paddle/phi/infermeta/unary.cc | 12 ++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/clip_by_norm_kernel.h | 27 ++++ paddle/phi/kernels/cpu/clip_by_norm_kernel.cc | 34 +++++ paddle/phi/kernels/gpu/clip_by_norm_kernel.cu | 89 +++++++++++++ .../kernels/impl/clip_by_norm_kernel_impl.h | 55 ++++++++ .../selected_rows/clip_by_norm_kernel.h | 29 +++++ .../selected_rows/cpu/clip_by_norm_kernel.cc | 22 ++++ .../selected_rows/gpu/clip_by_norm_kernel.cu | 27 ++++ .../impl/clip_by_norm_kernel_impl.h | 45 +++++++ paddle/phi/ops/compat/clip_by_norm_sig.cc | 30 +++++ python/paddle/fluid/layers/nn.py | 2 + 
.../tests/unittests/test_clip_by_norm_op.py | 7 +- 18 files changed, 429 insertions(+), 203 deletions(-) delete mode 100644 paddle/fluid/operators/clip_by_norm_op.cu create mode 100644 paddle/phi/kernels/clip_by_norm_kernel.h create mode 100644 paddle/phi/kernels/cpu/clip_by_norm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/clip_by_norm_kernel.cu create mode 100644 paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h create mode 100644 paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h create mode 100644 paddle/phi/ops/compat/clip_by_norm_sig.cc diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index cfb56a4b2a6b1..3805e11d752e3 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/clip_by_norm_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(clip_by_norm, + ClipByNormInferShapeFunctor, + PD_INFER_META(phi::ClipByNormInferMeta)); + REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, - ops::ClipByNormOpMaker); - -REGISTER_OP_CPU_KERNEL(clip_by_norm, - ops::ClipByNormKernel); + ops::ClipByNormOpMaker, + ClipByNormInferShapeFunctor); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu deleted file mode 100644 index b747682716b3f..0000000000000 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template <> -class ClipByNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - auto& dev_ctx = - context.template device_context(); - - Tensor* output = nullptr; - const Tensor* input = nullptr; - if (in_var->IsType()) { - input = context.Input("X"); - - output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); - - // merge ids in selected rows first - math::scatter::MergeAdd - merge_func; - phi::SelectedRows* merged_input = - const_cast(context.scope()) - .Var() - ->GetMutable(); - merge_func(context.template device_context(), - *x, - merged_input); - input = &(merged_input->value()); - - phi::SelectedRows* output_selected_rows = - context.Output("Out"); - output_selected_rows->set_rows(merged_input->rows()); - output_selected_rows->set_height(merged_input->height()); - output = output_selected_rows->mutable_value(); - output->Resize(merged_input->value().dims()); - output->mutable_data(context.GetPlace()); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor and " - "SelectedRows types, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - std::vector reduce_dims; - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - Tensor tmp = context.AllocateTmpTensor( - {1}, dev_ctx); - TensorReduceImpl>( - dev_ctx, - *input, - &tmp, - kps::SquareFunctor(), - reduce_dims, - dev_ctx.stream()); - auto tmp_eigen = EigenVector::Flatten(tmp); - auto x_norm = tmp_eigen.sqrt(); - - auto x = EigenVector::Flatten(*input); - auto out = EigenVector::Flatten(*output); - - auto& place = - *context.template device_context() - .eigen_device(); - - auto temp = (x_norm <= max_norm).template cast(); - auto epsilon = - ((x_norm <= static_cast(1e-30)).all().template cast()) * - static_cast(1e-6); - - auto scaling = - (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) - .template cast(); - Eigen::array one_dim{{1}}; - Eigen::DSizes m_dsize(input->numel()); - - out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - clip_by_norm, - ops::ClipByNormKernel, - ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 7387821338cd9..6fde5106f10a4 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -30,76 +30,6 @@ template using EigenVector = framework::EigenVector; -template -class ClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - Tensor* output = nullptr; - const Tensor* input = nullptr; - if 
(in_var->IsType()) { - input = context.Input("X"); - - output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); - - // merge ids in selected rows first - math::scatter::MergeAdd merge_func; - phi::SelectedRows* merged_input = - const_cast(context.scope()) - .Var() - ->GetMutable(); - merge_func( - context.template device_context(), *x, merged_input); - input = &(merged_input->value()); - - phi::SelectedRows* output_selected_rows = - context.Output("Out"); - output_selected_rows->set_rows(merged_input->rows()); - output_selected_rows->set_height(merged_input->height()); - output = output_selected_rows->mutable_value(); - output->Resize(merged_input->value().dims()); - output->mutable_data(context.GetPlace()); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor and " - "SelectedRows types, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - - auto x = EigenVector::Flatten(*input); - auto out = EigenVector::Flatten(*output); - auto x_norm = x.square().sum().sqrt(); - auto& place = - *context.template device_context().eigen_device(); - - auto temp = (x_norm <= max_norm).template cast(); - auto epsilon = - ((x_norm <= static_cast(1e-30)).all().template cast()) * - static_cast(1e-6); - - auto scaling = - temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon); - Eigen::array one_dim{{1}}; - Eigen::DSizes m_dsize(input->numel()); - if (context.GetPlace() == platform::CPUPlace()) { - out.device(place) = - x * scaling.reshape(one_dim).eval().broadcast(m_dsize); - } else { - out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); - } - } -}; - class ClipByNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index 197bf59b2a470..27c30a8997b2c 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -15,20 +15,24 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/clip_by_norm_op.h" +#include "paddle/phi/kernels/clip_by_norm_kernel.h" +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template -class DGCClipByNormKernel : public ClipByNormKernel { +class DGCClipByNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto rampup_begin_step = context.Attr("rampup_begin_step"); + void Compute(const framework::ExecutionContext& ctx) const override { + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); if (static_cast(rampup_begin_step) < 0) { return; } - auto current_step_tensor = context.Input("current_step"); + auto current_step_tensor = ctx.Input("current_step"); auto* current_step = current_step_tensor->data(); VLOG(10) << "current_step:" << *current_step @@ -41,7 +45,30 @@ class DGCClipByNormKernel : public ClipByNormKernel { return; } - return ClipByNormKernel::Compute(context); + auto in_var = ctx.InputVar("X"); + auto max_norm = ctx.Attr("max_norm"); + auto& dev_ctx = ctx.device_context(); + + if (in_var->IsType()) { + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + return phi::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + y); + } else if (in_var->IsType()) { + auto* x = ctx.Input("X"); + phi::SelectedRows* output_selected_rows = + ctx.Output("Out"); + return phi::sr::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + output_selected_rows); + } }; }; diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 40fbdc9a9170d..a562db94745c9 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -368,6 +368,14 @@ func : clip backward : clip_grad +- api : clip_by_norm + args : (Tensor x, float max_norm) + output : Tensor(out) + infer_meta : + func : ClipByNormInferMeta + kernel : + func : clip_by_norm + - api : complex args : (Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 7b1c6dfe65a04..35cada2c325e5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -264,6 +264,18 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { out->set_dtype(x.dtype()); } +void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out) { + PADDLE_ENFORCE_GT( + max_norm, + 0, + phi::errors::InvalidArgument("max_norm should be greater than 0. " + "Received max_norm is %f.", + max_norm)); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(dtype == DataType::UNDEFINED ? 
x.dtype() : dtype); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index e825ba98f44e3..1a0da23600339 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -62,6 +62,8 @@ void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); +void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out); + void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); void CumInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/clip_by_norm_kernel.h b/paddle/phi/kernels/clip_by_norm_kernel.h new file mode 100644 index 0000000000000..debff5d08b646 --- /dev/null +++ b/paddle/phi/kernels/clip_by_norm_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& x, + float max_norm, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc b/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc new file mode 100644 index 0000000000000..8d8e27dda32b4 --- /dev/null +++ b/paddle/phi/kernels/cpu/clip_by_norm_kernel.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + return ClipByNormFunctor(dev_ctx, in, max_norm, output); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + clip_by_norm, CPU, ALL_LAYOUT, phi::ClipByNormKernel, float) {} diff --git a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu new file mode 100644 index 0000000000000..6c3abf843f998 --- /dev/null +++ b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/clip_by_norm_kernel.h" + +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h" + +namespace phi { + +template +void ClipByNormKernel(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + if (typeid(T) == typeid(float)) { + return ClipByNormFunctor(dev_ctx, in, max_norm, output); + } + auto input = ∈ + dev_ctx.template Alloc(output); + + PADDLE_ENFORCE_NOT_NULL(input, + phi::errors::InvalidArgument( + "Input(X) of ClipByNormOp should not be null. " + "Please check if it is created correctly.")); + std::vector reduce_dims; + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + DenseTensor tmp_tensor; + auto* tmp = &tmp_tensor; + tmp->Resize({1}); + dev_ctx.template Alloc(tmp); + phi::funcs::ReduceKernel>( + dev_ctx, + *input, + tmp, + kps::SquareFunctor(), + reduce_dims); + auto tmp_eigen = phi::EigenVector::Flatten(*tmp); + auto x_norm = tmp_eigen.sqrt(); + + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); + auto* place = dev_ctx.eigen_device(); + + auto temp = (x_norm <= max_norm).template cast(); + auto epsilon = + ((x_norm <= static_cast(1e-30)).all().template cast()) * + static_cast(1e-6); + + auto scaling = + (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) + .template cast(); + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + + out.device(*place) = x * scaling.reshape(one_dim).broadcast(m_dsize); +} + +} // namespace phi + +PD_REGISTER_KERNEL(clip_by_norm, + GPU, + ALL_LAYOUT, + phi::ClipByNormKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h b/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h new file mode 100644 index 0000000000000..079254bb8284c --- /dev/null +++ b/paddle/phi/kernels/impl/clip_by_norm_kernel_impl.h @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
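For reference, the CUDA kernel above and the shared ClipByNormFunctor introduced just below in this header implement the same scaling rule. A minimal NumPy sketch of that rule, written only against the expressions visible in the diff (clip_by_norm_reference is a hypothetical helper name, not a Paddle symbol):

    import numpy as np

    def clip_by_norm_reference(x, max_norm):
        # Leave x unchanged when ||x||_2 <= max_norm, otherwise rescale it so
        # its L2 norm equals max_norm; the 1e-6 epsilon only applies when the
        # norm underflows (<= 1e-30), mirroring the Eigen expressions above.
        x = np.asarray(x, dtype=np.float32)
        x_norm = np.sqrt(np.square(x).sum())
        keep = np.float32(x_norm <= max_norm)
        epsilon = np.float32(1e-6) if x_norm <= np.float32(1e-30) else np.float32(0.0)
        scaling = keep + (np.float32(1.0) - keep) * max_norm / (x_norm + epsilon)
        return x * scaling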
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ClipByNormFunctor(const Context& dev_ctx, + const DenseTensor& in, + float max_norm, + DenseTensor* output) { + auto input = ∈ + dev_ctx.template Alloc(output); + + PADDLE_ENFORCE_NOT_NULL(input, + phi::errors::InvalidArgument( + "Input(X) of ClipByNormOp should not be null. " + "Please check if it is created correctly.")); + + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); + auto* place = dev_ctx.eigen_device(); + + auto temp = (x_norm <= max_norm).template cast(); + auto epsilon = ((x_norm <= static_cast(1e-30)).all().template cast()) * + static_cast(1e-6); + + auto scaling = + temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon); + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + if (dev_ctx.GetPlace() == phi::CPUPlace()) { + out.device(*place) = x * scaling.reshape(one_dim).eval().broadcast(m_dsize); + } else { + out.device(*place) = x * scaling.reshape(one_dim).broadcast(m_dsize); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h b/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h new file mode 100644 index 0000000000000..975aac23ff3ac --- /dev/null +++ b/paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void ClipByNormKernel(const Context& dev_ctx, + const SelectedRows& x, + float max_norm, + SelectedRows* out); +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc new file mode 100644 index 0000000000000..ecefe8f74bb72 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/cpu/clip_by_norm_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h" + +PD_REGISTER_KERNEL( + clip_by_norm_sr, CPU, ALL_LAYOUT, phi::sr::ClipByNormKernel, float) {} diff --git a/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu new file mode 100644 index 0000000000000..4245aa35b3918 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h" + +PD_REGISTER_KERNEL(clip_by_norm_sr, + GPU, + ALL_LAYOUT, + phi::sr::ClipByNormKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h new file mode 100644 index 0000000000000..5d79393a32d66 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/impl/clip_by_norm_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/clip_by_norm_kernel.h" +#include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" + +namespace phi { +namespace sr { + +template +void ClipByNormKernel(const Context& dev_ctx, + const SelectedRows& x, + float max_norm, + SelectedRows* out) { + phi::SelectedRows merged_input; + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, x, &merged_input); + auto input = &(merged_input.value()); + out->set_rows(merged_input.rows()); + out->set_height(merged_input.height()); + auto out_tensor = out->mutable_value(); + out_tensor->Resize(merged_input.value().dims()); + return phi::ClipByNormKernel( + dev_ctx, *input, max_norm, out_tensor); +} + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/ops/compat/clip_by_norm_sig.cc b/paddle/phi/ops/compat/clip_by_norm_sig.cc new file mode 100644 index 0000000000000..8a2cecc0293d3 --- /dev/null +++ b/paddle/phi/ops/compat/clip_by_norm_sig.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 050d6bfcb6bbb..e68b70107c109 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13043,6 +13043,8 @@ def clip_by_norm(x, max_norm, name=None): # [[0.5, 0.5], [0.5, 0.5]] """ + if in_dygraph_mode(): + return _C_ops.final_state_clip_by_norm(x, max_norm) if _non_static_mode(): return _C_ops.clip_by_norm(x, 'max_norm', max_norm) diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 8eb4c7a8be965..04b9c5b8b8bc7 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -27,6 +27,7 @@ class TestClipByNormOp(OpTest): def setUp(self): self.max_relative_error = 0.006 + self.python_api = fluid.layers.clip_by_norm self.init_dtype() self.initTestCase() input = np.random.random(self.shape).astype(self.dtype) @@ -45,7 +46,7 @@ def setUp(self): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def initTestCase(self): self.shape = (100, ) @@ -85,7 +86,9 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=0.001) + self.check_output_with_place(place, + atol=0.001, + check_eager=True) class TestClipByNormOpFp16Case1(TestClipByNormOpFp16): From fbfdea5148897561d3b668dcdfd29adaee1038e4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 20 Jul 2022 07:49:26 -0500 Subject: [PATCH 10/12] add eigen3 dependency for phi_backends (#44479) --- paddle/phi/backends/CMakeLists.txt | 3 ++- paddle/phi/backends/custom/CMakeLists.txt | 18 ------------------ 2 files changed, 2 insertions(+), 19 deletions(-) delete mode 100644 paddle/phi/backends/custom/CMakeLists.txt diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index a2d4b1deef6e6..de4d82b46133c 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(dynload) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc) -set(BACKENDS_DEPS enforce place flags) +set(BACKENDS_DEPS enforce place flags eigen3) if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc @@ -40,6 +40,7 @@ endif() add_library(phi_backends "${BACKENDS_SRCS}") target_link_libraries(phi_backends ${BACKENDS_DEPS}) +add_dependencies(phi_backends eigen3) # for inference library get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt deleted file mode 100644 index ceff429f8e596..0000000000000 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if(WITH_CUSTOM_DEVICE) - cc_library( - custom_context - SRCS custom_context.cc - DEPS phi_device_context 
device_manager) - cc_library( - custom_device - SRCS custom_device.cc - DEPS device_base device_context) - cc_test( - custom_device_test - SRCS custom_device_test.cc - DEPS device_manager device_context) - cc_test( - capi_test - SRCS capi_test.cc - DEPS phi_capi) -endif() From 15dd94abf25ae5d91b8f8890c361ac8ffe9dc41b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Jul 2022 21:20:04 +0800 Subject: [PATCH 11/12] remove fleet_13 ut in parallel_UT_rule.py; test=develop (#44477) --- tools/parallel_UT_rule.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 53ab93f57ce56..559f2d95b915f 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -671,8 +671,7 @@ 'test_trt_convert_reduce_sum', 'save_quant2_model_lstm', 'test_trt_convert_slice', - 'test_quant2_int8_lstm_mkldnn', - 'test_dist_fleet_ps13' + 'test_quant2_int8_lstm_mkldnn' ] # mem=0 but always timeout or failed : It run 15 job each time in Single cases; From 98e9685394858bf834fdedd0a70d33d9f06226a8 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 20 Jul 2022 21:31:31 +0800 Subject: [PATCH 12/12] [PHI]Seperate xshape kernel from normal kernel (#44315) * seperate xshape kernel from normal kernel * fix bugs in infermeta * fix compile bugs * fix compile bugs --- paddle/fluid/operators/einsum_op.cc | 2 +- paddle/fluid/operators/squeeze_op.cc | 2 +- paddle/fluid/operators/unsqueeze_op.cc | 2 +- paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/yaml/legacy_api.yaml | 12 ++-- paddle/phi/infermeta/unary.cc | 56 +++++++++++++------ paddle/phi/infermeta/unary.h | 25 +++++++-- paddle/phi/kernels/cpu/einsum_kernel.cc | 11 +++- paddle/phi/kernels/cpu/squeeze_kernel.cc | 15 +++++ paddle/phi/kernels/cpu/unsqueeze_kernel.cc | 16 ++++++ paddle/phi/kernels/gpu/einsum_kernel.cu | 13 ++++- paddle/phi/kernels/gpu/squeeze_kernel.cu | 16 ++++++ paddle/phi/kernels/gpu/unsqueeze_kernel.cu | 17 ++++++ paddle/phi/kernels/impl/solve_kernel_impl.h | 6 +- paddle/phi/kernels/impl/squeeze_kernel_impl.h | 13 ++++- .../phi/kernels/impl/unsqueeze_kernel_impl.h | 12 +++- paddle/phi/kernels/squeeze_kernel.h | 11 +++- paddle/phi/kernels/unsqueeze_kernel.h | 14 +++-- paddle/phi/ops/compat/einsum_sig.cc | 10 +++- paddle/phi/ops/compat/squeeze_sig.cc | 7 ++- paddle/phi/ops/compat/unsqueeze_sig.cc | 36 ++++++++---- 21 files changed, 239 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 0e33efab90a85..5f169e20e3dc3 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -106,7 +106,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(einsum, EinsumInferShapeFunctor, - PD_INFER_META(phi::EinsumInferMeta)); + PD_INFER_META(phi::EinsumRawInferMeta)); REGISTER_OPERATOR(einsum, ops::EinsumOp, diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index f532a429b49e2..b3c70e2fe9988 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -347,7 +347,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, SqueezeInferShapeFunctor, - PD_INFER_META(phi::SqueezeInferMeta)); + PD_INFER_META(phi::SqueezeWithXShapeInferMeta)); REGISTER_OPERATOR(squeeze, ops::SqueezeOp, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 53de6440f1f61..f01ae5f142d28 100644 --- 
a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -347,7 +347,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor, - PD_INFER_META(phi::UnsqueezeInferMeta)); + PD_INFER_META(phi::UnsqueezeWithXShapeInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(unsqueeze, diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 05d27571b8795..cb7f439690619 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -325,8 +325,8 @@ add_custom_command( ${dygraph_api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp} ${dygraph_api_source_file} - DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file} - ${api_gen_base} ${api_gen_file} + DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${sparse_api_yaml_file} + ${im_api_gen_file} ${api_gen_base} ${api_gen_file} VERBATIM) # generate wrapped infermeta diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index a562db94745c9..0d0fd74c17aa7 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -582,10 +582,10 @@ args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} infer_meta : - func : EinsumInferMeta + func : EinsumRawInferMeta param : [x, equation] kernel : - func : einsum + func : einsum_raw backward : einsum_grad - api : elementwise_pow @@ -2047,9 +2047,9 @@ args : (Tensor x, int[] axes) output : Tensor(out), Tensor(xshape) infer_meta : - func : SqueezeInferMeta + func : SqueezeWithXShapeInferMeta kernel : - func : squeeze + func : squeeze_with_xshape view: (x -> out) intermediate : xshape backward : squeeze_grad @@ -2290,9 +2290,9 @@ args : (Tensor x, IntArray axis) output : Tensor(out), Tensor(xshape) infer_meta : - func : UnsqueezeInferMeta + func : UnsqueezeWithXShapeInferMeta kernel : - func : unsqueeze + func : unsqueeze_with_xshape view: (x -> out) intermediate : xshape backward : unsqueeze_grad diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 35cada2c325e5..c7699c34cc546 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -570,9 +570,7 @@ void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out, - std::vector inner_cache, - std::vector xshape) { + MetaTensor* out) { // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -609,6 +607,14 @@ void EinsumInferMeta(const std::vector& inputs, VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); out->set_dims(make_ddim(output_dims)); out->set_dtype(inputs[0]->dtype()); +} + +void EinsumRawInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out, + std::vector inner_cache, + std::vector xshape) { + EinsumInferMeta(inputs, equation, out); for (size_t i = 0; i < xshape.size(); ++i) { if (xshape[i] != nullptr) { xshape[i]->set_dims(inputs[i]->dims()); @@ -2448,8 +2454,7 @@ void SplitInferMeta(const MetaTensor& x, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* out, - MetaTensor* xshape) { + MetaTensor* out) { const auto& x_dims = x.dims(); // Check input tensor dims (<6) Eigen limit. 
PADDLE_ENFORCE_LE(x_dims.size(), @@ -2469,15 +2474,25 @@ void SqueezeInferMeta(const MetaTensor& x, out->share_lod(x); } + out->set_dtype(x.dtype()); +} + +void SqueezeWithXShapeInferMeta(const MetaTensor& x, + const std::vector& axes, + MetaTensor* out, + MetaTensor* xshape) { + SqueezeInferMeta(x, axes, out); + const auto& x_dims = x.dims(); std::vector xshape_dims(x_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < x_dims.size(); ++i) { xshape_dims[i + 1] = x_dims[i]; } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - xshape->set_dtype(x.dtype()); - out->set_dtype(x.dtype()); + if (xshape) { + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + xshape->set_dtype(x.dtype()); + } } void StridedSliceRawInferMeta(const MetaTensor& x, @@ -3310,7 +3325,6 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* out, - MetaTensor* xshape, MetaConfig config) { const auto& x_dims = x.dims(); // Validity Check: input tensor dims (<6). @@ -3339,14 +3353,22 @@ void UnsqueezeInferMeta(const MetaTensor& x, } out->set_dtype(x.dtype()); } - if (xshape) { - // set xshape dims. - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; - } +} +void UnsqueezeWithXShapeInferMeta(const MetaTensor& x, + const IntArray& axes, + MetaTensor* out, + MetaTensor* xshape, + MetaConfig config) { + const auto& x_dims = x.dims(); + UnsqueezeInferMeta(x, axes, out, config); + // set xshape dims. + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + if (xshape) { xshape->set_dims(phi::make_ddim(xshape_dims)); xshape->share_lod(x); xshape->set_dtype(x.dtype()); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 1a0da23600339..ea7364e643960 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -97,9 +97,13 @@ void EigvalsInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out, - std::vector inner_cache, - std::vector xshape); + MetaTensor* out); + +void EinsumRawInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out, + std::vector inner_cache, + std::vector xshape); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, @@ -341,8 +345,12 @@ void SplitInferMeta(const MetaTensor& x_meta, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* out, - MetaTensor* xshape); + MetaTensor* out); + +void SqueezeWithXShapeInferMeta(const MetaTensor& x, + const std::vector& axes, + MetaTensor* out, + MetaTensor* xshape); void StridedSliceRawInferMeta(const MetaTensor& x, const std::vector& axes, @@ -470,9 +478,14 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, MetaTensor* out, - MetaTensor* xshape, MetaConfig config = MetaConfig()); +void UnsqueezeWithXShapeInferMeta(const MetaTensor& x, + const IntArray& axes, + MetaTensor* out, + MetaTensor* xshape, + MetaConfig config = MetaConfig()); + void UnStackInferMeta(const MetaTensor& x, int axis, int num, diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 901c1fed628d3..7ef85a942e435 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc 
@@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, +PD_REGISTER_KERNEL(einsum_raw, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, @@ -26,3 +26,12 @@ PD_REGISTER_KERNEL(einsum, double, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(einsum, + CPU, + ALL_LAYOUT, + phi::EinsumKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/squeeze_kernel.cc b/paddle/phi/kernels/cpu/squeeze_kernel.cc index 7d5a6ca4e884e..d22efdf969440 100644 --- a/paddle/phi/kernels/cpu/squeeze_kernel.cc +++ b/paddle/phi/kernels/cpu/squeeze_kernel.cc @@ -32,3 +32,18 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(squeeze_with_xshape, + CPU, + ALL_LAYOUT, + phi::SqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + bool, + int, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc index 0152a31f80ba8..612e1a78cc5bb 100644 --- a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc @@ -33,3 +33,19 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(unsqueeze_with_xshape, + CPU, + ALL_LAYOUT, + phi::UnsqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + bool, + int, + int16_t, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index b3706710c40e3..99a9c58995c1f 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, +PD_REGISTER_KERNEL(einsum_raw, GPU, ALL_LAYOUT, phi::EinsumKernelRaw, @@ -28,3 +28,14 @@ PD_REGISTER_KERNEL(einsum, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(einsum, + GPU, + ALL_LAYOUT, + phi::EinsumKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/squeeze_kernel.cu b/paddle/phi/kernels/gpu/squeeze_kernel.cu index ae15e210a02e7..06ddba2ef1c2b 100644 --- a/paddle/phi/kernels/gpu/squeeze_kernel.cu +++ b/paddle/phi/kernels/gpu/squeeze_kernel.cu @@ -33,3 +33,19 @@ PD_REGISTER_KERNEL(squeeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(squeeze_with_xshape, + GPU, + ALL_LAYOUT, + phi::SqueezeWithXShapeKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16, + bool, + int, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu index 86b4462254637..2e7bae8666d24 100644 --- a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu +++ b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu @@ -34,3 +34,20 @@ PD_REGISTER_KERNEL(unsqueeze, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(unsqueeze_with_xshape, + GPU, + ALL_LAYOUT, + phi::UnsqueezeWithXShapeKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + bool, + int, + int16_t, + uint8_t, + int8_t, + int64_t, + phi::dtype::complex, + 
phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h index 09c9e74dd207a..4120823a9d2e9 100644 --- a/paddle/phi/kernels/impl/solve_kernel_impl.h +++ b/paddle/phi/kernels/impl/solve_kernel_impl.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/expand_as_kernel.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" @@ -77,7 +79,7 @@ static std::vector get_broadcast_batch_portion( static inline std::vector convert_to_int_vec(std::vector a) { std::vector ret; for (size_t i = 0; i < a.size(); i++) { - ret.emplace_back(int(a[i])); + ret.emplace_back(static_cast(a[i])); } return ret; @@ -167,7 +169,7 @@ static void linalg_solve(const Context& dev_ctx, out_tmp.Resize(out->dims()); out_tmp = *out; - phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out, nullptr); + phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out); } else { PADDLE_ENFORCE_EQ( x_dim[x_dim_size - 1], diff --git a/paddle/phi/kernels/impl/squeeze_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_kernel_impl.h index b4c94d619cc2a..156a71973a794 100644 --- a/paddle/phi/kernels/impl/squeeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/squeeze_kernel_impl.h @@ -22,8 +22,7 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true); @@ -31,4 +30,14 @@ void SqueezeKernel(const Context& dev_ctx, phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(out_dims); } + +template +void SqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + DenseTensor* out, + DenseTensor* xshape) { + SqueezeKernel(dev_ctx, x, axes, out); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h index 4f81fa6c42341..5bef856d19b72 100644 --- a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h @@ -22,8 +22,7 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* out, - DenseTensor* xshape) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = out->dims(); if (axes.FromTensor()) { @@ -39,4 +38,13 @@ void UnsqueezeKernel(const Context& dev_ctx, phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(out_dims); // copy will reset the dims. 
} + +template +void UnsqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape) { + UnsqueezeKernel(dev_ctx, x, axes, out); +} } // namespace phi diff --git a/paddle/phi/kernels/squeeze_kernel.h b/paddle/phi/kernels/squeeze_kernel.h index bd8f508cbb1db..1c6aeedbe5161 100644 --- a/paddle/phi/kernels/squeeze_kernel.h +++ b/paddle/phi/kernels/squeeze_kernel.h @@ -23,6 +23,13 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* out, - DenseTensor* xshape); + DenseTensor* out); + +template +void SqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + DenseTensor* out, + DenseTensor* xshape); + } // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index 62ba878c056cb..35a0515c92da3 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -25,8 +25,14 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* out, - DenseTensor* xshape); + DenseTensor* out); + +template +void UnsqueezeWithXShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape); template void Unsqueeze(const Context& dev_ctx, @@ -35,8 +41,8 @@ void Unsqueeze(const Context& dev_ctx, DenseTensor* out, DenseTensor* xshape) { MetaTensor meta_out(out); - UnsqueezeInferMeta(x, axes, &meta_out, nullptr, MetaConfig()); - UnsqueezeKernel(dev_ctx, x, axes, out, nullptr); + UnsqueezeInferMeta(x, axes, &meta_out); + UnsqueezeKernel(dev_ctx, x, axes, out); } } // namespace phi diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 4fd31c1a2d842..e5aa570985596 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,8 +17,14 @@ limitations under the License. 
*/ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); + if (ctx.OutputSize("XShape") > 0 && ctx.OutputSize("InnerCache") > 0) { + return KernelSignature("einsum_raw", + {"Operands"}, + {"equation"}, + {"Out", "InnerCache", "XShape"}); + } else { + return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); + } } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc index cd6d5fc7253df..a251b9f537ccf 100644 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ b/paddle/phi/ops/compat/squeeze_sig.cc @@ -18,7 +18,12 @@ namespace phi { KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out", "XShape"}); + if (ctx.HasOutput("XShape")) { + return KernelSignature( + "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); + } else { + return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out"}); + } } KernelSignature SqueezeGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc index aee83933e5b97..a2f184e7150b8 100644 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ b/paddle/phi/ops/compat/unsqueeze_sig.cc @@ -18,17 +18,33 @@ namespace phi { KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); + if (ctx.HasOutput("XShape")) { + if (ctx.InputSize("AxesTensorList") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensorList"; + return KernelSignature("unsqueeze_with_xshape", + {"X"}, + {"AxesTensorList"}, + {"Out", "XShape"}); + } else if (ctx.InputSize("AxesTensor") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensor"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); + } else { + VLOG(2) << "unsqueeze2 in axes"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); + } } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out", "XShape"}); + if (ctx.InputSize("AxesTensorList") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensorList"; + return KernelSignature("unsqueeze", {"X"}, {"AxesTensorList"}, {"Out"}); + } else if (ctx.InputSize("AxesTensor") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensor"; + return KernelSignature("unsqueeze", {"X"}, {"AxesTensor"}, {"Out"}); + } else { + VLOG(2) << "unsqueeze2 in axes"; + return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out"}); + } } }
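Taken together, this last patch splits squeeze, unsqueeze and einsum into a plain kernel plus a *_with_xshape / einsum_raw variant, and the compat signatures above choose between them based on whether the op carries XShape (and, for einsum, InnerCache) outputs. The observable Python behavior is unchanged by the split; a minimal usage sketch, assuming the standard paddle 2.x dygraph API:

    import paddle

    x = paddle.rand([3, 1, 4])
    y = paddle.squeeze(x, axis=1)      # drops the size-1 dim -> shape [3, 4]
    z = paddle.unsqueeze(y, axis=[0])  # inserts a leading dim -> shape [1, 3, 4]
    print(y.shape, z.shape)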