From e4459a40be76268f3f1880e45a9e358a74d61ab5 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 7 Apr 2022 13:38:56 +0800 Subject: [PATCH 001/211] Add Output(Step) to DistributedFusedLamb optimizer (#41249) * add Output(Step) to distributed fused lamb op * add _set_step --- .../distributed_fused_lamb_init_op.cc | 1 + .../distributed_fused_lamb_init_op.cu | 4 ++ .../optimizers/distributed_fused_lamb_op.cc | 1 + .../optimizers/distributed_fused_lamb_op.cu | 46 ++++++++++++------- .../optimizer/distributed_fused_lamb.py | 13 ++++++ 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index efec50efa92ea..95b45934ea6d2 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -94,6 +94,7 @@ class DistributedFusedLambInitOpMaker AddOutput("GradOut", "The output gradient list.").AsDuplicable(); AddOutput("GlobalScale", "The global scale. It is usually the scale factor for AMP."); + AddOutput("Step", "The global step which excludes the NaN/Inf step."); AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 7d8a7186d58b4..3688b8067c231 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -698,6 +698,10 @@ class DistributedFusedLambInitOpKernel TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); } VLOG(10) << "Init global scale ends"; + + TensorFillConstant(dev_ctx, ctx.Output("Step"), + {1}, static_cast(0)); + dev_ctx.Wait(); VLOG(10) << "Wait for H2D copy"; } diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 8f7c87912e93a..161483c3420fc 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -110,6 +110,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput("FoundInf", "Whether there is NaN/Inf"); + AddOutput("Step", "The global step which excludes the NaN/Inf step."); AddAttr("beta1", "The initial Beta1Pow value."); AddAttr("beta2", "The initial Beta2Pow value."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 5b60f65442b55..f445a140f27a3 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -381,8 +381,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ square_grad_norm_p, const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, - T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, + bool *__restrict__ found_inf, int64_t *__restrict__ step, T weight_decay, + int weight_decay_end_numel, T beta1, T 
beta2, T epsilon, T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; bool need_update_found_inf = @@ -392,6 +393,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( return; } else if (need_update_found_inf) { *found_inf = false; + ++(*step); } T scale = rescale_grad / global_scale[0]; @@ -467,8 +469,8 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv( const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, - T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, - int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, int64_t *step, + T weight_decay, int weight_decay_end_idx, T beta1, T beta2, T epsilon, T max_global_grad_norm, T rescale_grad) { if (n <= 0) return; int numel = offsets[n] - offsets[0]; @@ -496,15 +498,24 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv( auto stream = dev_ctx.stream(); auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + if (found_inf_p == nullptr) { + PADDLE_ENFORCE_EQ( + step, nullptr, + platform::errors::InvalidArgument( + "Output(Step) cannot be updated twice in one mini-batch.")); + } else { + PADDLE_ENFORCE_NOT_NULL(step, platform::errors::InvalidArgument( + "Output(Step) cannot be nullptr.")); + } -#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ - do { \ - UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>( \ - param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ - beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ - weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ - max_global_grad_norm, numel, rescale_grad); \ +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); @@ -1315,6 +1326,8 @@ class DistributedFusedLambOpKernel const auto *fp16_partial_fused_offsets = fp16_partial_fused_offsets_t->data(); + auto *step = ctx.Output("Step")->data(); + VLOG(1) << "FusedParamOffsets: " << FlattenToString(fused_offsets, fused_offsets_t->numel(), fused_offsets_t->place()); @@ -1337,8 +1350,8 @@ class DistributedFusedLambOpKernel dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, - found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, - epsilon, max_global_grad_norm, rescale_grad); + found_inf, step, weight_decay, fp32_weight_decay_end_idx, beta1, + beta2, epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; @@ -1346,13 +1359,14 @@ class DistributedFusedLambOpKernel master_param = fp32_param + fp32_numel; VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; auto tmp_found_inf = has_fp32_param ? nullptr : found_inf; + auto tmp_step = has_fp32_param ? 
nullptr : step; MultiTensorUpdateLambMomentAndTrustRatioDiv( dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, - fp16_weight_decay_end_idx, beta1, beta2, epsilon, + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, tmp_step, + weight_decay, fp16_weight_decay_end_idx, beta1, beta2, epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 00a39dfba0f18..12a88106a44cd 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -75,9 +75,18 @@ def __init__(self, name=unique_name.generate('found_inf'), shape=[1], dtype=core.VarDesc.VarType.BOOL) + self._step = None self._param_to_master_param = {} + def _set_step(self, step): + self._step = step + + def _get_or_create_step(self): + if self._step is None: + self._step = self._create_persistable_var('step', dtype='int64') + return self._step + def _set_scale(self, scale): assert scale is not None if not isinstance(scale, Variable): @@ -189,6 +198,8 @@ def _apply_gradients_impl(self, params_grads): param_order = self._create_persistable_var('param_order', dtype='int32') param_order.is_distributed = True + step = self._get_or_create_step() + rank = get_rank() nranks = get_world_size() scale = self._get_or_create_scale() @@ -234,6 +245,7 @@ def _apply_gradients_impl(self, params_grads): 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FusedParamOffsets': [fused_offsets], 'ParamOrder': [param_order], + 'Step': [step], }, attrs={ 'alignment': self._alignment, @@ -290,6 +302,7 @@ def _apply_gradients_impl(self, params_grads): 'ParamOut': params, 'GradOut': grads, 'FoundInf': [self._found_inf], + 'Step': [step], }, attrs={ 'weight_decay': self._weight_decay, From aadeff53c7e1655e83dd41f5ffbc1f602dc5777d Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Thu, 7 Apr 2022 13:44:59 +0800 Subject: [PATCH 002/211] [infrt]Add gpu compile method (#41463) --- paddle/infrt/CMakeLists.txt | 4 +++ paddle/scripts/infrt_build.sh | 59 ++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 0f90ec96db2c7..e5f224bf6ad99 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -9,6 +9,10 @@ option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +if(WITH_GPU) + set(INFRT_WITH_GPU ON) +endif() + if (INFRT_WITH_PHI) add_definitions("-DINFRT_WITH_PHI") diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 1ea06059ccb8f..ef753200971b3 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -20,6 +20,9 @@ set -e +# TARGET: CPU/GPU/TensorRt +TARGET=GPU + if [ -z ${BRANCH} ]; then BRANCH="develop" fi @@ -32,7 +35,13 @@ function update_pd_ops() { # compile and install paddle rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build - cmake .. 
-DWITH_PYTHON=ON -DWITH_MKL=OFF -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF + + INFER_WITH_GPU=OFF + if [ "${TARGET}" == "GPU" ] || [ "${TARGET}" == "gpu" ] || [ "${TARGET}" == "TensorRt" ] || [ "${TARGET}" == "tensorrt" ]; then + INFER_WITH_GPU=ON + fi + + cmake .. -DWITH_PYTHON=ON -DWITH_MKL=OFF -DWITH_GPU=$INFER_WITH_GPU -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF make -j24 paddle_python print_pten_kernels kernel_signature_generator cd ${PADDLE_ROOT}/build ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json @@ -94,7 +103,13 @@ function infrt_gen_and_build() { # step2. compile infrt cd ${PADDLE_ROOT}/build rm -f infrt_summary.txt - cmake .. -DWITH_MKL=OFF -DWITH_GPU=ON -DWITH_TENSORRT=ON -DWITH_CRYPTO=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DINFRT_WITH_GPU=ON -DINFRT_WITH_TRT=ON -DWITH_PYTHON=OFF -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? + + INFER_WITH_GPU=OFF + if [ "${TARGET}" == "GPU" ] || [ "${TARGET}" == "gpu" ] || [ "${TARGET}" == "TensorRt" ]; then + INFER_WITH_GPU=ON + fi + + cmake .. -DWITH_MKL=OFF -DWITH_GPU=${INFER_WITH_GPU} -DWITH_TENSORRT=ON -DWITH_CRYPTO=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DINFRT_WITH_GPU=ON -DINFRT_WITH_TRT=ON -DWITH_PYTHON=OFF -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? if [ "$build_error" != 0 ];then exit 7; fi @@ -157,28 +172,34 @@ function main() { echo " (2)bash infrt_build.sh build_only" echo " (3)bash infrt_build.sh test_only" echo " optional command: --update_pd_ops : pd_ops.td will be updated according to paddle's code." + echo " --target= : GPU/gpu/CPU/cpu/TensorRt/tensorrt, default value is GPU." exit 0 fi init - case $CMD in - build_and_test) - infrt_gen_and_build ${parallel_number} - test_infrt - ;; - build_only) - infrt_gen_and_build ${parallel_number} - ;; - test_only) - test_infrt - ;; - *) - print_usage - exit 1 - ;; - esac - + for i in "$@"; do + case $i in + --target=*) + TARGET="${i#*=}" + shift + ;; + build_and_test) + infrt_gen_and_build ${parallel_number} + test_infrt + ;; + build_only) + infrt_gen_and_build ${parallel_number} + ;; + test_only) + test_infrt + ;; + *) + print_usage + exit 1 + ;; + esac + done set +x if [[ -f ${PADDLE_ROOT}/build/infrt_summary.txt ]];then echo "=====================build summary======================" From ad4193fe957fe2eccbc2c9fd36b1f8395e2ecf1d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 7 Apr 2022 13:47:13 +0800 Subject: [PATCH 003/211] fix get tensor backend set bug (#41478) --- paddle/phi/api/lib/kernel_dispatch.cc | 34 ++++++++++++++++++++++++--- paddle/phi/core/string_tensor_utils.h | 5 ++++ paddle/phi/core/tensor_utils.h | 5 ++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 1ca6e2ce0bb9a..6d97dc7657f00 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -14,18 +14,46 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/include/context_pool.h" -#include "paddle/phi/core/compat/convert_utils.h" #ifdef _MSC_VER #include #endif +#include "paddle/phi/api/include/context_pool.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/string_tensor_utils.h" +#include "paddle/phi/core/tensor_utils.h" + namespace paddle { namespace experimental { namespace detail { +// We need judge whether the allocation is nullptr, +// whether the allocation is initialized, wo we need GetHolder method +bool HasAllocation(const phi::TensorBase& t) { + if (phi::DenseTensor::classof(&t)) { + return phi::DenseTensorUtils::GetHolder( + static_cast(t)) != nullptr; + } else if (phi::SelectedRows::classof(&t)) { + return phi::DenseTensorUtils::GetHolder( + static_cast(t).value()) != nullptr; + } else if (phi::SparseCsrTensor::classof(&t)) { + return phi::DenseTensorUtils::GetHolder( + static_cast(t) + .non_zero_elements()) != nullptr; + } else if (phi::SparseCooTensor::classof(&t)) { + return phi::DenseTensorUtils::GetHolder( + static_cast(t) + .non_zero_elements()) != nullptr; + } else if (phi::StringTensor::classof(&t)) { + return phi::StringTensorUtils::GetHolder( + static_cast(t)) != nullptr; + } else { + return false; + } +} + BackendSet GetTensorBackendSet(const phi::TensorBase& t) { - if (t.initialized()) { + if (HasAllocation(t)) { BackendSet backend_set(phi::TransToPhiBackend(t.place())); switch (t.layout()) { case DataLayout::MKLDNN: diff --git a/paddle/phi/core/string_tensor_utils.h b/paddle/phi/core/string_tensor_utils.h index c1b0d09647d91..777a24c9adfe1 100644 --- a/paddle/phi/core/string_tensor_utils.h +++ b/paddle/phi/core/string_tensor_utils.h @@ -23,6 +23,11 @@ class StringTensorUtils { static StringTensorMeta* GetMutableMeta(StringTensor* tensor) { return &(tensor->meta_); } + + static const std::shared_ptr& GetHolder( + const StringTensor& tensor) { + return tensor.holder_; + } }; } // namespace phi diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 676a590ecbce2..abf8aeff4d3ab 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -25,6 +25,11 @@ class DenseTensorUtils { return &(tensor->meta_); } + static const std::shared_ptr& GetHolder( + const DenseTensor& tensor) { + return tensor.holder_; + } + static DenseTensor Slice(const DenseTensor& tensor, int64_t begin_idx, int64_t end_idx) { From b0ca369b7d359d9faa3a42e9aad8d9f82d0cec4c Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 7 Apr 2022 14:09:55 +0800 Subject: [PATCH 004/211] Add fill_constant_batch_size YAML and UT (#41474) --- python/paddle/fluid/layers/tensor.py | 12 +++ .../test_fill_constant_batch_size_like.py | 75 +++++++++++++++++++ python/paddle/utils/code_gen/api.yaml | 12 +++ 3 files changed, 99 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like.py diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 188bb539c01da..a63e87472ebed 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -846,6 +846,18 @@ def fill_constant_batch_size_like(input, input=like, shape=[1], value=0, dtype='int64') #like=[[10, 10]] data=[0] """ + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + place = _current_expected_place() + if force_cpu: + place = core.CPUPlace() + out = 
_C_ops.final_state_full_batch_size_like( + input, shape, dtype, value, input_dim_idx, output_dim_idx, place) + out.stop_gradient = True + return out + helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_variable_for_type_inference(dtype=dtype) attrs = { diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like.py new file mode 100644 index 0000000000000..774134f7a9960 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like.py @@ -0,0 +1,75 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid.core as core +from paddle.static import program_guard, Program +import paddle.compat as cpt +import unittest +import numpy as np +from op_test import OpTest +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +def fill_constant_batch_size_like(input, + shape, + value, + data_type, + input_dim_idx=0, + output_dim_idx=0, + force_cpu=False): + return paddle.fluid.layers.fill_constant_batch_size_like( + input, shape, data_type, value, input_dim_idx, output_dim_idx, + force_cpu) + + +class TestFillConstatnBatchSizeLike1(OpTest): + # test basic + def setUp(self): + self.op_type = "fill_constant_batch_size_like" + self.python_api = fill_constant_batch_size_like + self.init_data() + + input = np.zeros(self.shape) + out = np.full_like(input, self.value, self.dtype) + + self.inputs = {'Input': input} + self.outputs = {'Out': out} + self.attrs = { + 'shape': self.shape, + 'dtype': convert_np_dtype_to_dtype_(self.dtype), + 'value': self.value, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx, + 'force_cpu': self.force_cpu + } + + def init_data(self): + self.shape = [10, 10] + self.dtype = np.float32 + self.value = 100 + self.input_dim_idx = 0 + self.output_dim_idx = 0 + self.force_cpu = False + + def test_check_output(self): + self.check_output(check_eager=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 72cff705c14ef..589dfdb0f3e1a 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -718,6 +718,18 @@ data_type : dtype backend : place +- api : full_batch_size_like + args : (Tensor input, int[] shape, DataType dtype, Scalar value, int input_dim_idx, int output_dim_idx, Place place=CPUPlace()) + output: Tensor + infer_meta : + func : FullBatchSizeLikeInferMeta + param : [input, shape, value, dtype, input_dim_idx, output_dim_idx] + kernel : + func : full_batch_size_like + param : [input, shape, value, dtype, input_dim_idx, output_dim_idx] + data_type : dtype + backend : place + - api : full_like args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor 
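A minimal usage sketch of the dygraph path added in [PATCH 004/211] above (not part of the patch itself): paddle.fluid.layers.fill_constant_batch_size_like now dispatches to the new full_batch_size_like kernel when running in dygraph mode. This assumes a Paddle build that already contains that change; the shapes and values below are illustrative only.

    # Exercises the new dygraph branch added in python/paddle/fluid/layers/tensor.py above.
    import paddle
    from paddle.fluid import layers

    paddle.disable_static()                       # dygraph mode, so the _C_ops branch is taken
    like = paddle.zeros([4, 8], dtype='float32')  # batch dimension (dim 0) is 4
    out = layers.fill_constant_batch_size_like(
        input=like, shape=[1, 16], dtype='float32', value=3.0,
        input_dim_idx=0, output_dim_idx=0)
    # dim 0 of `out` is copied from `like`; the remaining dims come from `shape`
    print(out.shape)  # expected: [4, 16]

This mirrors what the new test_fill_constant_batch_size_like.py unit test checks through OpTest, written here as a direct API call.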
From 633ac4e61500729efe2046b86b0ba186fe76c3e8 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 7 Apr 2022 14:43:26 +0800 Subject: [PATCH 005/211] add send/recv to/from switch module for PrcoessGroupHeter (#41285) --- cmake/flags.cmake | 4 + .../distributed/collective/CMakeLists.txt | 4 +- .../distributed/collective/ProcessGroup.cc | 4 +- .../distributed/collective/ProcessGroup.h | 18 +- .../collective/ProcessGroupHCCL.cc | 6 - .../collective/ProcessGroupHeter.cc | 188 +++++++++++++++--- .../collective/ProcessGroupHeter.h | 7 +- .../collective/ProcessGroupNCCL.cc | 37 ++++ .../distributed/collective/ProcessGroupNCCL.h | 4 + .../operators/collective/c_broadcast_op.cu.cc | 2 +- paddle/fluid/pybind/CMakeLists.txt | 6 + paddle/fluid/pybind/distributed_py.cc | 35 ++++ 12 files changed, 264 insertions(+), 51 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index f90b71f9e60a8..5742a6b602ff3 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -244,3 +244,7 @@ if(WITH_ROCM) string (REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) endif() +if(WITH_PSCORE OR WITH_PSLIB) + string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) +endif() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 6fb805a72e4de..6d736d5543ce4 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,14 +7,14 @@ endif() if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) - if (WITH_DISTRIBUTE) + if (WITH_DISTRIBUTE AND WITH_PSCORE) cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() endif() if(WITH_ASCEND_CL) cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) - if (WITH_DISTRIBUTE) + if (WITH_DISTRIBUTE AND WITH_PSCORE) cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) endif() endif() diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index ab118dadd5d88..6da83a888683b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -35,10 +35,10 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { void ProcessGroup::Task::Synchronize() {} ProcessGroup::ProcessGroup(int rank, int size, int gid) - : rank_(rank), size_(size) { + : rank_(rank), size_(size), gid_(gid) { if (gid != IGNORE_ID) { auto map = ProcessGroupMapFromGid::getInstance(); - map->insert(gid, this); + map->insert(gid_, this); } } diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index c2ad1aa2c93ea..17d021852671e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -93,8 +93,8 @@ class ProcessGroup { } virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { - 
PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support broadcast for static", + PADDLE_THROW(platform::errors::Fatal( + "ProcessGroup%s does not support broadcast for static mode runtime", GetBackendName())); } @@ -148,6 +148,7 @@ class ProcessGroup { protected: const int rank_; const int size_; + const int gid_; }; class ProcessGroupMapFromGid { @@ -158,17 +159,20 @@ class ProcessGroupMapFromGid { } void insert(int gid, ProcessGroup* pg) { + // TODO(sandyhouse): address ut and uncomment the following codes // PADDLE_ENFORCE_EQ(has(gid), false, - // platform::errors::PreconditionNotMet( - // "The process group with id %d does exist.", gid)); + // platform::errors::PreconditionNotMet( + // "The process group with id %d doesnot exist.", + // gid)); map_[gid] = pg; } ProcessGroup* get(int gid) { + // TODO(sandyhouse): address ut and uncomment the following codes // PADDLE_ENFORCE_EQ(has(gid), true, - // platform::errors::PreconditionNotMet( - // "The process group with id %d doesnot exist.", - // gid)); + // platform::errors::PreconditionNotMet( + // "The process group with id %d doesnot exist.", + // gid)); return map_.find(gid)->second; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index b21155e09d06e..55945b5e0e396 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -30,12 +30,6 @@ constexpr int64_t kWaitBlockTImeout = 10; namespace paddle { namespace distributed { -// bool CheckTensorsInNPUPlace(const std::vector& tensors) { -// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { -// return t.place() == platform::DeviceType::NPU; -// }); -// } - void SyncDefaultStream( const std::vector& places, std::vector& hcclEvents, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index ffd653042494d..b3c9ddde50116 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -56,7 +56,8 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, local_size_(local_size), gloo_rank_(gloo_rank), gloo_size_(gloo_size), - with_switch_(with_switch) { + with_switch_(with_switch), + switch_endpoint_(switch_endpoint) { #if defined(PADDLE_WITH_NCCL) inner_pg_ = std::make_shared(store, local_rank, local_size, IGNORE_ID); @@ -64,14 +65,10 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, inner_pg_ = std::make_shared(store, local_rank, local_size, IGNORE_ID); #else - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(platform::errors::Fatal( "ProcessGroupHeter only supports NCCL and HCCL now."); #endif - if (with_switch_) { - // TODO(sandyhouse) starts a client to connect the cloud switch module - // std::shared_ptr client_ = - // HeterClient::GetInstance({switch_endpoint}, {}, 0); - } else if (local_rank_ == 0) { + if (local_rank_ == 0 && !with_switch_) { auto opts = ProcessGroupGloo::GlooOptions::create(); opts->device = ProcessGroupGloo::createDefaultDevice(); inter_pg_ = std::make_shared(store, gloo_rank_, @@ -79,6 +76,15 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, } } +template +static void _do_add(T* dst, T* src, size_t size) { + for (size_t i = 0; i < size; i++) { + *dst += *src; + dst++; + src++; + } +} + std::shared_ptr ProcessGroupHeter::AllReduce( std::vector& 
tensors, const AllreduceOptions& opts) { #if defined(PADDLE_WITH_NCCL) @@ -93,33 +99,92 @@ std::shared_ptr ProcessGroupHeter::AllReduce( // Step2: copy tensors to CPU if (local_rank_ == 0) { - std::vector cpu_tensors(tensors.size()); + std::vector cpu_tensors; + cpu_tensors.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { auto dense_gpu_tensor = std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - dense_cpu_tensor->Resize(tensors[i].dims()); + phi::DenseTensorMeta meta = phi::DenseTensorMeta( + dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); + std::shared_ptr dense_cpu_tensor = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); + cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), dense_cpu_tensor.get()); } // Step3: do inter cluster allreduce if (with_switch_) { - // TODO(sandyhouse) send to and recv from switch, and do add + if (local_rank_ == 0) { + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[0].impl()); + std::vector send_size; + send_size.push_back(dense_cpu_tensor->numel()); + int ret = client_->Send( + gid_, {dense_cpu_tensor->name()}, send_size, + dense_cpu_tensor->data(), + dense_cpu_tensor->numel() * + framework::DataTypeSize(dense_cpu_tensor->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Send to the switch module error.")); + phi::DenseTensorMeta meta = phi::DenseTensorMeta( + dense_cpu_tensor->dtype(), dense_cpu_tensor->dims()); + std::shared_ptr dense_cpu_tensor2 = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor->dims()); + Tensor cpu_tensor_temp = + paddle::experimental::Tensor(dense_cpu_tensor2); + ret = client_->Recv( + gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor2->data(), + dense_cpu_tensor2->numel() * + framework::DataTypeSize(dense_cpu_tensor2->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Recv from the switch module error.")); + + switch (dense_cpu_tensor->dtype()) { + case DataType::FLOAT32: + _do_add(reinterpret_cast(dense_cpu_tensor->data()), + reinterpret_cast(dense_cpu_tensor2->data()), + dense_cpu_tensor->numel()); + break; + case DataType::FLOAT64: + _do_add( + reinterpret_cast(dense_cpu_tensor->data()), + reinterpret_cast(dense_cpu_tensor2->data()), + dense_cpu_tensor->numel()); + break; + case DataType::INT32: + _do_add(reinterpret_cast(dense_cpu_tensor->data()), + reinterpret_cast(dense_cpu_tensor2->data()), + dense_cpu_tensor->numel()); + break; + default: + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Unsupported data type (%s) to do add.", + framework::DataType2String(dense_cpu_tensor->dtype()))); + } + } } else { auto gloo_task = inter_pg_->AllReduce(cpu_tensors, opts); gloo_task->Wait(); } // Step4: copy cpu tensors to gpu - // TODO(sandyhouse) // copy cpu tensors to gpu for (size_t i = 0; i < tensors.size(); i++) { auto dense_gpu_tensor = std::dynamic_pointer_cast(tensors[i].impl()); auto dense_cpu_tensor = std::dynamic_pointer_cast(cpu_tensors[i].impl()); - // framework::TensorCopySync(*dense_cpu_tensor, tensors[i].place(), - // dense_gpu_tensor.get()); 
framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), dense_gpu_tensor.get()); } @@ -147,18 +212,57 @@ std::shared_ptr ProcessGroupHeter::Broadcast( inner_pg_->Broadcast(tensors, b_opts); if (local_rank_ == 0) { - std::vector cpu_tensors(tensors.size()); + std::vector cpu_tensors; + cpu_tensors.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { auto dense_gpu_tensor = std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - dense_cpu_tensor->Resize(tensors[i].dims()); + phi::DenseTensorMeta meta = phi::DenseTensorMeta( + dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); + std::shared_ptr dense_cpu_tensor = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); + cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), dense_cpu_tensor.get()); } if (with_switch_) { - // TODO(sandyhouse) send to and recv + if (local_rank_ == 0) { + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[0].impl()); + if (gloo_rank_ == 0) { + std::vector send_size; + send_size.push_back(dense_cpu_tensor->numel()); + int ret = client_->Send( + gid_, {dense_cpu_tensor->name()}, send_size, + dense_cpu_tensor->data(), + dense_cpu_tensor->numel() * + framework::DataTypeSize(dense_cpu_tensor->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Send to the switch module error.")); + } else { + int ret = client_->Recv( + gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), + dense_cpu_tensor->numel() * + framework::DataTypeSize(dense_cpu_tensor->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Receive from the switch module error.")); + ret = client_->Recv( + gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), + dense_cpu_tensor->numel() * + framework::DataTypeSize(dense_cpu_tensor->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Receive from the switch module error.")); + } + } } else { auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts); gloo_task->Wait(); @@ -168,8 +272,6 @@ std::shared_ptr ProcessGroupHeter::Broadcast( std::dynamic_pointer_cast(tensors[i].impl()); auto dense_cpu_tensor = std::dynamic_pointer_cast(cpu_tensors[i].impl()); - // framework::TensorCopySync(*dense_cpu_tensor, tensors[i].place(), - // dense_gpu_tensor.get()); framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), dense_gpu_tensor.get()); } @@ -185,22 +287,44 @@ void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in, inner_pg_->Broadcast(in, out); if (local_rank_ == 0) { - Tensor cpu_tensor; - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensor.impl()); - dense_cpu_tensor->Resize(in->dims()); + phi::DenseTensorMeta meta = phi::DenseTensorMeta(in->dtype(), in->dims()); + std::shared_ptr dense_cpu_tensor = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + dense_cpu_tensor->ResizeAndAllocate(in->dims()); + Tensor cpu_tensor = paddle::experimental::Tensor(dense_cpu_tensor); framework::TensorCopySync(*in, platform::CPUPlace(), dense_cpu_tensor.get()); if (with_switch_) { - // TODO(sandyhouse) send to and recv + if (local_rank_ == 0) { + HeterClient* client_ = + 
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + if (gloo_rank_ == 0) { + std::vector send_size; + send_size.push_back(in->numel()); + int ret = client_->Send( + gid_, {in->name()}, send_size, dense_cpu_tensor->data(), + in->numel() * framework::DataTypeSize(in->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Send to the switch module error.")); + } else { + int ret = + client_->Recv(gid_, {in->name()}, dense_cpu_tensor->data(), + in->numel() * framework::DataTypeSize(in->dtype())); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Receive from the switch module error.")); + } + } } else { std::vector cpu_tensors = {cpu_tensor}; - // auto gloo_task = inter_pg_->Broadcast(cpu_tensors); - // gloo_task->Wait(); - inter_pg_->Broadcast(cpu_tensors); + auto gloo_task = inter_pg_->Broadcast(cpu_tensors); + gloo_task->Wait(); } - framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), - out); + framework::TensorCopySync(*dense_cpu_tensor, out->place(), out); } inner_pg_->Broadcast(out, out); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 8a26adbea4d78..892dbb9369e8d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -23,7 +23,6 @@ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" -// #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_GLOO @@ -48,6 +47,11 @@ #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" #endif +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#endif + #include "paddle/fluid/distributed/collective/Common.h" constexpr const char* HETER_BACKEND_NAME = "HETER_BACKEND"; @@ -108,6 +112,7 @@ class ProcessGroupHeter : public ProcessGroup { int gloo_rank_; int gloo_size_; bool with_switch_; + std::string switch_endpoint_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 7c0752b5f367c..eeb5e3b397c10 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -226,6 +226,43 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } +template +void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, + phi::DenseTensor* out, Fn fn, + CommType op_type) { + std::vector places; + places.push_back(in->place()); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + cuda_guard.SetDevice(places[0]); + memory::RecordStream(in->Holder(), places_to_ctx_[key][0]->stream()); + } + + { + platform::NCCLGroupGuard nccl_guard; + cuda_guard.SetDevice(places[0]); + const auto& nccl_stream = places_to_ctx_[key][0]->stream(); + fn(in, out, 
nccl_comms[0]->GetNcclComm(), nccl_stream); + } + + cuda_guard.SetDevice(places[0]); +} + template std::shared_ptr ProcessGroupNCCL::PointToPoint( std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4ab5374dacaf4..fa73ed195b0c1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -146,6 +146,10 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& outputs, // NOLINT Fn fn, CommType op_type); + template + void Collective(const phi::DenseTensor*, phi::DenseTensor*, Fn fn, + CommType op_type); + template std::shared_ptr PointToPoint( std::vector& tensors, // NOLINT diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 0ad61bb16b51e..7bdf5f0c46ca6 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -37,7 +37,6 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto map = distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(rid)) { // Use ProcessGroup @@ -46,6 +45,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { return; } + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b190f429410f4..f8e7081de01bd 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -91,12 +91,18 @@ if(NOT ON_INFER) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + if (WITH_PSCORE) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) + endif() endif() if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() if(WITH_ASCEND_CL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) + if (WITH_PSCORE) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) + endif() endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 6c74ea2eef4d0..38ed1d4f2bb5d 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -39,6 +39,11 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" #endif +#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) +#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/distributed/store/tcp_store.h" @@ -217,6 +222,21 @@ void BindDistributed(py::module *m) { int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), py::arg("group_id") = 0, py::call_guard()); + +#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) + py::class_>( + *m, "ProcessGroupHeter", ProcessGroup) + .def(py::init &, int, int, int, + int, int, int, int, bool, std::string>(), + py::arg("store"), py::arg("rank"), py::arg("world_size"), + py::arg("gid") = 0, py::arg("local_rank") = 0, + py::arg("local_size") = 1, py::arg("gloo_rank") = 0, + py::arg("gloo_size") = 1, py::arg("with_switch") = false, + py::arg("switch_endpoint") = "", + py::call_guard()); +#endif #endif #if defined(PADDLE_WITH_ASCEND_CL) @@ -227,6 +247,21 @@ void BindDistributed(py::module *m) { int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), py::arg("group_id") = 0, py::call_guard()); + +#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) + py::class_>( + *m, "ProcessGroupHeter", ProcessGroup) + .def(py::init &, int, int, int, + int, int, int, int, bool, std::string>(), + py::arg("store"), py::arg("rank"), py::arg("world_size"), + py::arg("gid") = 0, py::arg("local_rank") = 0, + py::arg("local_size") = 1, py::arg("gloo_rank") = 0, + py::arg("gloo_rank") = 1, py::arg("with_switch") = false, + py::arg("switch_endpoint") = "", + py::call_guard()); +#endif #endif py::class_ Date: Thu, 7 Apr 2022 14:50:31 +0800 Subject: [PATCH 006/211] fix compile bug of windows cuda11.5 (#41433) --- paddle/phi/kernels/funcs/activation_functor.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index eee6cf5640774..84da69ed5da02 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1878,12 +1878,17 @@ struct CudaCosGradFunctor : public BaseActivationFunctor { template struct CudaExpFunctor : public BaseActivationFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + // exp(x) = expf(x) + __device__ __forceinline__ T operator()(const T x) const { + return static_cast(expf(static_cast(x))); + } +}; +template <> +struct CudaExpFunctor : public BaseActivationFunctor { // exp(x) = exp(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(exp(x)); + __device__ __forceinline__ double operator()(const double x) const { + return exp(x); } }; From 73533b9b4f8afa80bd41c71c79cddf31812bcf42 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 7 Apr 2022 15:04:47 +0800 Subject: [PATCH 007/211] [Yaml] add unittest for prelu, gelu. 
(#41444) * add gelu pythonapi and unittest * fix prelu --- .../fluid/tests/unittests/test_gelu_op.py | 5 ++++ .../fluid/tests/unittests/test_prelu_op.py | 23 +++++++++++++++---- python/paddle/nn/functional/activation.py | 5 +++- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_gelu_op.py b/python/paddle/fluid/tests/unittests/test_gelu_op.py index de34b63c9398e..abfb65c27a951 100644 --- a/python/paddle/fluid/tests/unittests/test_gelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_gelu_op.py @@ -21,6 +21,7 @@ import paddle.fluid.dygraph as dg import paddle import paddle.nn.functional as F +from paddle.fluid.framework import _test_eager_guard def gelu(x, approximate): @@ -91,6 +92,10 @@ def run_gelu_op(approximate): np.allclose( x_g_ref, x_g_fast_math, rtol=1e-5, atol=5e-4)) + def test_fast_math_eager(self): + with _test_eager_guard(): + self.test_fast_math() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 56b32d41a9bd1..73c423a23e6ba 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -23,6 +23,7 @@ from op_test import OpTest, skip_check_grad_ci import paddle import paddle.nn.functional as F +from paddle.fluid.framework import _test_eager_guard def ref_prelu(x, weight): @@ -76,6 +77,10 @@ def test_dygraph_api(self): self.dygraph_check(self.weight_np_0) self.dygraph_check(self.weight_np_1) + def test_dygraph_api_eager(self): + with _test_eager_guard(): + self.test_dygraph_api() + def test_error(self): with paddle.static.program_guard(paddle.static.Program()): weight_fp32 = paddle.fluid.data( @@ -151,13 +156,19 @@ def test_dygraph_api(self): paddle.enable_static() +def prelu_api_wrapper(x, weight, data_format="NCHW"): + weight = weight.reshape([-1]) + return paddle.nn.functional.prelu(x, weight, data_format, name=None) + + class PReluTest(OpTest): def setUp(self): self.init_dtype() self.init_input_shape() + self.eager_mode = True self.init_attr() self.op_type = "prelu" - self.python_api = paddle.nn.functional.prelu + self.python_api = prelu_api_wrapper x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) # Since zero point in prelu is not differentiable, avoid randomize @@ -178,6 +189,8 @@ def setUp(self): alpha_np = np.random.uniform(-1, -0.5, [1, 1, 1, self.x_shape[-1]]) else: alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:]) + # eager check don't support mode = 'all' + self.eager_mode = False alpha_np = alpha_np.astype(self.dtype) self.inputs = {'X': x_np, 'Alpha': alpha_np} @@ -208,10 +221,10 @@ def init_attr(self): self.attrs = {'mode': "channel", "data_format": "NCHW"} def test_check_output(self): - self.check_output(check_eager=False) + self.check_output(check_eager=self.eager_mode) def test_check_grad(self): - self.check_grad(['X', 'Alpha'], 'Out', check_eager=False) + self.check_grad(['X', 'Alpha'], 'Out', check_eager=self.eager_mode) @skip_check_grad_ci( @@ -375,7 +388,7 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, atol=atol, check_eager=False) + place, atol=atol, check_eager=self.eager_mode) def test_check_grad(self): place = core.CUDAPlace(0) @@ -384,7 +397,7 @@ def test_check_grad(self): place, ['X', 'Alpha'], 'Out', max_relative_error=max_relative_error, - check_eager=False) + check_eager=self.eager_mode) 
cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestPReluFp16Case.__name__ = cls_name diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 3bdda982ff4f1..d145b615c3d7f 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -175,7 +175,10 @@ def gelu(x, approximate=False, name=None): # [ 0.84119201, 1.39957154]] """ - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_gelu(x, approximate) + + if _in_legacy_dygraph(): return _C_ops.gelu(x, 'approximate', approximate) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu') From f87f06560cf90687528da88918a0261df069740f Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Thu, 7 Apr 2022 09:41:49 +0200 Subject: [PATCH 008/211] Fix problem with py3.6 and test for quant2_int8_lstm (#41420) --- python/paddle/distributed/parallel.py | 1 + python/paddle/fluid/contrib/slim/tests/save_quant_model.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index d9d252024d9f3..f0365cab8c896 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -30,6 +30,7 @@ from paddle.distributed.fleet.launch_utils import check_backend from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401 +from paddle.distributed import collective from paddle.distributed.collective import _set_group_map from paddle.distributed.collective import _set_group_map_by_name from paddle.distributed.collective import _get_group_map_by_name diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index f97c2778c0918..73ec8cf3e023d 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -88,8 +88,8 @@ def transform_and_save_int8_model(original_path, debug=False, quant_model_filename='', quant_params_filename='', - save_model_filename='', - save_params_filename=''): + save_model_filename="__model__", + save_params_filename=None): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() From dfb4798603a8d231827bd70fdccc431b31b72989 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:16:27 +0800 Subject: [PATCH 009/211] Profile Executors (#41100) * Profile Executors * update * fix ut * fix names * update * update --- .../details/fast_threaded_ssa_graph_executor.cc | 8 ++++++-- paddle/fluid/framework/executor.cc | 8 ++++++++ paddle/fluid/framework/ir/cost_model.cc | 17 +++++++++++++++-- .../framework/new_executor/event_manager.cc | 7 +++++++ .../framework/new_executor/interpretercore.cc | 12 ++++++++++++ .../workqueue/nonblocking_threadpool.h | 4 ++-- .../new_executor/workqueue/workqueue.cc | 10 ++++++---- paddle/fluid/framework/parallel_executor.cc | 2 ++ .../auto_growth_best_fit_allocator.cc | 10 ++++++---- .../allocation/stream_safe_cuda_allocator.cc | 10 ++++++---- paddle/fluid/pybind/pybind.cc | 2 +- 11 files changed, 71 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 1b2b24762894c..ce471d55b24a1 100644 --- 
a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -132,6 +132,9 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } // Wait FetchOps. if (!fetch_ops.empty()) { + platform::RecordEvent record_wait( + "FastThreadedSSAGraphExecutor::WaitFetchOps", + platform::TracerEventType::Operator, 1); ClearFetchOp(graph_, &fetch_ops); for (auto &place : places_) { @@ -231,8 +234,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f951b5d0f5070..06ce9712f5c52 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -172,6 +172,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars, const std::vector& skip_ref_cnt_vars, bool force_disable_gc, bool keep_kid_scopes) { + platform::RecordEvent record_run("Executor::Run", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); @@ -301,6 +303,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, bool create_local_scope, bool create_vars, const std::string& feed_holder_name, const std::string& fetch_holder_name) { + platform::RecordEvent record_run("Executor::Run", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); #ifdef PADDLE_WITH_MKLDNN @@ -428,6 +432,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, int64_t end_op_index, bool create_local_scope, bool create_vars, bool keep_kids) { + platform::RecordEvent record_run("Executor::RunPartialPreparedContext", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(kProgramId); PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope shouldn't be null")); @@ -518,6 +524,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, auto& op = ctx->ops_[i]; op->Run(*local_scope, place_); if (gc) { + platform::RecordEvent record("CheckGC", + platform::TracerEventType::UserDefined, 10); DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } } diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 5027c50103a52..6086409ffd971 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -44,6 +44,19 @@ double CostData::GetWholeMemoryBytes() const { return whole_memory_bytes_; } const Graph* CostData::GetGraph() const { return graph_; } const ProgramDesc* CostData::GetProgram() const { return program_; } +static bool StringHasEnding(const std::string& full, + const std::string& ending) { + if (full.length() < ending.length()) { + return false; + } + if (full.length() == ending.length()) { + return full == ending; + } + size_t prefix_len = full.length() - ending.length(); + return 0 == full.compare(prefix_len, ending.length(), ending) && + full[prefix_len - 1] == '/'; +} + bool 
CostData::SetCostData(const ProgramDesc& program, const std::vector>& time_events) { // TODO(zhhsplendid): Make a copy so that CostData can be available even if @@ -77,7 +90,7 @@ bool CostData::SetCostData(const ProgramDesc& program, std::string op_type = op_desc->Type(); while (event_index < main_thread_events.size()) { - if (main_thread_events[event_index].name() == op_type && + if (StringHasEnding(main_thread_events[event_index].name(), op_type) && main_thread_events[event_index].type() == platform::EventType::kPushRange) { break; @@ -97,7 +110,7 @@ bool CostData::SetCostData(const ProgramDesc& program, // ControlFlow Op can be like that, but this version only support global // block // TODO(zhhsplendid): make a more strict mapping between push and pop - if (main_thread_events[event_index].name() == op_type && + if (StringHasEnding(main_thread_events[event_index].name(), op_type) && main_thread_events[event_index].type() == platform::EventType::kPopRange) { break; diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index cc6fd6e3ed0f9..bca2264b66afc 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/event_manager.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -24,6 +25,8 @@ void WaitEvent(const Instruction& instruction, const platform::Place& place) { VLOG(3) << "Deal StreamWaitEventOrSync for " << instruction.OpBase()->Type(); for (auto& event_iter : instruction.InputEvents()) { + platform::RecordEvent record("WaitStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "wait var_id: " << event_iter.var_id_ << " 's event with waiter_type: " << event_iter.waiter_type_; event_iter.event_->Wait(event_iter.waiter_type_, @@ -36,6 +39,8 @@ void RecordEvent(const Instruction& instruction, const platform::Place& place) { if (platform::is_cpu_place(place)) return; for (auto& event : instruction.OutputEvents()) { + platform::RecordEvent record("RecordStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "Record event in out_var_id: " << event.var_id_; event.event_->Record(&instruction.DeviceContext()); } @@ -46,6 +51,8 @@ void RecordEvent(const Instruction& instruction) { if (platform::is_cpu_place(instruction.DeviceContext().GetPlace())) return; for (auto& event : instruction.OutputEvents()) { + platform::RecordEvent record("RecordStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "Record event in out_var_id: " << event.var_id_; event.event_->Record(&instruction.DeviceContext()); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 29aa7b13a270e..20a6e53479323 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -489,6 +489,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_); if (!instr_node.InplaceBackMap().empty()) { + platform::RecordEvent inplaceback_event( + "InplaceVarsBack", platform::TracerEventType::UserDefined, 10); auto& m = instr_node.InplaceBackMap(); // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc for (auto& p : m) { @@ -530,6 +532,8 @@ void 
InterpreterCore::ExecuteInstructionList( return; } + platform::RecordEvent record_prepare( + "PrepareAtomic", platform::TracerEventType::UserDefined, 1); // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare // those for the next step auto atomic_deps = async_work_queue_->AtomicDeps(); @@ -537,6 +541,7 @@ void InterpreterCore::ExecuteInstructionList( async_work_queue_->PrepareAtomicDeps(dependecy_count_); async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo()); + record_prepare.End(); exception_holder_.Clear(); @@ -573,6 +578,9 @@ void InterpreterCore::RunNextInstructions( const Instruction& instr, std::queue* reserved_next_ops, std::vector>* atomic_deps, std::vector>* atomic_var_ref) { + platform::RecordEvent record("RunNextInstructions", + platform::TracerEventType::UserDefined, 10); + VLOG(4) << "atomic 1:" << atomic_deps; auto& next_instr = instr.NextInstructions(); auto IsReady = [atomic_deps](size_t next_id) { @@ -708,6 +716,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { instr.KernelType() != OpFuncType::kQueueAsync) { return; } + platform::RecordEvent record("RecordStreamForGC", + platform::TracerEventType::UserDefined, 10); gpuStream_t stream = reinterpret_cast( instr.DeviceContext()) @@ -799,6 +809,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { void InterpreterCore::CheckGC( const Instruction& instr, std::vector>* atomic_var_ref) { + platform::RecordEvent record("CheckGC", + platform::TracerEventType::UserDefined, 10); size_t instr_id = instr.Id(); auto& var_scope = *global_scope_; diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index bc65231abe737..384498584c66a 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -408,8 +408,8 @@ class ThreadPoolTempl { ec_.Notify(true); return false; } - platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 10); + platform::RecordEvent record("WaitForWork", + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 881878ebb12a7..b8dfcad187ca0 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -55,8 +55,9 @@ class WorkQueueImpl : public WorkQueue { } void AddTask(std::function fn) override { - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); if (tracker_ != nullptr) { fn = [ task = std::move(fn), raii = CounterGuard(tracker_) @@ -146,8 +147,9 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { } void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { fn = [ diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 
5b913ff2d21de..b088a535a1232 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -916,6 +916,8 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { + platform::RecordEvent record_run("ParallelExecutor::Run", + platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; #ifdef PADDLE_WITH_CUDA if (platform::IsCUDAGraphCapturing()) { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index f5e4941d78709..782062283e985 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -48,8 +48,9 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { - platform::RecordEvent("AutoGrowthBestFitAllocator::Allocate", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("AutoGrowthBestFitAllocator::Allocate", + platform::TracerEventType::UserDefined, + 9 /*level*/); size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -111,8 +112,9 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( } void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { - platform::RecordEvent("AutoGrowthBestFitAllocator::Free", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("AutoGrowthBestFitAllocator::Free", + platform::TracerEventType::UserDefined, + 9 /*level*/); VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 82233fd4fe821..80877cb670ba9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -163,8 +163,9 @@ void StreamSafeCUDAAllocator::SetDefaultStream(gpuStream_t stream) { } phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - platform::RecordEvent("StreamSafeCUDAAllocator::Allocate", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("StreamSafeCUDAAllocator::Allocate", + platform::TracerEventType::UserDefined, + 9 /*level*/); ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -192,8 +193,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { - platform::RecordEvent("StreamSafeCUDAAllocator::Free", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("StreamSafeCUDAAllocator::Free", + platform::TracerEventType::UserDefined, + 9 /*level*/); StreamSafeCUDAAllocation* stream_safe_cuda_allocation = static_cast(allocation); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 96d86ee1a3100..44abf3357d63d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2867,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle. 
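A note on the pattern repeated throughout this patch: a statement like platform::RecordEvent("name", type, level); constructs an unnamed temporary that is destroyed at the end of that statement, so the recorded range covers almost nothing; binding it to a named local such as "record" keeps the guard alive until the end of the enclosing scope, which is what the profiler needs. A loose Python analogy of the intended behaviour (a conceptual sketch only, not Paddle's profiler API):

from contextlib import contextmanager
import time

@contextmanager
def record_event(name):           # stand-in for the C++ RAII guard
    start = time.time()
    try:
        yield
    finally:
        print(f"{name}: {time.time() - start:.6f}s")

# like binding the RecordEvent to a named variable: the whole block is timed,
# not just the construction of the guard object
with record_event("WorkQueue::AddTask"):
    time.sleep(0.01)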
[](StandaloneExecutor &self, std::vector feed_names, std::vector fetch_names) { platform::RecordEvent record_event( - "StandaloneExecutor:run", + "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); paddle::framework::FetchList ret; { From 75227c9e35308dac71d710e8360eaa9854f97915 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 7 Apr 2022 16:38:16 +0800 Subject: [PATCH 010/211] use group id to differentiate keys for tcp store (#41496) --- paddle/fluid/distributed/collective/ProcessGroupNCCL.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index eeb5e3b397c10..b1d892e2521a3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -110,7 +110,8 @@ void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT if (rank_ == 0) { for (size_t i = 0; i < nccl_ids.size(); i++) { - auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" + + std::to_string(i); auto nccl_id = std::vector( reinterpret_cast(&nccl_ids[i]), reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); @@ -118,7 +119,8 @@ void ProcessGroupNCCL::BroadcastUniqueNCCLID( } } else { for (size_t i = 0; i < nccl_ids.size(); i++) { - auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" + + std::to_string(i); auto ret = store_->get(key); std::memcpy(&nccl_ids[i], ret.data(), ret.size()); } From edbb39863d8abf5b0eb9d101afb06dc2471f36b6 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:43:49 +0800 Subject: [PATCH 011/211] Switch some dy2st UT to eager mode (#41382) * Sitch some dy2st UT to eager mode * Fix test_lstm and remove test_transformer * Run test_resnet_v2 in old dy mode --- python/paddle/fluid/dygraph/varbase_patch_methods.py | 2 +- python/paddle/fluid/tests/unittests/CMakeLists.txt | 7 +++++++ .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- .../tests/unittests/dygraph_to_static/test_resnet_v2.py | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 4659c98abccc1..72aee0ba87e58 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -99,7 +99,7 @@ def _to_static_var(self, to_parameter=False, **kwargs): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). 
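The ProcessGroupNCCL change above namespaces each NCCL unique-id key in the TCP store by the group id, so two process groups created in the same job no longer read or overwrite each other's entries. A small sketch of the resulting key layout (the helper name is illustrative only):

def nccl_id_key(gid: int, i: int) -> str:
    # after the fix: one namespace per process group
    return "ProcessGroupNCCL/nccl_ids/" + str(gid) + "/" + str(i)

# before the fix every group used "ProcessGroupNCCL/nccl_ids/<i>" and collided
assert nccl_id_key(0, 0) != nccl_id_key(1, 0)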
- attr_not_need_keys = ['grad', 'T'] + attr_not_need_keys = ['grad', 'T', 'place', '_place_str'] if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() else: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6085360543e92..2e4259d2085c5 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -596,6 +596,13 @@ foreach(TEST_OP ${TEST_OPS_WITH_GC}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() +# Switch some dy2st UT to eager mode +set(TEST_EAGER_OPS test_jit_save_load test_translated_layer) +foreach(TEST_OP ${TEST_EAGER_OPS}) + list(REMOVE_ITEM TEST_OPS ${TEST_OP}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_enable_eager_mode=1) +endforeach() + if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer") list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op") diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index eeb377ff3b4a2..f046c7b73927e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -6,7 +6,7 @@ set(DY2ST_EAGER_TEST_ENVS ${GC_ENVS} FLAGS_enable_eager_mode=1) set(TEST_EAGER_OPS test_bmn test_break_continue test_ifelse test_loop test_mnist_amp test_mnist_pure_fp16 test_mobile_net test_program_translator test_ptb_lm test_reinforcement_learning test_resnet test_resnet_amp test_resnet_pure_fp16 test_se_resnet test_sentiment test_seq2seq - test_tsm test_word2vec test_yolov3) + test_tsm test_word2vec test_yolov3 test_bert test_cycle_gan test_lstm test_simnet) list(REMOVE_ITEM TEST_OPS test_lac) # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will # be removed and will cause some random failed in multi-thread. 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index ae7a588579059..0cf96b7159579 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -14,6 +14,8 @@ from __future__ import print_function +import os +os.environ["FLAGS_enable_eager_mode"] = "0" import math import time import unittest From 5516f180fc5e445be281a575304b0c2b70db9cee Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 7 Apr 2022 16:48:09 +0800 Subject: [PATCH 012/211] [Phi] Add unbind yaml and final state api (#41277) * add unbind yaml * fix unittest --- paddle/phi/api/lib/api_custom_impl.cc | 48 +++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 4 ++ paddle/phi/infermeta/unary.cc | 12 ++--- paddle/phi/infermeta/unary.h | 2 +- .../fluid/tests/unittests/test_unbind_op.py | 22 +++++++++ python/paddle/tensor/manipulation.py | 5 +- python/paddle/utils/code_gen/api.yaml | 6 +++ python/paddle/utils/code_gen/backward.yaml | 6 +++ 8 files changed, 97 insertions(+), 8 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index f559027fdd4b0..5d1851fb85aa2 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -475,6 +475,54 @@ std::tuple momentum_impl( return api_output; } +std::vector unbind_impl(const Tensor& input, int axis) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + + Backend kernel_backend = kernel_key.backend(); + DataLayout kernel_layout = kernel_key.layout(); + DataType kernel_data_type = kernel_key.dtype(); + + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "unbind", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "unbind API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << "unbind API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto dense_input = PrepareData(input, kernel.InputAt(0), {}); + + // Calculate the number of out tensors + auto input_shape = input.dims(); + if (axis < 0) { + axis = input_shape.size() + axis; + } + auto out_num = input_shape[axis]; + + std::vector out; + auto dense_outs = SetKernelOutput(out_num, kernel_backend, &out); + std::vector meta_outs; + meta_outs.reserve(out_num); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_num); + for (int64_t i = 0; i < out_num; ++i) { + meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::UnbindInferMeta(MakeMetaTensor(*dense_input), axis, meta_out_ptrs); + + using kernel_signature = void (*)(const phi::DeviceContext&, + const phi::DenseTensor&, + int, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *dense_input, axis, dense_outs); + + return out; +} + ////////////////// Backward(grad) api impls ////////////////////// // TODO(chenweihang): the original sum grad op can support higher-level diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 4745782d914ca..80ace229316a9 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -14,6 +14,8 @@ limitations under the License. 
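The test migration above is driven entirely by the FLAGS_enable_eager_mode environment variable: CMake exports it as 1 for the listed targets, while test_resnet_v2.py pins it back to 0 before paddle is imported. Run standalone, the opt-out looks like this (a sketch of the same mechanism):

import os
# must happen before "import paddle", exactly as in test_resnet_v2.py
os.environ["FLAGS_enable_eager_mode"] = "0"
import paddle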
*/ #pragma once +#include + #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" @@ -73,6 +75,8 @@ std::tuple momentum_impl( bool multi_precision, float rescale_grad); +std::vector unbind_impl(const Tensor& input, int axis); + ////////////////// Backward(grad) api impls ////////////////////// std::vector add_n_grad_impl(const std::vector& x, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index e0ea637074c20..0fedcca255c90 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2429,7 +2429,7 @@ void TransposeGradInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x, int axis, - std::vector* outs) { + std::vector outs) { auto in_dims = x.dims(); std::vector out_dim; axis = axis < 0 ? in_dims.size() + axis : axis; @@ -2438,11 +2438,11 @@ void UnbindInferMeta(const MetaTensor& x, } auto out_dims = phi::make_ddim(out_dim); - for (size_t i = 0; i < outs->size(); ++i) { - (*outs)[i].set_dtype(x.dtype()); - (*outs)[i].set_dims(out_dims); - (*outs)[i].set_layout(x.layout()); - (*outs)[i].share_lod(x); + for (size_t i = 0; i < outs.size(); ++i) { + outs[i]->set_dtype(x.dtype()); + outs[i]->set_dims(out_dims); + outs[i]->set_layout(x.layout()); + outs[i]->share_lod(x); } } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 5106c6f448733..1d69c9504d9cd 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -365,7 +365,7 @@ void TrilTriuInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x, int axis, - std::vector* outs); + std::vector outs); void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py index e16fb6ddaacd7..43f2f3526ac0f 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -17,9 +17,11 @@ import unittest import numpy as np from op_test import OpTest, convert_float_to_uint16 +import paddle import paddle.fluid as fluid import paddle.tensor as tensor from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard class TestUnbind(unittest.TestCase): @@ -39,6 +41,25 @@ def test_unbind(self): assert np.array_equal(res_1, input_1[0, 0:100]) assert np.array_equal(res_2, input_1[1, 0:100]) + def test_unbind_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.random.random([2, 3]).astype("float32") + x = paddle.to_tensor(np_x) + x.stop_gradient = False + [res_1, res_2] = paddle.unbind(x, 0) + self.assertTrue(np.array_equal(res_1, np_x[0, 0:100])) + self.assertTrue(np.array_equal(res_2, np_x[1, 0:100])) + + out = paddle.add_n([res_1, res_2]) + + np_grad = np.ones(x.shape, np.float32) + out.backward() + self.assertTrue(np.array_equal(x.grad.numpy(), np_grad)) + + def test_unbind_dygraph_final_state(self): + with _test_eager_guard(): + self.test_unbind_dygraph() + class TestLayersUnbind(unittest.TestCase): def test_layers_unbind(self): @@ -157,6 +178,7 @@ def outReshape(self): class TestUnbindBF16Op(OpTest): def setUp(self): self._set_op_type() + self.python_api = paddle.unbind self.dtype = self.get_dtype() self.axis = 0 self.num = 3 diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 7e19feba90676..0f90cf6950aff 100755 --- a/python/paddle/tensor/manipulation.py +++ 
b/python/paddle/tensor/manipulation.py @@ -1469,6 +1469,9 @@ def unbind(input, axis=0): # x3.shape [3, 5] """ + if in_dygraph_mode(): + return _C_ops.final_state_unbind(input, axis) + if not isinstance(axis, (int)): raise TypeError("The type of 'axis' must be int, but received %s." % (type(axis))) @@ -1477,7 +1480,7 @@ def unbind(input, axis=0): input_shape = input.shape axis_ = axis if axis >= 0 else len(input_shape) + axis num = input_shape[axis_] - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.unbind(input, num, 'axis', axis) helper = LayerHelper("unbind", **locals()) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 589dfdb0f3e1a..4f46b6d0e55ec 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1939,6 +1939,12 @@ backend : place data_type : dtype +- api : unbind + args : (Tensor input, int axis) + output : Tensor[] + invoke : unbind_impl(input, axis) + backward : unbind_grad + # unfold - api : unfold args : (Tensor x, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 942089f18ce55..3f6dc0e7477ab 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1480,6 +1480,12 @@ kernel : func : trunc_grad +- backward_api : unbind_grad + forward : unbind (Tensor input, int axis) -> Tensor[](out) + args : (Tensor[] out_grad, int axis) + output : Tensor(input_grad) + invoke : stack(out_grad, axis) + - backward_api : unfold_grad forward : unfold (Tensor x, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) From c77a263d263654a2e3afa3baef7b2a49d042e35e Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Thu, 7 Apr 2022 17:03:45 +0800 Subject: [PATCH 013/211] Add yaml for matrix rank op (#41466) * modify matrix_rank * add matrix_rank shape * add matrix_rank shape * Add yaml for matrix_rank OP * Add UT Co-authored-by: zhoujianqian <15205085056@163.com> --- paddle/phi/infermeta/binary.cc | 51 +++++++++++++++++++ paddle/phi/infermeta/binary.h | 6 +++ paddle/phi/infermeta/unary.cc | 35 +++++++++++++ paddle/phi/infermeta/unary.h | 5 ++ .../tests/unittests/test_matrix_rank_op.py | 29 ++++++++++- python/paddle/tensor/linalg.py | 20 +++++++- python/paddle/utils/code_gen/api.yaml | 17 +++++++ 7 files changed, 161 insertions(+), 2 deletions(-) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 298ad14f9e04b..2139605fb2048 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -64,6 +64,16 @@ static void BinarySameInputDimsCheck(const MetaTensor& x, } } +// Used in MatrixRankTolInferMeta +static DDim CheckAndGetOutputDim(const DDim& dim_x) { + auto x_vec = phi::vectorize(dim_x); + if (x_vec.size() == 2) { + return phi::make_ddim({1}); + } + x_vec.erase(x_vec.end() - 2, x_vec.end()); + return phi::make_ddim(x_vec); +} + } // namespace detail void AllValueCompareInferMeta(const MetaTensor& x, @@ -1465,6 +1475,47 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, out->share_lod(x); } +void MatrixRankTolInferMeta(const MetaTensor& x, + const MetaTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + MetaTensor* out) { + auto dim_x = x.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + 2, + phi::errors::InvalidArgument("The 
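With the yaml entry and unbind_impl above, paddle.unbind dispatches to the final-state kernel in eager mode and its backward is simply a stack over the output gradients. A minimal check, mirroring the new dygraph unit test:

import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(2, 3).astype("float32"), stop_gradient=False)
a, b = paddle.unbind(x, axis=0)      # two tensors of shape [3]
out = paddle.add_n([a, b])
out.backward()
print(x.grad)                        # all ones, since unbind_grad stacks the output grads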
dims of input must be greater than 2")); + + if (hermitian) { + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_EQ(rows, + cols, + phi::errors::InvalidArgument( + "if hermitian == true, matrix should be n*n")); + } + DDim dim_x_batch = detail::CheckAndGetOutputDim(dim_x); + auto dim_tol = atol_tensor.dims(); + if (dim_x_batch == dim_tol) { + out->set_dims(dim_x_batch); + } else { + int max_dim = std::max(dim_x_batch.size(), dim_tol.size()); + int axis = std::abs(dim_x_batch.size() - dim_tol.size()); + std::vector x_batch_dims_array(max_dim); + std::vector tol_dims_array(max_dim); + std::vector out_dims_array(max_dim); + phi::funcs::GetBroadcastDimsArrays(dim_x_batch, + dim_tol, + x_batch_dims_array.data(), + tol_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out->set_dims(phi::make_ddim(out_dims_array)); + } + out->share_lod(x); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 70c3c9dfe849d..192fa214c905f 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -218,6 +218,12 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, int y_num_col_dims, MetaTensor* out); +void MatrixRankTolInferMeta(const MetaTensor& x, + const MetaTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + MetaTensor* out); + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); void PReluInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0fedcca255c90..a81a0e1503a9b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -31,6 +31,18 @@ limitations under the License. 
*/ namespace phi { +namespace detail { +// Used in MatrixRankInferMeta +static DDim CheckAndGetOutputDim(const DDim& dim_x) { + auto x_vec = phi::vectorize(dim_x); + if (x_vec.size() == 2) { + return phi::make_ddim({1}); + } + x_vec.erase(x_vec.end() - 2, x_vec.end()); + return phi::make_ddim(x_vec); +} +} // namespace detail + void ArgMinMaxInferMeta(const MetaTensor& x, int64_t axis, bool keepdims, @@ -901,6 +913,29 @@ void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { out->set_dtype(x.dtype()); } +void MatrixRankInferMeta(const MetaTensor& x, + bool use_default_tol, + bool hermitian, + MetaTensor* out) { + auto dim_x = x.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + 2, + phi::errors::InvalidArgument("The dims of input must be greater than 2")); + + if (hermitian) { + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_EQ(rows, + cols, + phi::errors::InvalidArgument( + "if hermitian == true, matrix should be n*n")); + } + DDim dim_x_batch = detail::CheckAndGetOutputDim(dim_x); + out->set_dims(dim_x_batch); + out->share_lod(x); +} + void MaxOutInferMeta(const MetaTensor& x, int groups, int axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 1d69c9504d9cd..63a1dd52bbb0f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -142,6 +142,11 @@ void LogsumexpInferMeta(const MetaTensor& input, void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); +void MatrixRankInferMeta(const MetaTensor& x, + bool use_default_tol, + bool hermitian, + MetaTensor* out); + void MaxOutInferMeta(const MetaTensor& x, int groups, int axis, diff --git a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py index d0b84a0d7e108..b13b346261762 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py @@ -30,8 +30,13 @@ np.random.seed(SEED) +def matrix_rank_wraper(x, tol=None, use_default_tol=True, hermitian=False): + return paddle.linalg.matrix_rank(x, tol, hermitian) + + class TestMatrixRankOP(OpTest): def setUp(self): + self.python_api = matrix_rank_wraper self.op_type = "matrix_rank" self.init_data() self.inputs = {'X': self.x} @@ -44,7 +49,7 @@ def setUp(self): self.outputs = {'Out': self.out} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def init_data(self): self.x = np.eye(3, dtype=np.float32) @@ -110,6 +115,28 @@ def init_data(self): self.hermitian) +class TestMatrixRankOP6(TestMatrixRankOP): + def init_data(self): + self.x = np.random.rand(3, 4, 5, 6).astype(np.float32) + self.tol_tensor = None + self.tol = None + self.use_default_tol = False + self.hermitian = False + self.out = np.linalg.matrix_rank(self.x, self.tol_tensor, + self.hermitian) + + +class TestMatrixRankOP7(TestMatrixRankOP): + def init_data(self): + self.x = np.eye(200, dtype=np.float64) + self.tol_tensor = np.random.random([200, 200]).astype(self.x.dtype) + self.tol = None + self.use_default_tol = True + self.hermitian = True + self.out = np.linalg.matrix_rank(self.x, self.tol_tensor, + self.hermitian) + + class TestMatrixRankAPI(unittest.TestCase): def test_dygraph(self): paddle.disable_static() diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 876fd5ed5e958..eb15183cb0cc5 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1284,8 +1284,26 @@ def 
matrix_rank(x, tol=None, hermitian=False, name=None): # [1, 1, 1, 1]] """ + if in_dygraph_mode(): + if isinstance(tol, Variable): + if tol.dtype != x.dtype: + tol_tensor = cast(tol, x.dtype) + else: + tol_tensor = tol + use_default_tol = False + return _C_ops.final_state_matrix_rank_tol( + x, tol_tensor, use_default_tol, hermitian) - if paddle.in_dynamic_mode(): + if tol is None: + tol_attr = 0.0 + use_default_tol = True + else: + tol_attr = float(tol) + use_default_tol = False + return _C_ops.final_state_matrix_rank(x, tol_attr, use_default_tol, + hermitian) + + if _in_legacy_dygraph(): if tol is None: tol_tensor = None tol_attr = 0.0 diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 4f46b6d0e55ec..97e8795818451 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1157,6 +1157,23 @@ func : matrix_power backward : matrix_power_grad +- api : matrix_rank + args : (Tensor x, float tol, bool use_default_tol=true, bool hermitian=false) + output : Tensor(out) + infer_meta : + func : MatrixRankInferMeta + param : [x, use_default_tol, hermitian] + kernel : + func : matrix_rank + +- api : matrix_rank_tol + args : (Tensor x, Tensor atol_tensor, bool use_default_tol=true, bool hermitian=false) + output : Tensor(out) + infer_meta : + func : MatrixRankTolInferMeta + kernel : + func : matrix_rank_tol + - api : max args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) From 8fb8fa4109592c49b995be9b246c30d40bce6935 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Thu, 7 Apr 2022 17:09:53 +0800 Subject: [PATCH 014/211] [Eager] Fix tensor type (#41468) --- python/paddle/tensor/logic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 03d0f42d8417b..ffd827b0eb530 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -16,9 +16,13 @@ from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.layers.layer_function_generator import templatedoc from ..static import Variable -from ..framework import VarBase as Tensor from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode -# TODO: define logic functions of a tensor +# TODO: define logic functions of a tensor +import paddle.fluid as fluid +if fluid.framework._in_eager_mode_: + Tensor = fluid.framework.core.eager.Tensor +else: + from ..framework import VarBase as Tensor from ..fluid.layers import is_empty # noqa: F401 from ..fluid.layers import logical_and # noqa: F401 from ..fluid.layers import logical_not # noqa: F401 From 90cb337ee315abb133d094340081ed7f4744c8e5 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 7 Apr 2022 20:32:14 +0800 Subject: [PATCH 015/211] [Phi]Add hard_swish/kron/linspace/logit yaml file (#41298) * add yaml * perfect converage --- paddle/fluid/operators/linspace_op.cc | 2 +- paddle/phi/infermeta/ternary.cc | 16 ++++++-- paddle/phi/infermeta/ternary.h | 6 +++ paddle/phi/kernels/activation_grad_kernel.h | 1 + python/paddle/fluid/layers/tensor.py | 6 ++- .../tests/unittests/test_activation_op.py | 13 ++++++- .../fluid/tests/unittests/test_kron_op.py | 29 ++++++++++---- .../fluid/tests/unittests/test_linspace.py | 15 +++++-- .../fluid/tests/unittests/test_logit_op.py | 12 +++++- python/paddle/nn/functional/activation.py | 5 ++- python/paddle/tensor/math.py | 10 +++-- python/paddle/utils/code_gen/api.yaml | 39 +++++++++++++++++++ python/paddle/utils/code_gen/backward.yaml | 31 
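The two matrix_rank entries registered above cover both call forms: a plain float tol becomes the attribute of the matrix_rank kernel, and a Tensor tol routes to matrix_rank_tol, where it is broadcast against the batch dimensions. In eager mode:

import paddle

a = paddle.eye(10)
print(paddle.linalg.matrix_rank(a))                     # 10
print(paddle.linalg.matrix_rank(a, hermitian=True))     # 10

b = paddle.rand([3, 4, 5, 6])
tol = paddle.to_tensor(0.1)
# a Tensor tol is broadcast over the batch dims, so the result has shape [3, 4]
print(paddle.linalg.matrix_rank(b, tol=tol).shape)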
+++++++++++++++ 13 files changed, 158 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 5599debbf3871..1cd59672f97fc 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -67,7 +67,7 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, - PD_INFER_META(phi::LinspaceInferMeta)); + PD_INFER_META(phi::LinspaceRawInferMeta)); REGISTER_OPERATOR( linspace, ops::LinspaceOp, ops::LinspaceOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 3e4aa7b4448e3..c692b6c8fcd13 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -276,10 +276,10 @@ void LerpInferMeta(const MetaTensor& x, out->share_lod(x); } -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out) { +void LinspaceRawInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { auto s_dims = start.dims(); PADDLE_ENFORCE_EQ( (s_dims.size() == 1) && (s_dims[0] == 1), @@ -305,6 +305,14 @@ void LinspaceInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + DataType dtype, + MetaTensor* out) { + LinspaceRawInferMeta(start, stop, number, out); +} + void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 00e49811688ac..83505f2c2fada 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -65,9 +65,15 @@ void LerpInferMeta(const MetaTensor& x, const MetaTensor& weight, MetaTensor* out); +void LinspaceRawInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); + void LinspaceInferMeta(const MetaTensor& start, const MetaTensor& stop, const MetaTensor& number, + DataType dtype, MetaTensor* out); void NllLossRawInferMeta(const MetaTensor& input, diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 82e168a3c630b..065d018852267 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -197,6 +197,7 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a63e87472ebed..e302371988739 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -1548,10 +1548,12 @@ def linspace(start, stop, num, dtype=None, name=None): if not isinstance(num, Variable): with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num) - if _non_static_mode(): + if _in_legacy_dygraph(): return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) - + if in_dygraph_mode(): + return 
_C_ops.final_state_linspace(tensor_start, tensor_stop, + tensor_num, dtype) helper = LayerHelper("linspace", **locals()) start_dtype = convert_dtype(tensor_start.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 89f8ebbd0cafb..80fef6d37576f 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -25,6 +25,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -1755,7 +1756,7 @@ class TestHardSwish(TestActivation): def setUp(self): self.op_type = 'hard_swish' self.init_dtype() - + self.python_api = paddle.nn.functional.hardswish skip_check_grad_ci(reason="not implemented yet") np.random.seed(1024) @@ -1777,7 +1778,10 @@ def test_check_grad(self): return return # not implemented yet - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) + + def test_check_output(self): + self.check_output(check_eager=True) class TestHardswishAPI(unittest.TestCase): @@ -1838,6 +1842,11 @@ def test_errors(self): name='x_fp16', shape=[12, 10], dtype='float16') F.hardswish(x_fp16) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph_api() + self.test_errors() + class TestSoftRelu(TestActivation): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_kron_op.py b/python/paddle/fluid/tests/unittests/test_kron_op.py index d6db4c2f074a9..f4d013b7c6a3e 100644 --- a/python/paddle/fluid/tests/unittests/test_kron_op.py +++ b/python/paddle/fluid/tests/unittests/test_kron_op.py @@ -21,11 +21,13 @@ import paddle import paddle.fluid as fluid import paddle.fluid.dygraph as dg +from paddle.fluid.framework import _test_eager_guard class TestKronOp(OpTest): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(10, 10)).astype(self.dtype) y = np.random.uniform(size=(10, 10)).astype(self.dtype) @@ -37,21 +39,22 @@ def _init_dtype(self): return "float64" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ignore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set('X')) + self.check_grad(['Y'], 'Out', no_grad_set=set('X'), check_eager=True) def test_check_grad_ignore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_eager=True) class TestKronOp2(TestKronOp): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(5, 5, 4)).astype(self.dtype) y = np.random.uniform(size=(10, 10)).astype(self.dtype) @@ -63,6 +66,7 @@ def setUp(self): class TestKronOp3(TestKronOp): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(10, 10)).astype(self.dtype) y = np.random.uniform(size=(5, 5, 4)).astype(self.dtype) @@ -101,10 +105,16 @@ def test_case_with_output(self): c, = exe.run(main, feed={'a': a, 'b': b}, fetch_list=[out_var]) np.testing.assert_allclose(c, np.kron(a, b)) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_case() + 
self.test_case_with_output() + class TestComplexKronOp(OpTest): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.x_shape = np.array([10, 10]) self.y_shape = np.array([3, 35]) self.out_shape = self.x_shape * self.y_shape @@ -160,14 +170,15 @@ def get_grad_y_by_numpy(self): return grad_y def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad( ['X', 'Y'], 'Out', user_defined_grads=[self.grad_x, self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -175,7 +186,8 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_y(self): self.check_grad( @@ -183,7 +195,8 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestKronOpTypePromotion(TestComplexKronOp): diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 54846e6a14bd2..65a6c21fb0720 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -21,11 +21,13 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard class TestLinspaceOpCommonCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -37,12 +39,13 @@ def setUp(self): self.outputs = {'Out': np.arange(0, 11).astype(dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestLinspaceOpReverseCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -54,12 +57,13 @@ def setUp(self): self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestLinspaceOpNumOneCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -71,7 +75,7 @@ def setUp(self): self.outputs = {'Out': np.array(10, dtype=dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestLinspaceAPI(unittest.TestCase): @@ -123,6 +127,11 @@ def test_imperative(self): self.assertEqual((out2.numpy() == np_out2).all(), True) self.assertEqual((out3.numpy() == np_out3).all(), True) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_variable_input2() + self.test_imperative() + class TestLinspaceOpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_logit_op.py b/python/paddle/fluid/tests/unittests/test_logit_op.py index 9254996eb4463..9b46039da13b1 100644 --- a/python/paddle/fluid/tests/unittests/test_logit_op.py +++ b/python/paddle/fluid/tests/unittests/test_logit_op.py @@ -16,6 +16,7 @@ import numpy as np from 
op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard np.random.seed(10) @@ -37,6 +38,7 @@ def logit_grad(x, eps=1e-8): class TestLogitOp(OpTest): def setUp(self): self.op_type = 'logit' + self.python_api = paddle.logit self.dtype = np.float64 self.shape = [120] self.eps = 1e-8 @@ -52,10 +54,11 @@ def set_attrs(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad( + ['X'], ['Out'], user_defined_grads=[self.x_grad], check_eager=True) class TestLogitShape(TestLogitOp): @@ -106,6 +109,11 @@ def test_errors(self): x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') self.assertRaises(TypeError, paddle.logit, x, dtype='int32') + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_check_api() + self.test_errors() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index d145b615c3d7f..10bf5d9a46c6b 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -28,6 +28,7 @@ import paddle from paddle import _C_ops, in_dynamic_mode from paddle.framework import core +from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [] @@ -386,8 +387,10 @@ def hardswish(x, name=None): out = F.hardswish(x) # [0., 5., 0.666667] """ - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.hard_swish(x) + if in_dygraph_mode(): + return _C_ops.final_state_hard_swish(x, 6, 6, 3) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardswish') diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9751892e70188..311f5f8edd5d6 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2674,9 +2674,10 @@ def kron(x, y, name=None): # [12, 15, 18, 16, 20, 24], # [21, 24, 27, 28, 32, 36]]) """ - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.kron(x, y) - + if in_dygraph_mode(): + return _C_ops.final_state_kron(x, y) helper = LayerHelper('kron', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') @@ -3525,9 +3526,10 @@ def logit(x, eps=None, name=None): if eps == None: eps = 0.0 - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.logit(x, 'eps', eps) - + if in_dygraph_mode(): + return _C_ops.final_state_logit(x, eps) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'logit') helper = LayerHelper("logit", **locals()) out = helper.create_variable_for_type_inference(x.dtype) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 97e8795818451..e41495bf0c3b1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -838,6 +838,16 @@ func : hard_sigmoid backward : hard_sigmoid_grad +- api : hard_swish + args : (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_swish + backward : hard_swish_grad + # histogram - api : histogram args : (Tensor x, int64_t bins, int min, int max) @@ -949,6 +959,15 @@ data_type : x backward : kldiv_loss_grad +- api : kron + args : (Tensor x, 
Tensor y) + output : Tensor + infer_meta : + func : KronInferMeta + kernel : + func : kron + backward : kron_grad + - api : kthvalue args : (Tensor x, int k, int axis, bool keepdim) output : Tensor(out), Tensor(indices) @@ -1016,6 +1035,15 @@ func : lgamma backward : lgamma_grad +- api : linspace + args : (Tensor start, Tensor stop, Tensor number, DataType dtype) + output : Tensor + infer_meta : + func : LinspaceInferMeta + kernel : + func : linspace + data_type : dtype + - api : log args : (Tensor x) output : Tensor @@ -1107,6 +1135,17 @@ kernel : func : logical_xor +# logit +- api : logit + args : (Tensor x, float eps = 1e-6f) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit + backward : logit_grad + # logsigmoid - api : logsigmoid args : (Tensor x) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3f6dc0e7477ab..917fd5ec442ca 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -568,6 +568,16 @@ kernel : func : hard_sigmoid_grad +- backward_api : hard_swish_grad + forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float threshold, float scale, float offset) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_swish_grad + - backward_api : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) args : (Tensor residual, Tensor out_grad, float delta) @@ -617,6 +627,17 @@ kernel : func : kldiv_loss_grad +- backward_api : kron_grad + forward : kron (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : kron_grad + data_type : out_grad + - backward_api : kthvalue_grad forward : kthvalue(Tensor x, int k, int axis, bool keepdim) -> Tensor(out), Tensor(indices) args : (Tensor x, Tensor indices, Tensor out_grad, int k, int axis, bool keepdim) @@ -728,6 +749,16 @@ kernel : func : log_softmax_grad +- backward_api : logit_grad + forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float eps) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit_grad + - backward_api : logsigmoid_grad forward : logsigmoid (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) From 50ddc0b2d707cc2f57576350d20e3f312bf603d0 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 7 Apr 2022 21:34:24 +0800 Subject: [PATCH 016/211] Add dist norm yamls (#41424) * add dist erfinv gumbel softmax * fix test gumbel softmax op bug * try to fix gumbel softmax error * add label smooth backlist --- .../fluid/tests/unittests/test_dist_op.py | 10 ++++++++-- .../fluid/tests/unittests/test_erfinv_op.py | 3 ++- .../tests/unittests/test_expand_v2_op.py | 1 + .../tests/unittests/test_gumbel_softmax_op.py | 18 +++++++++++------ python/paddle/nn/functional/activation.py | 3 +++ python/paddle/tensor/linalg.py | 3 +++ python/paddle/tensor/math.py | 3 +++ python/paddle/utils/code_gen/api.yaml | 2 +- python/paddle/utils/code_gen/backward.yaml | 20 +++++++++---------- 9 files changed, 43 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py 
b/python/paddle/fluid/tests/unittests/test_dist_op.py index b9b8ea92cb3a8..ad999c3feae42 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_dist_op.py @@ -37,6 +37,7 @@ def dist(x, y, p): class TestDistOp(OpTest): def setUp(self): self.op_type = 'dist' + self.python_api = paddle.dist self.attrs = {} self.init_case() self.init_data_type() @@ -106,10 +107,14 @@ def get_reduce_dims(x, y): return x_grad, y_grad def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X", "Y"], "Out", user_defined_grads=self.gradient) + self.check_grad( + ["X", "Y"], + "Out", + user_defined_grads=self.gradient, + check_eager=True) class TestDistOpCase1(TestDistOp): @@ -174,4 +179,5 @@ def test_api(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_erfinv_op.py b/python/paddle/fluid/tests/unittests/test_erfinv_op.py index 847a868dd6ca0..5b5a7c0384316 100644 --- a/python/paddle/fluid/tests/unittests/test_erfinv_op.py +++ b/python/paddle/fluid/tests/unittests/test_erfinv_op.py @@ -28,6 +28,7 @@ class TestErfinv(OpTest): def setUp(self): self.op_type = "erfinv" + self.python_api = paddle.erfinv self.init_dtype() self.shape = [11, 17] self.x = np.random.uniform(-1, 1, size=self.shape).astype(self.dtype) @@ -42,7 +43,7 @@ def init_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index a204c26c1b823..70b3fda79b50f 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -27,6 +27,7 @@ class TestExpandV2OpRank1(OpTest): def setUp(self): self.op_type = "expand_v2" self.init_data() + self.python_api = paddle.expand self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")} self.attrs = {'shape': self.shape} diff --git a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py index e423404d07fb1..7c706eabd1d7a 100644 --- a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py @@ -17,6 +17,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -177,12 +178,17 @@ def test_check_api(self): self.assertEqual(out_np.sum(), self.count_expected) # test dygrapg api - paddle.disable_static() - x = paddle.to_tensor(self.x) - y = paddle.nn.functional.gumbel_softmax(x, hard=True) - out_np = np.array(y) - self.assertEqual(out_np.sum(), self.count_expected) - paddle.enable_static() + with paddle.fluid.dygraph.base.guard(): + x = paddle.to_tensor(self.x) + y = paddle.nn.functional.gumbel_softmax(x, hard=True) + out_np = np.array(y) + self.assertEqual(out_np.sum(), self.count_expected) + + with _test_eager_guard(): + x = paddle.to_tensor(self.x) + y = paddle.nn.functional.gumbel_softmax(x, hard=True) + out_np = np.array(y) + self.assertEqual(out_np.sum(), self.count_expected) class TestGumbelSoftmaxOpError(unittest.TestCase): diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py 
index 10bf5d9a46c6b..62567fa2a6113 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1524,6 +1524,9 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] """ + if in_dygraph_mode(): + return _C_ops.final_state_gumbel_softmax(x, temperature, hard, axis) + if in_dynamic_mode(): return _C_ops.gumbel_softmax(x, 'temperature', temperature, 'hard', hard, 'axis', axis) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index eb15183cb0cc5..e29513beb166e 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -551,6 +551,9 @@ def dist(x, y, p=2, name=None): out = paddle.dist(x, y, float("-inf")) print(out) # out = [0.] """ + if in_dygraph_mode(): + return _C_ops.final_state_dist(x, y, p) + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'dist') check_variable_and_dtype(y, 'dtype', ['float32', 'float64'], 'dist') check_type(p, 'p', (float, int), 'dist') diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 311f5f8edd5d6..a1d27ab904e82 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3636,6 +3636,9 @@ def erfinv(x, name=None): # out: [0, 0.4769, -inf] """ + if in_dygraph_mode(): + return _C_ops.final_state_erfinv( x ) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv') if paddle.in_dynamic_mode(): diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index e41495bf0c3b1..90e08c68cf411 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -814,7 +814,7 @@ func : GumbelSoftmaxInferMeta kernel : func : gumbel_softmax - # backward : gumbel_softmax_grad + backward : gumbel_softmax_grad # hard_shrink - api : hard_shrink diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 917fd5ec442ca..1e41a0e79491c 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,13 +1,3 @@ -# - backward_api : gumbel_softmax_grad -# forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) -# args : (Tensor out, Tensor out_grad, int axis) -# output : Tensor(x_grad) -# infer_meta : -# func : GumbelSoftmaxGradInferMeta -# param : [out, out_grad, axis] -# kernel : -# func : gumbel_softmax_grad - - backward_api : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -548,6 +538,16 @@ func : graph_send_recv_grad optional: out, dst_count +- backward_api : gumbel_softmax_grad + forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) + args : (Tensor out, Tensor out_grad, int axis) + output : Tensor(x_grad) + infer_meta : + func : GumbelSoftmaxGradInferMeta + param : [out, out_grad, axis] + kernel : + func : gumbel_softmax_grad + - backward_api : hard_shrink_grad forward : hard_shrink (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) From 0d642d3a92f9ef1f614714a1e989fb66dcc623fa Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 7 Apr 2022 22:00:28 +0800 Subject: [PATCH 017/211] add norm, segment_pool (#41465) --- .../fluid/tests/unittests/test_bincount_op.py | 5 ++++- .../fluid/tests/unittests/test_norm_op.py | 1 + .../fluid/tests/unittests/test_segment_ops.py | 18 
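The dist, erfinv and gumbel_softmax hookups above are meant to be behaviour-preserving as well; a short eager-mode check of the three entry points (expected values follow the docstrings shown in the diff):

import paddle

x = paddle.to_tensor([[3.0, 3.0], [3.0, 3.0]])
y = paddle.to_tensor([[3.0, 3.0], [3.0, 1.0]])
print(paddle.dist(x, y, p=2))                             # 2.0
print(paddle.erfinv(paddle.to_tensor([0.0, 0.5, -1.0])))  # [0., 0.4769, -inf]
one_hot = paddle.nn.functional.gumbel_softmax(paddle.rand([4, 6]), hard=True)
print(float(one_hot.sum()))                               # 4.0, one hot index per row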
++++++++++++++++-- python/paddle/incubate/tensor/math.py | 8 ++++---- python/paddle/tensor/linalg.py | 7 ++----- python/paddle/utils/code_gen/api.yaml | 11 +++++++++++ python/paddle/utils/code_gen/backward.yaml | 11 +++++++++++ 7 files changed, 49 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_bincount_op.py b/python/paddle/fluid/tests/unittests/test_bincount_op.py index 851bf7b01125a..17b04b954afe8 100644 --- a/python/paddle/fluid/tests/unittests/test_bincount_op.py +++ b/python/paddle/fluid/tests/unittests/test_bincount_op.py @@ -126,6 +126,7 @@ class TestBincountOp(OpTest): # without weights def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input} self.attrs = {"minlength": self.minlength} @@ -137,13 +138,14 @@ def init_test_case(self): self.Out = np.bincount(self.np_input, minlength=self.minlength) def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) class TestCase1(TestBincountOp): # with weights(FLOAT32) def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input, "Weights": self.np_weights} self.attrs = {"minlength": self.minlength} @@ -163,6 +165,7 @@ class TestCase2(TestBincountOp): # with weights(other) def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input, "Weights": self.np_weights} self.attrs = {"minlength": self.minlength} diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 626de9b12b9c1..49e1f2533491d 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -32,6 +32,7 @@ def l2_norm(x, axis, epsilon): class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" + self.python_api = paddle.fluid.layers.l2_normalize self.init_test_case() self.init_dtype() x = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py index e2aadbedbd07f..90d597837a8e1 100644 --- a/python/paddle/fluid/tests/unittests/test_segment_ops.py +++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py @@ -73,6 +73,17 @@ def compute_segment_min_max(x, segment_ids, pooltype="MAX"): return results, gradient / results.size +def segment_pool_split(X, SegmentIds, pooltype): + if pooltype == "SUM": + return paddle.incubate.tensor.segment_sum(X, SegmentIds) + elif pooltype == "MEAN": + return paddle.incubate.tensor.segment_mean(X, SegmentIds) + elif pooltype == "MIN": + return paddle.incubate.tensor.segment_min(X, SegmentIds) + elif pooltype == "MAX": + return paddle.incubate.tensor.segment_max(X, SegmentIds) + + class TestSegmentOps(OpTest): def set_data(self): x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) @@ -90,6 +101,8 @@ def compute(self, x, segment_ids): def prepare(self): self.op_type = "segment_pool" + self.python_api = segment_pool_split + self.python_out_sig = ["Out"] self.dtype = np.float64 self.shape = [30, 15] self.attrs = {"pooltype": "SUM"} @@ -105,10 +118,10 @@ def setUp(self): self.outputs = {'Out': result.astype(self.dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", 
check_eager=True) class TestSegmentSum2(TestSegmentOps): @@ -259,4 +272,5 @@ def test_dygraph(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index b36aaef9acf36..da6eb4e17c7fb 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -52,7 +52,7 @@ def segment_sum(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "SUM")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "SUM")[0] if _in_legacy_dygraph(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out @@ -109,7 +109,7 @@ def segment_mean(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "MEAN")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "MEAN")[0] if _non_static_mode(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out @@ -165,7 +165,7 @@ def segment_min(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "MIN")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "MIN")[0] if _non_static_mode(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") @@ -222,7 +222,7 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - out, tmp = _C_ops.final_state_segment_pool(data, segment_ids, "MAX")[0] + out = _C_ops.final_state_segment_pool(data, segment_ids, "MAX")[0] return out if _non_static_mode(): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index e29513beb166e..38616026f128a 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -17,7 +17,7 @@ from ..framework import _varbase_creator, _dygraph_tracer from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ..fluid.layers import transpose, cast # noqa: F401 from ..fluid import layers import paddle @@ -1487,10 +1487,7 @@ def bincount(x, weights=None, minlength=0, name=None): if x.dtype not in [paddle.int32, paddle.int64]: raise TypeError("Elements in Input(x) should all be integers") - # if in_dygraph_mode(): - # return _C_ops.final_state_bincount(x, weights, minlength) - - if _in_legacy_dygraph(): + if _non_static_mode(): return _C_ops.bincount(x, weights, "minlength", minlength) helper = LayerHelper('bincount', **locals()) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 90e08c68cf411..ca53766eb9c64 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1363,6 +1363,16 @@ optional : weight backward : nll_loss_grad +- api : norm + args : (Tensor x, int axis, float epsilon, bool is_test) + output : Tensor(out), Tensor(norm) + infer_meta : + func : NormInferMeta + kernel : + func : norm + intermediate : norm + backward : norm_grad + - api : not_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor @@ -1669,6 +1679,7 @@ func : SegmentPoolInferMeta kernel : func : segment_pool + data_type : x backward : segment_pool_grad # selu diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 
1e41a0e79491c..3640470503480 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -980,6 +980,16 @@ data_type : input optional : weight +- backward_api : norm_grad + forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) + args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : norm_grad + - backward_api : p_norm_grad forward : p_norm(Tensor x, float porder, int axis, float epsilon, bool keepdim, bool asvector=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float porder, int axis, float epsilon, bool keepdim, bool asvector) @@ -1211,6 +1221,7 @@ param : [x] kernel : func : segment_pool_grad + optional : summed_ids - backward_api : selu_grad forward : selu (Tensor x, float scale, float alpha) -> Tensor(out) From 9714878cc76b6db1e1fdec2a81dabc4874f25ea6 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Thu, 7 Apr 2022 22:15:52 +0800 Subject: [PATCH 018/211] remove FLAGS_use_curand and change all random op CUDA implementation (#41308) --- paddle/fluid/operators/dropout_impl.cu.h | 151 ++++--------- paddle/fluid/operators/gaussian_random_op.cu | 7 - paddle/fluid/operators/uniform_random_op.h | 54 +---- paddle/fluid/platform/flags.cc | 2 - paddle/phi/kernels/cpu/transpose_kernel.cc | 1 + paddle/phi/kernels/gpu/bernoulli_kernel.cu | 59 +---- .../phi/kernels/gpu/gaussian_random_kernel.cu | 25 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 213 +++++------------- paddle/phi/kernels/gpu/randint_kernel.cu | 36 +-- paddle/phi/kernels/gpu/randperm_kernel.cu | 144 +++++------- .../phi/kernels/gpu/uniform_random_kernel.cu | 61 +---- paddle/scripts/paddle_build.bat | 1 - paddle/scripts/paddle_build.sh | 2 - python/paddle/fluid/initializer.py | 16 +- .../tests/unittests/test_bernoulli_op.py | 3 - .../fluid/tests/unittests/test_dropout_op.py | 3 - .../tests/unittests/test_exponential_op.py | 3 - .../unittests/test_gaussian_random_op.py | 3 - .../fluid/tests/unittests/test_linear.py | 16 ++ .../tests/unittests/test_multinomial_op.py | 3 - .../fluid/tests/unittests/test_poisson_op.py | 3 - .../fluid/tests/unittests/test_randint_op.py | 3 - .../fluid/tests/unittests/test_randperm_op.py | 3 - .../tests/unittests/test_uniform_random_op.py | 45 ++-- python/paddle/nn/utils/__init__.py | 2 +- .../paddle/nn/utils/transform_parameters.py | 33 +++ 26 files changed, 267 insertions(+), 625 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 83ca9ace20d05..6af8c925ff580 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -38,43 +38,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/functors.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct DstMaskGenerator { - const float dropout_prob_; - const bool is_upscale_in_train_; - using MT = typename details::MPTypeTrait::Type; - MT factor; - HOSTDEVICE inline DstMaskGenerator(const float dropout_prob, - const bool is_upscale_in_train) - : dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) { - factor = static_cast(1.0f / (1.0f - dropout_prob_)); - } - - HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val, - const T2* rand, int num) const { - static constexpr int kCount = - phi::funcs::uniform_distribution::kReturnsCount; -// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask -#pragma unroll - for (int i = 0; i < kCount; i++) { - if (rand[i] < dropout_prob_) { - dst[i] = static_cast(0); - dst[i + kCount] = dst[i]; - } else { - dst[i] = is_upscale_in_train_ - ? static_cast(static_cast(src_val[i]) * factor) - : static_cast(src_val[i]); - dst[i + kCount] = static_cast(1); - } - } - } -}; - template struct DstMaskFunctor { const float retain_prob_; @@ -113,7 +79,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, const T* src, MaskType* mask, T* dst, bool is_upscale_in_train, uint64_t increment, - size_t main_offset, bool use_curand) { + size_t main_offset) { size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = phi::funcs::uniform_distribution::kReturnsCount; @@ -135,76 +101,41 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, int deal_size = BLOCK_NUM_X * kCount; size_t fix = idx * kCount; - if (use_curand) { - auto dst_functor = - DstMaskFunctor(1.0f - dropout_prob, is_upscale_in_train); - for (; fix < main_offset; fix += stride) { - kps::ReadData(&dst_mask[0], src + fix, deal_size); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], - deal_size); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - deal_size); - if (fix > idx * kCount + 1) { - __syncthreads(); - } - } - int remainder = n - fix; - if (remainder > 0) { - kps::ReadData(&dst_mask[0], src + fix, remainder); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], remainder); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - remainder); + + auto dst_functor = + DstMaskFunctor(1.0f - dropout_prob, is_upscale_in_train); + for (; fix < main_offset; fix += stride) { + kps::ReadData(&dst_mask[0], src + fix, deal_size); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], deal_size); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + deal_size); + if (fix > idx * kCount + 1) { __syncthreads(); } - } else { - auto dst_functor = - DstMaskGenerator(dropout_prob, is_upscale_in_train); - for (; fix < main_offset; fix += stride) { - kps::ReadData(&dst_mask[0], src + fix, deal_size); - 
kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], - deal_size); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - deal_size); - } - int remainder = n - fix; - if (remainder > 0) { - kps::ReadData(&dst_mask[0], src + fix, remainder); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], remainder); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - remainder); - } + } + int remainder = n - fix; + if (remainder > 0) { + kps::ReadData(&dst_mask[0], src + fix, remainder); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], remainder); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + remainder); + __syncthreads(); } } @@ -251,13 +182,11 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t grid_size = gpu_config.GetGridSize(); size_t block_size = gpu_config.GetBlockSize(); - if (FLAGS_use_curand) { - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - const auto& prop = platform::GetDeviceProperties(device_id); - size_t max_grid_size = prop.maxThreadsPerMultiProcessor * - prop.multiProcessorCount / block_size; - grid_size = std::min(grid_size, max_grid_size); - } + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); + const auto& prop = platform::GetDeviceProperties(device_id); + size_t max_grid_size = prop.maxThreadsPerMultiProcessor * + prop.multiProcessorCount / block_size; + grid_size = std::min(grid_size, max_grid_size); auto offset = ((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize; @@ -268,7 +197,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, VectorizedRandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment, main_offset, FLAGS_use_curand); + upscale_in_train, increment, main_offset); } else { if (upscale_in_train) { // todo: can y share with data with x directly? diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 00ce10bfe3bcc..552649279e911 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -11,21 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fill_constant_op.h" - -#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index b941dc21c3ab2..ae846f4cae6fb 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,11 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #if defined(__NVCC__) || defined(__HIPCC__) -DECLARE_bool(use_curand); -#include -#include #include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -146,39 +142,6 @@ struct UniformGenerator { } }; -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - template void UniformRandom(const framework::ExecutionContext& context, framework::Tensor* tensor) { @@ -205,19 +168,10 @@ void UniformRandom(const framework::ExecutionContext& context, int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - phi::funcs::uniform_distribution dist; - phi::funcs::uniform_real_transform trans(min, max); - phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset); - phi::IndexKernel>(dev_cxt, tensor, func); - } + using MT = typename details::MPTypeTrait::Type; + phi::funcs::uniform_distribution dist; + phi::funcs::uniform_real_transform trans(min, max); + phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4e47c130c7252..c3d3f6a4f6893 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -545,8 +545,6 @@ PADDLE_DEFINE_EXPORTED_double( */ PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); -PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND"); - /** * Debug related FLAG * Name: FLAGS_call_stack_level diff --git 
a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index a80196e7f80e1..5dc4866e1efc3 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -75,6 +75,7 @@ PD_REGISTER_KERNEL(transpose, double, int32_t, int64_t, + phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 79d8a7b0f3444..edcf29e2d88d3 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -14,8 +14,6 @@ #include "paddle/phi/kernels/bernoulli_kernel.h" -#include -#include #ifdef __NVCC__ #include #endif @@ -32,35 +30,8 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/transform.h" - -DECLARE_bool(use_curand); - namespace phi { -template -struct BernoulliCudaFunctor { - unsigned int seed_; - unsigned int offset_; - __host__ __device__ BernoulliCudaFunctor(unsigned int seed, - unsigned int offset) - : seed_(seed), offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n, const T p) const { - // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several - // lines of error messages if, and it should be refined. - PADDLE_ENFORCE(p >= 0.0 && p <= 1.0, - "The probability should be >=0 and <= 1, but got %f", - p); - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n + offset_); - return static_cast(dist(rng) < p); - } -}; - // 'curand_uniform4/hiprand_uniform4' generate 4 random number each time template __global__ void bernoulli_cuda_kernel( @@ -100,30 +71,16 @@ void BernoulliKernel(const Context& ctx, auto gen_cuda = ctx.GetGenerator(); - if (FLAGS_use_curand) { - auto seed_offset = gen_cuda->IncrementOffset(12); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; + auto seed_offset = gen_cuda->IncrementOffset(12); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; - auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4); - size_t grid_size = gpu_config.GetGridSize(); - size_t block_size = gpu_config.GetBlockSize(); + auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4); + size_t grid_size = gpu_config.GetGridSize(); + size_t block_size = gpu_config.GetBlockSize(); - bernoulli_cuda_kernel<<>>( - numel, seed, offset, x_data, out_data); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = numel * seed_offset.second; - paddle::platform::Transform trans; - thrust::counting_iterator index_sequence_begin(0); - trans(ctx, - index_sequence_begin, - index_sequence_begin + numel, - x_data, - out_data, - BernoulliCudaFunctor(static_cast(seed_offset.first), - static_cast(gen_offset))); - } + bernoulli_cuda_kernel<<>>( + numel, seed, offset, x_data, out_data); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index e159e5916cff2..96ebc0353ef24 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -14,10 +14,7 @@ #include "paddle/phi/kernels/gaussian_random_kernel.h" -#include -#include #include -#include #include "paddle/phi/backends/gpu/gpu_context.h" #include 
"paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" @@ -27,8 +24,6 @@ #include "paddle/fluid/framework/generator.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -83,21 +78,11 @@ void GaussianRandomKernel(const Context& dev_ctx, auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename phi::dtype::MPTypeTrait::Type; - funcs::normal_distribution dist; - funcs::normal_transform trans(static_cast(mean), - static_cast(std)); - funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = GaussianGenerator(static_cast(mean), - static_cast(std), - seed_offset.first, - gen_offset); - IndexKernel>(dev_ctx, tensor, func); - } + using MT = typename phi::dtype::MPTypeTrait::Type; + funcs::normal_distribution dist; + funcs::normal_transform trans(static_cast(mean), + static_cast(std)); + funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); } else { auto func = GaussianGenerator(static_cast(mean), static_cast(std), seed); diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index ee5f843b18a90..ef6cd1323a9df 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -18,11 +18,6 @@ limitations under the License. */ #include "paddle/phi/kernels/multinomial_kernel.h" -#include -#include -#include -#include - #ifdef __NVCC__ #include "cub/cub.cuh" #endif @@ -44,12 +39,6 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/multinomial_functor.h" #include "paddle/phi/kernels/top_k_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/transform.h" - -DECLARE_bool(use_curand); - namespace phi { template @@ -74,32 +63,6 @@ __global__ void NormalizeProbability(T* norm_probs, } } -template -__global__ void GetCumulativeProbs(T* norm_probs_data, - int64_t num_distributions, - int64_t num_categories, - T* cumulative_probs_data) { - int id = blockIdx.x; - thrust::inclusive_scan(thrust::device, - norm_probs_data + id * num_categories, - norm_probs_data + (id + 1) * num_categories, - cumulative_probs_data + id * num_categories); -} - -template -struct RandomGeneratorCudaFunctor { - unsigned int seed_; - __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - template __device__ int binarySearchFunctor(T* cumulative_probs_data, T* norm_probs_data, @@ -130,7 +93,6 @@ __device__ int binarySearchFunctor(T* cumulative_probs_data, template __global__ void sampleMultinomialWithReplacement( - T* rng_data, const int64_t num_samples, int64_t* out_data, const int64_t num_distributions, @@ -138,10 +100,9 @@ __global__ void sampleMultinomialWithReplacement( T* cumulative_probs_data, T* norm_probs_data, uint64_t seed, - uint64_t offset, - bool use_curand) { + uint64_t offset) { // use binary search to get the selected category sample id. - // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id]. + // let cumulative_probs_data[id-1] < rng_number < cumulative_probs_data[id]. 
size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x + threadIdx.x; @@ -151,10 +112,7 @@ __global__ void sampleMultinomialWithReplacement( int sample = blockIdx.x * blockDim.x + threadIdx.x; for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) { if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; - if (use_curand) { - rng_number = static_cast(curand_uniform4(&state).x); - } + T rng_number = static_cast(curand_uniform4(&state).x); // Find the bucket that a uniform random number lies in int selected_category = binarySearchFunctor(cumulative_probs_data + dist * num_categories, @@ -182,10 +140,7 @@ void MultinomialKernel(const Context& dev_ctx, const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; // If replacement is False, it's not a replaceable sample. Every category - // can - // be used only once. So after every sample, probability of the distribution - // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``funcs::MultinomialFunctor`` to sample the distribution. + // can be used only once. if (!replacement) { int64_t in_data_numel = x.numel(); int64_t out_data_numel = out->numel(); @@ -202,76 +157,50 @@ void MultinomialKernel(const Context& dev_ctx, in_data_numel * sizeof(T), cudaMemcpyDeviceToHost); #endif - if (FLAGS_use_curand) { - for (size_t i = 0; i < num_distributions; ++i) { - int zero_num = 0; - for (size_t j = 0; j < num_categories; ++j) { - T weight = cpu_in_data[i * num_distributions + j]; - PADDLE_ENFORCE_GE( - weight, - 0, - errors::InvalidArgument( - "Each element of multinomial'input must >= 0, but got %f.", - weight)); - if (weight == static_cast(0)) { - zero_num++; - } + for (size_t i = 0; i < num_distributions; ++i) { + int zero_num = 0; + for (size_t j = 0; j < num_categories; ++j) { + T weight = cpu_in_data[i * num_distributions + j]; + PADDLE_ENFORCE_GE( + weight, + 0, + errors::InvalidArgument( + "Each element of multinomial'input must >= 0, but got %f.", + weight)); + if (weight == static_cast(0)) { + zero_num++; } - int valid_samples = num_categories - zero_num; - PADDLE_ENFORCE_LE( - num_samples, - valid_samples, - errors::InvalidArgument("When replacement=False, 'num_samples' " - "must less than or eaqual to the number of " - "positive item of input")); } - - // Refer to [gumbel softmax algorithm] - DenseTensor rand = EmptyLike(dev_ctx, x); - T* rand_data = rand.data(); - funcs::uniform_distribution dist; - funcs::exponential_transform trans(1.0); - funcs::distribution_and_transform(dev_ctx, &rand, dist, trans); - - funcs::ForRange for_range(dev_ctx, x.numel()); - for_range([rand_data, in_data] __device__(size_t idx) { - rand_data[idx] = in_data[idx] / rand_data[idx]; - }); - - if (num_samples == 1) { - ArgMaxKernel( - dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); - } else { - std::vector out_dim_vec = vectorize(out->dims()); - DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); - TopkKernel( - dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); - } - return; + int valid_samples = num_categories - zero_num; + PADDLE_ENFORCE_LE( + num_samples, + valid_samples, + errors::InvalidArgument("When replacement=False, 'num_samples' " + "must less than or eaqual to the number of " + "positive item of input")); } - funcs::MultinomialFunctor(dev_ctx, - cpu_out_data, - cpu_in_data, - num_samples, - replacement, - num_categories, - num_distributions); - -#ifdef PADDLE_WITH_HIP - hipMemcpy(out_data, - 
cpu_out_data, - out_data_numel * sizeof(int64_t), - hipMemcpyHostToDevice); -#else - cudaMemcpy(out_data, - cpu_out_data, - out_data_numel * sizeof(int64_t), - cudaMemcpyHostToDevice); -#endif - - delete[] cpu_in_data; - delete[] cpu_out_data; + // Refer to [gumbel softmax algorithm] + DenseTensor rand = EmptyLike(dev_ctx, x); + T* rand_data = rand.data(); + funcs::uniform_distribution dist; + funcs::exponential_transform trans(1.0); + funcs::distribution_and_transform(dev_ctx, &rand, dist, trans); + + funcs::ForRange for_range(dev_ctx, x.numel()); + for_range([rand_data, in_data] __device__(size_t idx) { + rand_data[idx] = in_data[idx] / rand_data[idx]; + }); + + if (num_samples == 1) { + ArgMaxKernel( + dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); + } else { + std::vector out_dim_vec = vectorize(out->dims()); + DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); + TopkKernel( + dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); + } return; } @@ -322,44 +251,18 @@ void MultinomialKernel(const Context& dev_ctx, auto* cumulative_probs_data = dev_ctx.template Alloc(&cumulative_probs_tensor); - if (FLAGS_use_curand) { - // 'phi::funcs::InclusiveScan' has higher accuracy than - // 'thrust::inclusive_scan' - funcs::InclusiveScan>( - /*in*/ norm_probs_data, - /*out*/ cumulative_probs_data, - /*outer_dim*/ static_cast(num_distributions), - /*mid_dim*/ static_cast(num_categories), - /*inner_dim*/ static_cast(1), - /*init*/ static_cast(0), - std::plus(), - /*reverse=*/false, - dev_ctx); - } else { - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, - num_distributions, - num_categories, - cumulative_probs_data); - } - - // Generate random number for each sample. - std::random_device rd; - auto seed = rd(); - - DenseTensor rng_data_tensor; - rng_data_tensor.Resize({num_distributions, num_samples}); - auto* rng_data = dev_ctx.template Alloc(&rng_data_tensor); - - thrust::counting_iterator index_sequence_begin(0); - paddle::platform::Transform trans; - trans(dev_ctx, - index_sequence_begin, - index_sequence_begin + num_distributions * num_samples, - rng_data, - RandomGeneratorCudaFunctor(seed)); + // 'phi::funcs::InclusiveScan' has higher accuracy than + // 'thrust::inclusive_scan' + funcs::InclusiveScan>( + /*in*/ norm_probs_data, + /*out*/ cumulative_probs_data, + /*outer_dim*/ static_cast(num_distributions), + /*mid_dim*/ static_cast(num_categories), + /*inner_dim*/ static_cast(1), + /*init*/ static_cast(0), + std::plus(), + /*reverse=*/false, + dev_ctx); // Sample the multinomial distributions. dim3 block(128); @@ -376,7 +279,6 @@ void MultinomialKernel(const Context& dev_ctx, auto seed_offset = gen_cuda->IncrementOffset(increment); sampleMultinomialWithReplacement<<>>( - rng_data, num_samples, out_data, num_distributions, @@ -384,8 +286,7 @@ void MultinomialKernel(const Context& dev_ctx, cumulative_probs_data, norm_probs_data, seed_offset.first, - seed_offset.second, - FLAGS_use_curand); + seed_offset.second); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index 0188505002268..90eaea6a0868c 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -23,8 +23,6 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -37,37 +35,9 @@ void RandintRawKernel(const Context& dev_ctx, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); - if (FLAGS_use_curand) { - funcs::uniform_distribution dist; - funcs::uniform_int_transform trans(low, high); - funcs::distribution_and_transform(dev_ctx, out, dist, trans); - } else { - DenseTensor tmp; - tmp.Resize(phi::make_ddim(shape.GetData())); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); - } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } - - std::uniform_int_distribution dist(low, high - 1); - auto numel = out->numel(); - for (int64_t i = 0; i < numel; ++i) { - tmp_data[i] = dist(*engine); - } - - paddle::memory::Copy( - out->place(), - data, - tmp.place(), - tmp_data, - numel * paddle::experimental::SizeOf(out->dtype()), - 0); - } + funcs::uniform_distribution dist; + funcs::uniform_int_transform trans(low, high); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } template diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 678b580beca2f..4e488ed470df9 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -84,91 +84,65 @@ __global__ void SwapRepeatKernel( template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - if (FLAGS_use_curand) { - DenseTensor key; - RandintKernel(dev_ctx, - std::numeric_limits::min(), - std::numeric_limits::max(), - IntArray({n}), - phi::DataType::INT32, - &key); - DenseTensor key_out = Empty(dev_ctx, IntArray({n})); - - DenseTensor range = Empty(dev_ctx, IntArray({n})); - T* range_data = range.data(); - funcs::ForRange for_range(dev_ctx, n); - for_range([range_data] __device__(size_t idx) { - range_data[idx] = static_cast(idx); - }); - - out->Resize(phi::make_ddim({n})); - T* out_data = dev_ctx.template Alloc(out); - - // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to - // improve performance of radix sort. - double n_d = static_cast(n); - int begin_bit = 0; - int end_bit = - std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); - - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, - temp_storage_bytes, - key.data(), - key_out.data(), - range.data(), - out_data, - n, - begin_bit, - end_bit < 32 ? end_bit : 32, - dev_ctx.stream()); - - auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); - cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), - temp_storage_bytes, - key.data(), - key_out.data(), - range.data(), - out_data, - n, - begin_bit, - end_bit < 32 ? 
end_bit : 32, - dev_ctx.stream()); - - auto gen_cuda = dev_ctx.GetGenerator(); - auto seed_offset = gen_cuda->IncrementOffset(n); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); - SwapRepeatKernel<<>>( - key_out.data(), out_data, n, seed, offset); - } else { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); - } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } - - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); - - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); - } + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + IntArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, IntArray({n})); + + DenseTensor range = Empty(dev_ctx, IntArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? 
end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed_offset.first, seed_offset.second); } template diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index 2cabde0bbf942..a4aea10cfe762 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -14,14 +14,13 @@ #include "paddle/phi/kernels/uniform_random_kernel.h" +#include #include "gflags/gflags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -54,43 +53,6 @@ struct UniformGenerator { } }; -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, - T max, - int seed, - int diag_num, - int diag_step, - T diag_val, - int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - template void UniformRandomRawKernel(const Context& dev_ctx, const IntArray& shape, @@ -114,23 +76,10 @@ void UniformRandomRawKernel(const Context& dev_ctx, auto generator = dev_ctx.GetGenerator(); if (generator->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename kps::details::MPTypeTrait::Type; - funcs::uniform_distribution dist; - funcs::uniform_real_transform trans(min, max); - funcs::distribution_and_transform(dev_ctx, out, dist, trans); - } else { - auto seed_offset = generator->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = UniformGeneratorOffset(min, - max, - seed_offset.first, - diag_num, - diag_step, - diag_val, - gen_offset); - IndexKernel>(dev_ctx, out, func); - } + using MT = typename kps::details::MPTypeTrait::Type; + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(min, max); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index cc55ea82df608..21df60e972121 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -657,7 +657,6 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% set FLAGS_call_stack_level=2 -set FLAGS_use_curand=True dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d1220e4537582..e8bde467e085d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -61,8 +61,6 @@ function init() { # NOTE(chenweihang): For easy 
debugging, CI displays the C++ error stacktrace by default export FLAGS_call_stack_level=2 - export FLAGS_use_curand=True - # set CI_SKIP_CPP_TEST if only *.py changed # In order to avoid using in some CI(such as daily performance), the current # branch must not be `${BRANCH}` which is usually develop. diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 37eff6d132d03..b3baedc401504 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -561,12 +561,12 @@ def __call__(self, var, block=None): if framework._non_static_mode(): if self._uniform: - limit = np.sqrt(6.0 / float(fan_in + fan_out)) + limit = math.sqrt(6.0 / float(fan_in + fan_out)) out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', -limit, 'max', limit, 'seed', self._seed, 'dtype', out_dtype) else: - std = np.sqrt(2.0 / float(fan_in + fan_out)) + std = math.sqrt(2.0 / float(fan_in + fan_out)) out_var = _C_ops.gaussian_random( 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, 'std', std, 'seed', self._seed) @@ -581,7 +581,7 @@ def __call__(self, var, block=None): return None else: if self._uniform: - limit = np.sqrt(6.0 / float(fan_in + fan_out)) + limit = math.sqrt(6.0 / float(fan_in + fan_out)) op = block.append_op( type="uniform_random", inputs={}, @@ -595,7 +595,7 @@ def __call__(self, var, block=None): }, stop_gradient=True) else: - std = np.sqrt(2.0 / float(fan_in + fan_out)) + std = math.sqrt(2.0 / float(fan_in + fan_out)) op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, @@ -713,13 +713,13 @@ def __call__(self, var, block=None): if framework._non_static_mode(): if self._uniform: - limit = np.sqrt(6.0 / float(fan_in)) + limit = math.sqrt(6.0 / float(fan_in)) out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', -limit, 'max', limit, 'seed', self._seed, 'dtype', int(out_dtype)) else: - std = np.sqrt(2.0 / float(fan_in)) + std = math.sqrt(2.0 / float(fan_in)) out_var = _C_ops.gaussian_random( 'shape', out_var.shape, 'dtype', int(out_dtype), 'mean', 0.0, 'std', std, 'seed', self._seed) @@ -734,7 +734,7 @@ def __call__(self, var, block=None): return None else: if self._uniform: - limit = np.sqrt(6.0 / float(fan_in)) + limit = math.sqrt(6.0 / float(fan_in)) op = block.append_op( type="uniform_random", inputs={}, @@ -749,7 +749,7 @@ def __call__(self, var, block=None): stop_gradient=True) else: - std = np.sqrt(2.0 / float(fan_in)) + std = math.sqrt(2.0 / float(fan_in)) op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 426d5d463f453..fc4ee13384b2d 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -75,9 +75,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index d8a4eb8f45f7d..3aca428ac77af 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -1034,9 +1034,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if 
os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 7a3ae203be62d..c8f4101ea5d6b 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -100,9 +100,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 738441a46d377..4fca8b9f2a118 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -342,9 +342,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - def _check_random_value(dtype, expect, expect_mean, expect_std): x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) actual = x.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 9d07a80da15db..6b00a86e3e900 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -73,6 +73,22 @@ def test_error(self, place=paddle.CPUPlace()): np.testing.assert_array_almost_equal(res_f, res_nn) np.testing.assert_array_almost_equal(res_nn, res_np) + def test_weight_init(self): + if not paddle.is_compiled_with_cuda(): + return + paddle.seed(100) + linear = paddle.nn.Linear( + 2, 3, weight_attr=paddle.nn.initializer.Normal(0, 1.)) + paddle.nn.utils._stride_column(linear.weight) + expect = [[1.4349908, -0.8099171, -2.64788], + [-1.4981681, -1.1784115, -0.023253186]] + self.assertTrue(np.allclose(linear.weight.numpy(), expect)) + + linear = paddle.nn.Linear(2, 3) + expect = [[0.73261100, 0.43836895, 0.07908206], + [0.85075015, -1.04724526, 0.64371765]] + self.assertTrue(np.allclose(linear.weight.numpy(), expect)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index a65a1c7e14c2b..ecde527523d3d 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -227,9 +227,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index 2123d4e0e7e35..f8183bb5f8db2 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -107,9 +107,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - 
return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 1eb99e08bb8e1..361f4d280f70f 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -198,9 +198,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 5c9ab36fa34bc..deb0a9a082140 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -155,9 +155,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 41b6ed36d65cc..683cc2fdf867e 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -573,37 +573,46 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - - def _check_random_value(dtype, expect, expect_mean, expect_std): - x = paddle.rand([32, 3, 1024, 1024], dtype=dtype) - actual = x.numpy() - self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) - self.assertEqual(np.mean(actual), expect_mean) - self.assertEqual(np.std(actual), expect_std) - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() + paddle.set_device('gpu') paddle.seed(2021) + + expect_mean = 0.50000454338820143895816272561205551028251647949218750 + expect_std = 0.28867379167297479991560749112977646291255950927734375 expect = [ 0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608, 0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187 ] - expect_mean = 0.50000454338820143895816272561205551028251647949218750 - expect_std = 0.28867379167297479991560749112977646291255950927734375 - _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, - expect_std) + out = paddle.rand([32, 3, 1024, 1024], dtype='float64').numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[2, 1, 512, 1000:1010], expect)) + expect_mean = 0.50002604722976684570312500 + expect_std = 0.2886914908885955810546875 expect = [ 0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257, 0.46352342, 0.97228295, 0.12771158, 0.286525, 0.9810645 ] - expect_mean = 0.50002604722976684570312500 - expect_std = 0.2886914908885955810546875 - _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, - expect_std) + out = paddle.rand([32, 3, 1024, 1024], dtype='float32').numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[2, 1, 512, 1000:1010], expect)) + + expect_mean = 25.11843109130859375 + 
expect_std = 43.370647430419921875 + expect = [ + 30.089634, 77.05225, 3.1201615, 68.34072, 59.266724, -25.33281, + 12.973292, 27.41127, -17.412298, 27.931019 + ] + out = paddle.empty( + [16, 16, 16, 16], dtype='float32').uniform_(-50, 100).numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[10, 10, 10, 0:10], expect)) + paddle.enable_static() diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index 8f9b55d15cad0..8ec4e8cfd60b5 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -14,7 +14,7 @@ from .spectral_norm_hook import spectral_norm from .weight_norm_hook import weight_norm, remove_weight_norm # noqa: F401 -from .transform_parameters import parameters_to_vector, vector_to_parameters # noqa: F401 +from .transform_parameters import parameters_to_vector, vector_to_parameters, _stride_column # noqa: F401 __all__ = [ #noqa 'weight_norm', 'remove_weight_norm', 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters' diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py index 99870ce29a138..feb70e02d5988 100644 --- a/python/paddle/nn/utils/transform_parameters.py +++ b/python/paddle/nn/utils/transform_parameters.py @@ -36,6 +36,39 @@ def _inplace_reshape_dygraph(x, shape): stop_gradient=True) +@dygraph_only +def _stride_column(param): + """ + A tool function. Permute date of parameter as a 'columns' stride. Now, it only support 2-D parameter. + + Args: + param(Tensor]): The param that will be strided according to 'columns'. + + Examples: + .. code-block:: python + + import paddle + paddle.seed(100) + + linear = paddle.nn.Linear(2, 3) + print(linear.weight) + # [[-0.31485492, -1.02896988, 0.45741916], + # [-0.65525872, -1.04643178, 1.07262802]] + + paddle.nn.utils.stride_column(linear.weight) + print(linear.weight) + # [[-0.31485492, 0.45741916, -1.04643178], + # [-1.02896988, -0.65525872, 1.07262802]] + + """ + assert len(param.shape) == 2 + shape = [param.shape[1], param.shape[0]] + with paddle.fluid.dygraph.no_grad(): + reshape_var = paddle.reshape(param, shape) + transpose_var = paddle.transpose(reshape_var, [1, 0]) + transpose_var._share_underline_tensor_to(param) + + @dygraph_only def parameters_to_vector(parameters, name=None): """ From b3bcebbeb1debeae72be94907b45ff8c8df5101d Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Thu, 7 Apr 2022 23:37:07 +0800 Subject: [PATCH 019/211] [GPUPS] bind afs wrpper (#41227) * afs wrapper * format * format * macro --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 37 +++++++++++++++++++ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 21 +++++++++++ paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 21 +++++++++++ paddle/fluid/pybind/ps_gpu_wrapper_py.h | 3 ++ paddle/fluid/pybind/pybind.cc | 3 ++ 5 files changed, 85 insertions(+) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 75f5c24af5a99..c7852de00a18e 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -37,6 +37,43 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB +void AfsWrapper::init(const std::string& fs_name, const std::string& fs_user, + const std::string& pass_wd, const std::string& conf) { + int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), + conf.c_str()); + if (ret != 0) { + LOG(ERROR) << "AFS Init Error"; + } +} + +int AfsWrapper::remove(const std::string& path) { + return afs_handler_.remove(path); +} + +int AfsWrapper::mkdir(const std::string& path) { + return afs_handler_.mkdir(path); +} + +std::vector AfsWrapper::list(const std::string& path) { + return afs_handler_.list(path); +} + +int AfsWrapper::exist(const std::string& path) { + return afs_handler_.exist(path); +} + +int AfsWrapper::upload(const std::string& local_file, + const std::string& afs_file) { + return afs_handler_.upload_file(local_file, afs_file); +} + +int AfsWrapper::download(const std::string& local_file, + const std::string& afs_file) { + return afs_handler_.download_file(local_file, afs_file); +} +#endif + std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; #ifdef PADDLE_WITH_PSLIB diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index d9d29cc072dd7..9b7d6de082d1c 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -55,6 +55,27 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +#ifdef PADDLE_WITH_PSLIB +class AfsWrapper { + public: + AfsWrapper() {} + virtual ~AfsWrapper() {} + void init(const std::string& fs_name, const std::string& fs_user, + const std::string& pass_wd, const std::string& conf); + int remove(const std::string& path); + int mkdir(const std::string& path); + std::vector list(const std::string& path); + + int exist(const std::string& path); + int upload(const std::string& local_file, const std::string& afs_file); + + int download(const std::string& local_file, const std::string& afs_file); + + private: + paddle::ps::AfsApiWrapper afs_handler_; +}; +#endif + class PSGPUWrapper { public: virtual ~PSGPUWrapper() { delete HeterPs_; } diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index fe1f27226bad4..79529fca7d1be 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -63,6 +63,27 @@ void BindPSGPUWrapper(py::module* m) { .def("finalize", &framework::PSGPUWrapper::Finalize, py::call_guard()); } // end PSGPUWrapper +#ifdef PADDLE_WITH_PSLIB +void BindAfsWrapper(py::module* m) { + py::class_>( + *m, "AfsWrapper") + .def(py::init([]() { return std::make_shared(); })) + .def("init", &framework::AfsWrapper::init, + py::call_guard()) + .def("list", &framework::AfsWrapper::list, + py::call_guard()) + .def("mkdir", &framework::AfsWrapper::mkdir, + py::call_guard()) + .def("exist", &framework::AfsWrapper::exist, + py::call_guard()) + .def("download", &framework::AfsWrapper::download, + py::call_guard()) + .def("upload", &framework::AfsWrapper::upload, + py::call_guard()) + .def("remove", &framework::AfsWrapper::remove, + py::call_guard()); +} +#endif #endif } // end namespace pybind } // end namespace paddle diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.h b/paddle/fluid/pybind/ps_gpu_wrapper_py.h index ba4f146389ed3..22cd5ef0fd149 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.h +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.h 
@@ -24,6 +24,9 @@ namespace pybind { #ifdef PADDLE_WITH_HETERPS void BindPSGPUWrapper(py::module* m); +#ifdef PADDLE_WITH_PSLIB +void BindAfsWrapper(py::module* m); +#endif #endif } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 44abf3357d63d..c9e304e696df2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4458,6 +4458,9 @@ All parameter, weight, gradient are variables in Paddle. #endif #ifdef PADDLE_WITH_HETERPS BindPSGPUWrapper(&m); +#ifdef PADDLE_WITH_PSLIB + BindAfsWrapper(&m); +#endif #endif BindGlooWrapper(&m); BindBoxHelper(&m); From 9844aafb3d01f0d39c941d5dbc8ab45ec839890d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 8 Apr 2022 08:43:28 +0800 Subject: [PATCH 020/211] [Phi] Add swish yaml and final state api (#41479) * add swish yaml and final state api * skip mkldnn test * fix grad mkldnn test --- .../unittests/mkldnn/test_activation_mkldnn_op.py | 2 ++ .../fluid/tests/unittests/test_activation_op.py | 11 ++++++++++- python/paddle/nn/functional/activation.py | 5 +++-- python/paddle/utils/code_gen/api.yaml | 11 +++++++++++ python/paddle/utils/code_gen/backward.yaml | 10 ++++++++++ 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 4e4fe69d914fa..44263b89e1616 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -113,6 +113,7 @@ def setUp(self): super(TestMKLDNNSwishDim2, self).setUp() self.attrs["use_mkldnn"] = True + self.check_eager = False def init_dtype(self): self.dtype = np.float32 @@ -284,6 +285,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = {"use_mkldnn": True, "beta": beta} + self.check_eager = False def init_dtype(self): self.dtype = np.float32 diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 80fef6d37576f..58d8610ee352d 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2940,7 +2940,9 @@ def ref_swish(x): class TestSwish(TestActivation): def setUp(self): self.op_type = "swish" + self.python_api = paddle.nn.functional.swish self.init_dtype() + self.check_eager = True np.random.seed(1024) x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) @@ -2952,7 +2954,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + check_eager = False + if hasattr(self, 'check_eager'): + check_eager = self.check_eager + self.check_grad(['X'], 'Out', check_eager=check_eager) class TestSwishAPI(unittest.TestCase): @@ -2987,6 +2992,10 @@ def test_dygraph_api(self): self.assertEqual(np.allclose(out_ref, r.numpy()), True) paddle.enable_static() + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_api() + def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 62567fa2a6113..90283b632ef2b 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1181,8 +1181,9 @@ def swish(x, 
name=None): x = paddle.to_tensor(np.array([-2., 0., 1.])) out = F.swish(x) # [-0.238406, 0., 0.731059] """ - - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_swish(x, 1.0) + if _in_legacy_dygraph(): return _C_ops.swish(x, 'beta', 1.0) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index ca53766eb9c64..76f03f9ff8ca9 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1876,6 +1876,17 @@ data_type : x backward : sum_grad +# The python API paddle.nn.functional.swish has no `bete` argument, it may be removed later +- api : swish + args : (Tensor x, float beta=1.0) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : swish + backward : swish_grad + # take_along_axis - api : take_along_axis args : (Tensor x, Tensor index, int axis) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3640470503480..b32e015325bdc 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1410,6 +1410,16 @@ kernel : func : sum_grad +- backward_api : swish_grad + forward : swish (Tensor x, float beta=1.0) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float bete=1.0) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : swish_grad + - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int axis) From bc88fbb5b6ea0dd1edb019aba97d8affa4ac13c0 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 8 Apr 2022 08:46:09 +0800 Subject: [PATCH 021/211] Add conj pixel shuffle yaml (#41499) * ad conj flip yaml * add flip conj pixel shuffle --- paddle/fluid/operators/pixel_shuffle_op.cc | 42 +++---------------- paddle/phi/infermeta/unary.cc | 30 +++++++++++++ paddle/phi/infermeta/unary.h | 5 +++ .../fluid/tests/unittests/test_conj_op.py | 6 ++- .../paddle/fluid/tests/unittests/test_flip.py | 6 ++- .../tests/unittests/test_pixel_shuffle.py | 6 ++- python/paddle/tensor/manipulation.py | 4 ++ python/paddle/tensor/math.py | 3 ++ python/paddle/utils/code_gen/api.yaml | 4 +- python/paddle/utils/code_gen/backward.yaml | 29 +++++++++++++ 10 files changed, 91 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 21ca26f49f653..1724aedbe9b24 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -82,42 +82,6 @@ class PixelShuffleGradMaker : public framework::SingleGradOpMaker { class PixelShuffleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound("Input(Out@Grad) should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound("Output(X@Grad) should not be null")); - - auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(do_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or 
[N, H, W, C], but got %u.", - do_dims.size())); - - auto upscale_factor = ctx->Attrs().Get("upscale_factor"); - - const std::string data_format = - ctx->Attrs().Get("data_format"); - const bool channel_last = (data_format == "NHWC"); - - auto dx_dims = do_dims; - dx_dims[0] = do_dims[0]; - - if (!channel_last) { - dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); - dx_dims[2] = do_dims[2] / upscale_factor; - dx_dims[3] = do_dims[3] / upscale_factor; - } else { - dx_dims[1] = do_dims[1] / upscale_factor; - dx_dims[2] = do_dims[2] / upscale_factor; - dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor); - } - ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); - } }; } // namespace operators @@ -132,7 +96,11 @@ REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, PixelShuffleInferShapeFunctor); -REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle_grad, + PixelShuffleGradInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleGradInferMeta)); +REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp, + PixelShuffleGradInferShapeFunctor); REGISTER_OP_VERSION(pixel_shuffle) .AddCheckpoint( diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a81a0e1503a9b..c6e2cb761911e 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1315,6 +1315,36 @@ void PixelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void PixelShuffleGradInferMeta(const MetaTensor& out_grad, + int upscale_factor, + const std::string& data_format, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + PADDLE_ENFORCE_EQ(do_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + do_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + + if (!channel_last) { + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + } else { + dx_dims[1] = do_dims[1] / upscale_factor; + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor); + } + x_grad->set_dims(dx_dims); + x_grad->set_dtype(out_grad.dtype()); +} + void PNormInferMeta(const MetaTensor& x, float porder, int axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 63a1dd52bbb0f..c49e4c88dd899 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -200,6 +200,11 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void PixelShuffleGradInferMeta(const MetaTensor& out_grad, + int upscale_factor, + const std::string& data_format, + MetaTensor* x_grad); + void PNormInferMeta(const MetaTensor& x, float porder, int axis, diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py index 774a29ada4a84..fe9efc301fea7 100644 --- a/python/paddle/fluid/tests/unittests/test_conj_op.py +++ b/python/paddle/fluid/tests/unittests/test_conj_op.py @@ -32,6 +32,7 @@ class TestConjOp(OpTest): def setUp(self): self.op_type = "conj" + self.python_api = paddle.tensor.conj self.init_dtype_type() self.init_input_output() self.init_grad_input_output() @@ -53,14 +54,15 @@ def init_grad_input_output(self): self.grad_in = np.conj(self.grad_out) 
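PixelShuffleGradInferMeta above simply inverts the forward pixel_shuffle shape mapping. A standalone sketch of that arithmetic (illustrative only; the shapes and upscale factor are made-up examples and spatial sizes are assumed divisible by the factor):

# Mirrors the dx_dims computation in PixelShuffleGradInferMeta.
def pixel_shuffle_grad_shape(out_grad_shape, upscale_factor, data_format="NCHW"):
    n, d1, d2, d3 = out_grad_shape
    r = upscale_factor
    if data_format == "NCHW":
        # out_grad [N, C, H*r, W*r] -> x_grad [N, C*r*r, H, W]
        return [n, d1 * r * r, d2 // r, d3 // r]
    # NHWC: out_grad [N, H*r, W*r, C] -> x_grad [N, H, W, C*r*r]
    return [n, d1 // r, d2 // r, d3 * r * r]

# Forward maps x of shape [2, 9, 4, 4] with factor 3 to out [2, 1, 12, 12],
# so the gradient shape must map back the other way (and likewise for NHWC).
assert pixel_shuffle_grad_shape([2, 1, 12, 12], 3, "NCHW") == [2, 9, 4, 4]
assert pixel_shuffle_grad_shape([2, 12, 12, 1], 3, "NHWC") == [2, 4, 4, 9]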
def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad( ['X'], 'Out', user_defined_grads=[self.grad_in], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestComplexConjOp(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py index 5e2aacf9cefed..010d23bca51d7 100644 --- a/python/paddle/fluid/tests/unittests/test_flip.py +++ b/python/paddle/fluid/tests/unittests/test_flip.py @@ -67,6 +67,7 @@ def test_dygraph(self): class TestFlipOp(OpTest): def setUp(self): self.op_type = 'flip' + self.python_api = paddle.tensor.flip self.init_test_case() self.inputs = {'X': np.random.random(self.in_shape).astype('float64')} self.init_attrs() @@ -76,10 +77,10 @@ def init_attrs(self): self.attrs = {"axis": self.axis} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", check_eager=True) def init_test_case(self): self.in_shape = (6, 4, 2, 3) @@ -131,4 +132,5 @@ def init_test_case(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index f1a409c712fc3..06d975fe2b88f 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -52,6 +52,7 @@ def pixel_shuffle_np(x, up_factor, data_format="NCHW"): class TestPixelShuffleOp(OpTest): def setUp(self): self.op_type = "pixel_shuffle" + self.python_api = paddle.nn.functional.pixel_shuffle self.init_data_format() n, c, h, w = 2, 9, 4, 4 @@ -73,10 +74,10 @@ def init_data_format(self): self.format = "NCHW" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestChannelLast(TestPixelShuffleOp): @@ -220,4 +221,5 @@ def error_data_format_layer(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 0f90cf6950aff..d8021f36c211c 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -458,6 +458,10 @@ def flip(x, axis, name=None): """ if isinstance(axis, int): axis = [axis] + + if in_dygraph_mode(): + return _C_ops.final_state_flip(x, axis) + if paddle.in_dynamic_mode(): return _C_ops.flip(x, "axis", axis) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index a1d27ab904e82..298d7af96ea57 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3349,6 +3349,9 @@ def conj(x, name=None): # [(4-4j), (5-5j), (6-6j)]]) """ + if in_dygraph_mode(): + return _C_ops.final_state_conj(x) + if paddle.in_dynamic_mode(): return _C_ops.conj(x) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 76f03f9ff8ca9..3a76e89bbb727 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -345,6 +345,7 @@ func : UnchangedInferMeta kernel : func : conj + backward : conj_grad - api : conv2d args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, 
int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) @@ -659,6 +660,7 @@ func : FlipInferMeta kernel : func : flip + backward : flip_grad - api : floor args : (Tensor x) @@ -1430,7 +1432,7 @@ func : PixelShuffleInferMeta kernel : func : pixel_shuffle - # backward : pixel_shuffle_grad + backward : pixel_shuffle_grad # poisson // no need grad - api : poisson diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index b32e015325bdc..3456fe3260abc 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -208,6 +208,16 @@ output : Tensor[](x_grad) invoke : concat_grad_impl(x, out_grad, axis) +- backward_api : conj_grad + forward : conj (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : conj + - backward_api : conv2d_grad forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) @@ -456,6 +466,16 @@ backend: out_grad layout: out_grad +- backward_api : flip_grad + forward : flip (Tensor x, int[] axis) -> Tensor(out) + args : (Tensor out_grad, int[] axis) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : flip + - backward_api : floor_grad forward : floor(Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -1010,6 +1030,15 @@ kernel : func : pad3d_grad +- backward_api : pixel_shuffle_grad + forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) + args : (Tensor out_grad, int upscale_factor, str data_format) + output : Tensor(x_grad) + infer_meta : + func : PixelShuffleGradInferMeta + kernel : + func : pixel_shuffle_grad + - backward_api : pool2d_grad forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) From 0cd577cfc3496a1f4e2b50895c739d654cbc8850 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 8 Apr 2022 10:04:39 +0800 Subject: [PATCH 022/211] pybind support CustomPlace (#41136) --- paddle/fluid/pybind/imperative.cc | 6 +++++ paddle/fluid/pybind/pybind.cc | 37 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7df6d8f7f791c..e09c205db14e7 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2182,6 +2182,7 @@ void BindImperative(py::module *m_ptr) { m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); m.def( @@ -2341,6 +2342,11 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs 
kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + m.def("pylayer_apply", + [](const platform::CustomPlace &place, const py::object &cls, + const py::args args, const py::kwargs kwargs) { + return imperative::PyLayerApply(place, cls, args, kwargs); + }); #if defined(PADDLE_WITH_CUDA) m.def("to_uva_tensor", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c9e304e696df2..396c6c5e42d37 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -845,6 +845,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, const std::string &layout) { self.set_layout(StringToDataLayout(layout)); }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) .def("_alloc_float", [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); @@ -873,6 +877,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](framework::Tensor &self, paddle::platform::XPUPlace &place) { self.mutable_data(place); @@ -901,6 +909,12 @@ PYBIND11_MODULE(core_noavx, m) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) + .def("_mutable_data", + [](framework::Tensor &self, paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { @@ -934,6 +948,8 @@ PYBIND11_MODULE(core_noavx, m) { }) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) + .def("_copy_from", &TensorCopyFrom, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, @@ -948,6 +964,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -1985,6 +2003,19 @@ All parameter, weight, gradient are variables in Paddle. "Please recompile or reinstall Paddle with NPU support.")); #else return new paddle::platform::NPUDeviceContext(place); +#endif + }) + .def_static("create", + [](paddle::platform::CustomPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with " + "CustomDevice support.")); +#else + return new paddle::platform::CustomDeviceContext(place); #endif }) .def_static("create", @@ -2722,6 +2753,12 @@ All parameter, weight, gradient are variables in Paddle. 
pybind11::gil_scoped_release release; self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::CustomPlace &place) { + pybind11::gil_scoped_release release; + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", From 1ed1a97b6ffbcd8dc3744fb7009cb7097eb36a20 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:27:07 +0800 Subject: [PATCH 023/211] Fix cv2 import error and some issues for lamb (#41500) * fix image cv2 import * fix lamb --- python/paddle/dataset/image.py | 5 ++++- .../paddle/incubate/optimizer/distributed_fused_lamb.py | 8 ++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index c36213282c59c..a094529edf575 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -54,7 +54,10 @@ if retcode != 0: cv2 = None else: - import cv2 + try: + import cv2 + except ImportError: + cv2 = None else: try: import cv2 diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 12a88106a44cd..74b5398230dee 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -17,7 +17,7 @@ from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.initializer import Constant from paddle.fluid.layer_helper import LayerHelper -from paddle.optimizer import Optimizer +from paddle.fluid.optimizer import Optimizer from paddle.distributed import get_rank, get_world_size from paddle.fluid.executor import global_scope from paddle.fluid.framework import name_scope @@ -42,11 +42,7 @@ def __init__(self, assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" super(DistributedFusedLamb, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - weight_decay=None, - grad_clip=None, - name=name) + learning_rate=learning_rate, grad_clip=None, name=name) self._beta1 = beta1 self._beta2 = beta2 From 770ce7cf1e7195c0ea5a4fe6282c2036ecdebefd Mon Sep 17 00:00:00 2001 From: taixiurong Date: Fri, 8 Apr 2022 10:30:05 +0800 Subject: [PATCH 024/211] xpu mul unittest *test=kunlun (#41140) --- paddle/fluid/operators/mul_op_xpu.cc | 62 +++--- .../fluid/platform/device/xpu/xpu2_op_list.h | 8 +- .../tests/unittests/xpu/test_mul_op_xpu.py | 186 ++++++++++-------- 3 files changed, 143 insertions(+), 113 deletions(-) diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 6ef41e059c7d9..7410b3b607c82 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -28,6 +30,8 @@ using framework::Tensor; template class MulXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { const Tensor* x = context.Input("X"); @@ -62,14 +66,15 @@ class MulXPUKernel : public framework::OpKernel { const T* data_b = y_matrix.data(); T* data_c = z->data(); auto& dev_ctx = context.template device_context(); - int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, - alpha, data_a, data_b, beta, data_c); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, nullptr, + nullptr, nullptr, k, n, n, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + if (z_dim.size() != 2) { z->Resize(z_dim); } @@ -78,6 +83,8 @@ class MulXPUKernel : public framework::OpKernel { template class MulGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { int x_num_col_dims = ctx.template Attr("x_num_col_dims"); @@ -126,14 +133,14 @@ class MulGradXPUKernel : public framework::OpKernel { const T* data_a = dout->data(); const T* data_b = y_matrix.data(); T* data_c = dx_matrix.data(); - int ret = - xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, - data_a, lda, data_b, ldb, beta, data_c, ldc); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, + nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); } if (dy) { @@ -159,14 +166,14 @@ class MulGradXPUKernel : public framework::OpKernel { const T* data_a = x_matrix.data(); const T* data_b = dout->data(); T* data_c = dy_matrix.data(); - int ret = - xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, - data_a, lda, data_b, ldb, beta, data_c, ldc); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, + nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); } } }; @@ -175,9 +182,12 @@ class MulGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - mul, ops::MulXPUKernel); + mul, ops::MulXPUKernel, + ops::MulXPUKernel); REGISTER_OP_XPU_KERNEL( - mul_grad, 
ops::MulGradXPUKernel) + mul_grad, ops::MulGradXPUKernel, + ops::MulGradXPUKernel) #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 15db243f751a6..08a7f08006957 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -70,8 +70,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_add_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -249,6 +251,8 @@ XPUOpMap& get_kl2_ops() { {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"nearest_interp_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"nearest_interp_v2_grad", diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py index 58a8fa3083055..9d98ab70041e9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py @@ -27,104 +27,120 @@ paddle.enable_static() +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMulOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): # The input type of mul_op must be Variable. x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) x2 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) self.assertRaises(TypeError, fluid.layers.mul, x1, x2) - # The input dtype of mul_op must be float32 or float64. + # The input dtype of mul_op must be float32. 
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32") x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32") self.assertRaises(TypeError, fluid.layers.mul, x3, x4) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp1(XPUOpTest): - def setUp(self): - self.op_type = "mul" - self.dtype = np.float32 - self.use_xpu = True - self.init_dtype_type() - self.inputs = { - 'X': np.random.random((3, 4, 2, 9)).astype(self.dtype), - 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype) - } - self.attrs = { - 'x_num_col_dims': 2, - 'y_num_col_dims': 2, - } - result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), - self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) - result = result.reshape(3, 4, 1, 2, 3) - self.outputs = {'Out': result} - - def init_dtype_type(self): - pass - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=0.01) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ignore_y(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp2(XPUOpTest): - def setUp(self): - self.op_type = "mul" - self.use_xpu = True - self.dtype = np.float32 - self.init_dtype_type() - self.inputs = { - 'X': np.random.random((20, 5)).astype(self.dtype), - 'Y': np.random.random((5, 21)).astype(self.dtype) - } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=0.01) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - +class XPUTestMulOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'mul' + self.use_dynamic_create_class = False + + class TestXPUMulOp1(XPUOpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = self.in_type + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.in_type_str), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.in_type_str) + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + def test_check_output(self): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=0.01) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1) + + def test_check_grad_ingore_x(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + 
self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.1, + no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.1, + no_grad_set=set('Y')) + + class TestXPUMulOp2(XPUOpTest): + def setUp(self): + self.op_type = "mul" + self.use_xpu = True + self.dtype = self.in_type + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.in_type_str), + 'Y': np.random.random((5, 21)).astype(self.in_type_str) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_output_with_place(place, atol=0.01) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1) + + def test_check_grad_ingore_x(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.1, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.1, + no_grad_set=set('Y')) + + +support_types = get_xpu_op_support_types('mul') +for stype in support_types: + create_test_class(globals(), XPUTestMulOp, stype) if __name__ == "__main__": + paddle.enable_static() unittest.main() From 14dba636e4924b68c298e86b28ca2ec73a092c8e Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 8 Apr 2022 12:22:41 +0800 Subject: [PATCH 025/211] [ROCm] fix dcu error in device event base, test=develop (#41521) * [ROCm] fix dcu error in device event base, test=develop * fix, test=develop --- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/device_event_test.cc | 52 ++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 57f45a40165d7..463329d32c936 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -29,7 +29,7 @@ using ::paddle::platform::kCPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index a811a5b9c130d..f42ccc5a1db54 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index 96e89f9257dd2..d9f744b26256b 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -75,6 +75,58 @@ TEST(DeviceEvent, CUDA) { } #endif +#ifdef PADDLE_WITH_HIP +#include + +TEST(DeviceEvent, CUDA) { + VLOG(1) << "In Test"; + using paddle::platform::CUDAPlace; + + auto& pool = DeviceContextPool::Instance(); + auto place = CUDAPlace(0); + auto* context = + 
static_cast(pool.Get(place)); + + ASSERT_NE(context, nullptr); + // case 1. test for event_creator + DeviceEvent event(place); + ASSERT_NE(event.GetEvent().get(), nullptr); + bool status = event.Query(); + ASSERT_EQ(status, true); + // case 2. test for event_recorder + event.Record(context); + status = event.Query(); + ASSERT_EQ(status, false); + // case 3. test for event_finisher + event.Finish(); + status = event.Query(); + ASSERT_EQ(status, true); + + // case 4. test for event_waiter + float *src_fp32, *dst_fp32; + int size = 1000000 * sizeof(float); + hipMallocHost(reinterpret_cast(&src_fp32), size); + hipMalloc(reinterpret_cast(&dst_fp32), size); + hipMemcpyAsync(dst_fp32, src_fp32, size, hipMemcpyHostToDevice, + context->stream()); + event.Record(context); // step 1. record it + status = event.Query(); + ASSERT_EQ(status, false); + + event.Wait(kCUDA, context); // step 2. add streamWaitEvent + status = event.Query(); + ASSERT_EQ(status, false); // async + + event.Wait(kCPU, context); // step 3. EventSynchornize + status = event.Query(); + ASSERT_EQ(status, true); // sync + + // release resource + hipFree(dst_fp32); + hipFreeHost(src_fp32); +} +#endif + TEST(DeviceEvent, CPU) { using paddle::platform::CPUPlace; auto place = CPUPlace(); From f43af2759c9fc6e8aed797f3bb96c126f0624b87 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 8 Apr 2022 14:30:58 +0800 Subject: [PATCH 026/211] Refine statistic table (#41524) --- .../unittests/test_profiler_statistic.py | 88 +++---- python/paddle/profiler/profiler_statistic.py | 231 ++++++++++++------ 2 files changed, 205 insertions(+), 114 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index adc42d0447f34..dc944e68c7f55 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -185,20 +185,22 @@ def test_statistic_case1(self): profiler.TracerEventType.Communication), 5) self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) - self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( event_summary.model_perspective_items['Forward'].cpu_time, 100) self.assertEqual( - event_summary.model_perspective_items['Forward'].gpu_time, 135) + event_summary.model_perspective_items['Forward'].general_gpu_time, + 135) self.assertEqual( - event_summary.model_perspective_items['Backward'].gpu_time, 0) + event_summary.model_perspective_items['Backward'].general_gpu_time, + 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual( - event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] + .general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, @@ -226,31 +228,31 @@ def test_statistic_case2(self): userdefined_node = HostPythonNode('Communication Time', profiler.TracerEventType.UserDefined, 100, 110, 1000, 1001) - reduce_all_launchkernel0 = HostPythonNode( + allreduce_launchkernel0 = HostPythonNode( 
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, 1000, 1001) - nccl_reduce_all_kernel0 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120, + nccl_allreduce_kernel0 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 105, 120, 0, 0, 2) communication_node = HostPythonNode( 'Communication', profiler.TracerEventType.Communication, 105, 110, 1000, 1001) - reduce_all_op1 = HostPythonNode('reduce_all_op1', - profiler.TracerEventType.Operator, 105, - 108, 1000, 1001) - reduce_all_op1_infershape = HostPythonNode( - 'reduce_all_op1::infershape', - profiler.TracerEventType.OperatorInner, 105, 106, 1000, 1001) + allreduce_op1 = HostPythonNode('allreduce_op1', + profiler.TracerEventType.Operator, 105, + 108, 1000, 1001) + allreduce_op1_infershape = HostPythonNode( + 'allreduce_op1::infershape', profiler.TracerEventType.OperatorInner, + 105, 106, 1000, 1001) - reduce_all_launchkernel1 = HostPythonNode( + allreduce_launchkernel1 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107, 1000, 1001) - nccl_reduce_all_kernel1 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150, + nccl_allreduce_kernel1 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 130, 150, 0, 0, 2) backward_node = HostPythonNode('Gradient Backward', @@ -305,19 +307,19 @@ def test_statistic_case2(self): 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, 0, 0, 1) - reduce_all_node2 = HostPythonNode('reduce_all', - profiler.TracerEventType.Operator, - 230, 250, 1000, 1001) + allreduce_node2 = HostPythonNode('allreduce', + profiler.TracerEventType.Operator, 230, + 250, 1000, 1001) - reduce_all_node2_infershape = HostPythonNode( - 'reduce_all_node2::infershape', + allreduce_node2_infershape = HostPythonNode( + 'allreduce_node2::infershape', profiler.TracerEventType.OperatorInner, 231, 232, 1000, 1001) - reduce_all_launchkernel2 = HostPythonNode( + allreduce_launchkernel2 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240, 1000, 1001) - nccl_reduce_all_kernel2 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280, + nccl_allreduce_kernel2 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 250, 280, 0, 0, 2) root_node.children_node.append(profilerstep_node) @@ -329,12 +331,12 @@ def test_statistic_case2(self): yolonet_node.children_node.extend( [sync_batch_norm_node, userdefined_node]) userdefined_node.children_node.append(communication_node) - userdefined_node.runtime_node.append(reduce_all_launchkernel0) - reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0) - communication_node.children_node.append(reduce_all_op1) - reduce_all_op1.children_node.append(reduce_all_op1_infershape) - reduce_all_op1.runtime_node.append(reduce_all_launchkernel1) - reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1) + userdefined_node.runtime_node.append(allreduce_launchkernel0) + allreduce_launchkernel0.device_node.append(nccl_allreduce_kernel0) + communication_node.children_node.append(allreduce_op1) + allreduce_op1.children_node.append(allreduce_op1_infershape) + allreduce_op1.runtime_node.append(allreduce_launchkernel1) + allreduce_launchkernel1.device_node.append(nccl_allreduce_kernel1) conv2d_node.children_node.extend( [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) conv2d_compute.runtime_node.append(conv2d_launchkernel) @@ 
-350,10 +352,10 @@ def test_statistic_case2(self): sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) - optimization_node.children_node.append(reduce_all_node2) - reduce_all_node2.children_node.append(reduce_all_node2_infershape) - reduce_all_node2.runtime_node.append(reduce_all_launchkernel2) - reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2) + optimization_node.children_node.append(allreduce_node2) + allreduce_node2.children_node.append(allreduce_node2_infershape) + allreduce_node2.runtime_node.append(allreduce_launchkernel2) + allreduce_launchkernel2.device_node.append(nccl_allreduce_kernel2) thread_tree = {'thread1001': root_node} extra_info = { 'Process Cpu Utilization': '1.02', @@ -415,20 +417,22 @@ def test_statistic_case2(self): distributed_summary.overlap_range), 85) self.assertEqual(len(event_summary.items), 4) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) - self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( event_summary.model_perspective_items['Forward'].cpu_time, 100) self.assertEqual( - event_summary.model_perspective_items['Forward'].gpu_time, 315) + event_summary.model_perspective_items['Forward'].general_gpu_time, + 315) self.assertEqual( - event_summary.model_perspective_items['Backward'].gpu_time, 0) + event_summary.model_perspective_items['Backward'].general_gpu_time, + 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual( - event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] + .general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 3be6088a484b8..5fed51476132e 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -28,7 +28,7 @@ TracerEventType.PythonOp, TracerEventType.PythonUserDefined ] -_CommunicationOpName = ['reduce', 'broadcast', 'rpc'] +_CommunicationOpName = ['allreduce', 'broadcast', 'rpc'] class SortedKeys(Enum): @@ -74,8 +74,10 @@ def __init__(self, hostnode): self.runtime_node = [] self.cpu_time = 0 self.self_cpu_time = 0 - self.gpu_time = 0 + self.gpu_time = 0 # kernel time self.self_gpu_time = 0 + self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset + self.self_general_gpu_time = 0 def cal_statistic(self): for child in self.children_node: @@ -86,14 +88,20 @@ def cal_statistic(self): self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: self.gpu_time += child.gpu_time + self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) for rt in self.runtime_node: self.self_cpu_time -= (rt.end_ns - rt.start_ns) self.gpu_time += rt.gpu_time self.self_gpu_time += rt.gpu_time + self.general_gpu_time += rt.general_gpu_time + self.self_general_gpu_time += 
rt.general_gpu_time for device in self.hostnode.device_node: - self.gpu_time += (device.end_ns - device.start_ns) - self.self_gpu_time += (device.end_ns - device.start_ns) + if device.type == TracerEventType.Kernel: + self.gpu_time += (device.end_ns - device.start_ns) + self.self_gpu_time += (device.end_ns - device.start_ns) + self.general_gpu_time += (device.end_ns - device.start_ns) + self.self_general_gpu_time += (device.end_ns - device.start_ns) @property def end_ns(self): @@ -258,6 +266,8 @@ def __init__(self): self.communication_range = [] self.computation_range = [] self.overlap_range = [] + self.cpu_calls = 0 + self.gpu_calls = 0 def parse(self, nodetrees): ''' @@ -300,6 +310,8 @@ def parse(self, nodetrees): else: self.computation_range.append(( devicenode.start_ns, devicenode.end_ns)) + self.cpu_calls = len(set(self.cpu_communication_range)) + self.gpu_calls = len(set(self.gpu_communication_range)) self.cpu_communication_range = merge_self_ranges( self.cpu_communication_range, is_sorted=False) self.gpu_communication_range = merge_self_ranges( @@ -354,6 +366,9 @@ def __init__(self, name): self.min_gpu_time = float('inf') self.devices = {} self.operator_inners = {} + self.general_gpu_time = 0 + self.min_general_gpu_time = float('inf') + self.max_general_gpu_time = 0 @property def avg_cpu_time(self): @@ -363,6 +378,10 @@ def avg_cpu_time(self): def avg_gpu_time(self): return self.gpu_time / self.call + @property + def avg_general_gpu_time(self): + return self.general_gpu_time / self.call + def add_cpu_time(self, time): if time > self.max_cpu_time: self.max_cpu_time = time @@ -377,6 +396,13 @@ def add_gpu_time(self, time): self.min_gpu_time = time self.gpu_time += time + def add_general_gpu_time(self, time): + if time > self.max_general_gpu_time: + self.max_general_gpu_time = time + if time < self.min_general_gpu_time: + self.min_general_gpu_time = time + self.general_gpu_time += time + def add_call(self): self.call += 1 @@ -384,6 +410,7 @@ def add_item(self, node): self.add_call() self.add_cpu_time(node.cpu_time) self.add_gpu_time(node.gpu_time) + self.add_general_gpu_time(node.general_gpu_time) for child in node.children_node: if child.name not in self.operator_inners: self.operator_inners[ @@ -407,6 +434,9 @@ def __init__(self, name): self.gpu_time = 0 self.max_gpu_time = 0 self.min_gpu_time = float('inf') + self.general_gpu_time = 0 + self.min_general_gpu_time = float('inf') + self.max_general_gpu_time = 0 @property def avg_cpu_time(self): @@ -416,6 +446,10 @@ def avg_cpu_time(self): def avg_gpu_time(self): return self.gpu_time / self.call + @property + def avg_general_gpu_time(self): + return self.general_gpu_time / self.call + def add_cpu_time(self, time): if time > self.max_cpu_time: self.max_cpu_time = time @@ -430,6 +464,13 @@ def add_gpu_time(self, time): self.min_gpu_time = time self.gpu_time += time + def add_general_gpu_time(self, time): + if time > self.max_general_gpu_time: + self.max_general_gpu_time = time + if time < self.min_general_gpu_time: + self.min_general_gpu_time = time + self.general_gpu_time += time + def add_call(self): self.call += 1 @@ -437,6 +478,7 @@ def add_item(self, node): self.add_call() self.add_cpu_time(node.cpu_time) self.add_gpu_time(node.gpu_time) + self.add_general_gpu_time(node.general_gpu_time) def __init__(self): self.items = {} # for operator summary @@ -478,6 +520,8 @@ def parse(self, nodetrees): self.add_model_perspective_item( child) #find first model perspective node else: + if child.type == TracerEventType.ProfileStep: + 
self.add_model_perspective_item(child) deque.append(child) def add_operator_item(self, operator_node): @@ -533,6 +577,8 @@ def add_model_perspective_item(self, model_perspective_node): name = 'Optimization' elif model_perspective_node.type == TracerEventType.Dataloader: name = 'Dataloader' + elif model_perspective_node.type == TracerEventType.ProfileStep: + name = 'ProfileStep' else: return if name not in self.model_perspective_items: @@ -626,7 +672,6 @@ def format_ratio(ratio, indent=0): # construct table string append(add_title(line_length, "Device Summary")) - append('Time unit: {}'.format(time_unit)) append(header_sep) append(row_format.format(*headers)) append(header_sep) @@ -661,7 +706,7 @@ def format_ratio(ratio, indent=0): return ''.join(result) ###### Print Overview Summary ###### - headers = ['Event Type', 'CPU Time', 'Ratio (%)'] + headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)'] row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] @@ -680,13 +725,13 @@ def format_ratio(ratio, indent=0): append(header_sep) append(row_format.format(*headers)) append(header_sep) - row_values = [ - 'Total Time', format_time( - total_time, unit=time_unit), format_ratio(1) - ] - append(row_format.format(*row_values)) cpu_type_time = collections.defaultdict(int) gpu_type_time = collections.defaultdict(int) + cpu_call_times = collections.defaultdict(int) + gpu_call_times = collections.defaultdict(int) + cpu_call_times.update(statistic_data.time_range_summary.call_times) + gpu_call_times.update(statistic_data.time_range_summary.call_times) + for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( ): if event_type != TracerEventType.Communication: @@ -694,6 +739,9 @@ def format_ratio(ratio, indent=0): if statistic_data.distributed_summary.cpu_communication_range: cpu_type_time[TracerEventType.Communication] = sum_ranges( statistic_data.distributed_summary.cpu_communication_range) + cpu_call_times[ + TracerEventType. + Communication] = statistic_data.distributed_summary.cpu_calls gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -706,22 +754,34 @@ def format_ratio(ratio, indent=0): if statistic_data.distributed_summary.gpu_communication_range: gpu_type_time[TracerEventType.Communication] = sum_ranges( statistic_data.distributed_summary.gpu_communication_range) + gpu_call_times[ + TracerEventType. 
+ Communication] = statistic_data.distributed_summary.gpu_calls sorted_items = sorted( cpu_type_time.items(), key=lambda x: x[1], reverse=True) - for event_type, time in sorted_items: + event_type, time = sorted_items[0] + row_values = [ + '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type], + format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + for event_type, time in sorted_items[1:]: row_values = [ - ' {}'.format(str(event_type).split('.')[1]), format_time( + ' {}'.format(str(event_type).split('.')[1]), + cpu_call_times[event_type], format_time( time, unit=time_unit), format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) append(header_sep) - headers = ['', 'GPU Time', 'Ratio (%)'] + headers = ['', 'Calls', 'GPU Time', 'Ratio (%)'] append(row_format.format(*headers)) append(header_sep) for event_type, time in gpu_type_time.items(): row_values = [ - ' {}'.format(str(event_type).split('.')[1]), format_time( + ' {}'.format(str(event_type).split('.')[1]), + gpu_call_times[event_type], format_time( time, unit=time_unit), format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) @@ -730,7 +790,7 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, We sum up all collected events in terms of event type.\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" - "Ratio = CPU(GPU) Time / Total Time.\n" + "The time with ratio 100% is the base time for calculating ratio. \n" "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n" "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" "Example:\n" @@ -746,21 +806,21 @@ def format_ratio(ratio, indent=0): ###### Print Model Summary Report ###### model_perspective_items = statistic_data.event_summary.model_perspective_items - if model_perspective_items: + if len(model_perspective_items) > 1: all_row_values = [] - row_values = [ - 'Total Time', '-', '{} / - / - / - / {}'.format( - format_time( - total_time, unit=time_unit), format_ratio(1)), - '- / - / - / -/ -' - ] - all_row_values.append(row_values) accmulation_time = 0 - for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']: + gpu_accmulation_time = 0 + gpu_total_time = 0 + for name in [ + 'ProfileStep', 'Dataloader', 'Forward', 'Backward', + 'Optimization' + ]: if name in model_perspective_items: item = model_perspective_items[name] + name = '{}'.format( + name) if 'ProfileStep' in name else ' {}'.format(name) row_values = [ - ' {}'.format(name), item.call, + '{}'.format(name), item.call, '{} / {} / {} / {} / {}'.format( format_time( item.cpu_time, unit=time_unit), @@ -783,15 +843,23 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.gpu_time) / total_time)) ] all_row_values.append(row_values) - accmulation_time += item.cpu_time + if 'ProfileStep' not in name: + accmulation_time += item.cpu_time + gpu_accmulation_time += item.gpu_time + else: + gpu_total_time = item.gpu_time other_time = total_time - accmulation_time + other_gpu_time = gpu_total_time - gpu_accmulation_time row_values = [ ' Others', '-', '{} / - / - / - / {}'.format( format_time( other_time, unit=time_unit), format_ratio(float(other_time) / total_time)), - '- / - / - / - / -' + '{} / - / - / - / {}'.format( + format_time( + other_gpu_time, unit=time_unit), + 
format_ratio(float(other_gpu_time) / gpu_total_time)) ] all_row_values.append(row_values) # Calculate the column width @@ -835,6 +903,7 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" + "The time with ratio 100% is the base time for calculating ratio. \n" ) append('-' * line_length) append('') @@ -872,21 +941,27 @@ def format_ratio(ratio, indent=0): overlap_time = sum_ranges( statistic_data.distributed_summary.overlap_range) row_values = [ - 'Communication', format_time( + 'ProfileStep', format_time( + total_time, unit=time_unit), + format_ratio(float(total_time) / total_time) + ] + append(row_format.format(*row_values)) + row_values = [ + ' Communication', format_time( communication_time, unit=time_unit), format_ratio(float(communication_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - 'Computation', format_time( + ' Computation', format_time( computation_time, unit=time_unit), format_ratio(float(computation_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - 'Overlap', format_time( + ' Overlap', format_time( overlap_time, unit=time_unit), format_ratio(float(overlap_time) / total_time) ] @@ -896,6 +971,7 @@ def format_ratio(ratio, indent=0): "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "Overlap time: Communication time intersects with computation time.\n" + "The time with ratio 100% is the base time for calculating ratio. \n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -938,20 +1014,22 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: sorted_items = sorted( - items.items(), key=lambda x: x[1].gpu_time, reverse=True) + items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: sorted_items = sorted( items.items(), - key=lambda x: x[1].avg_gpu_time, + key=lambda x: x[1].avg_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMax: sorted_items = sorted( items.items(), - key=lambda x: x[1].max_gpu_time, + key=lambda x: x[1].max_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( - items.items(), key=lambda x: x[1].min_gpu_time) + items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: row_values = [ @@ -967,14 +1045,15 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)) + item.min_general_gpu_time, unit=time_unit), + format_ratio( + float(item.general_gpu_time) / total_time)) ] all_row_values.append(row_values) if op_detail: @@ -998,18 +1077,23 @@ def format_ratio(ratio, indent=0): float(innerop_node.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - innerop_node.gpu_time, 
unit=time_unit), + innerop_node.general_gpu_time, + unit=time_unit), format_time( - innerop_node.avg_gpu_time, unit=time_unit), + innerop_node.avg_general_gpu_time, + unit=time_unit), format_time( - innerop_node.max_gpu_time, unit=time_unit), + innerop_node.max_general_gpu_time, + unit=time_unit), format_time( - innerop_node.min_gpu_time, unit=time_unit), + innerop_node.min_general_gpu_time, + unit=time_unit), format_ratio( - float(innerop_node.gpu_time) / total_time)) + float(innerop_node.general_gpu_time) / + total_time)) ] all_row_values.append(row_values) - for device_node_name, devicenode in innerop_node.devices.items( + for device_node_name, device_node in innerop_node.devices.items( ): if len(device_node_name) + 4 > name_column_width: device_node_name = device_node_name[: @@ -1018,21 +1102,21 @@ def format_ratio(ratio, indent=0): device_node_name += "..." row_values = [ ' {}'.format(device_node_name), - devicenode.call, '- / - / - / - / -', + device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( - devicenode.gpu_time, unit=time_unit), + device_node.gpu_time, unit=time_unit), format_time( - devicenode.avg_gpu_time, + device_node.avg_gpu_time, unit=time_unit), format_time( - devicenode.max_gpu_time, + device_node.max_gpu_time, unit=time_unit), format_time( - devicenode.min_gpu_time, + device_node.min_gpu_time, unit=time_unit), format_ratio( - float(devicenode.gpu_time) / + float(device_node.gpu_time) / total_time)) ] all_row_values.append(row_values) @@ -1043,19 +1127,19 @@ def format_ratio(ratio, indent=0): - 5] device_node_name += "..." row_values = [ - ' {}'.format(device_node_name), devicenode.call, + ' {}'.format(device_node_name), device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( - devicenode.gpu_time, unit=time_unit), + device_node.gpu_time, unit=time_unit), format_time( - devicenode.avg_gpu_time, unit=time_unit), + device_node.avg_gpu_time, unit=time_unit), format_time( - devicenode.max_gpu_time, unit=time_unit), + device_node.max_gpu_time, unit=time_unit), format_time( - devicenode.min_gpu_time, unit=time_unit), + device_node.min_gpu_time, unit=time_unit), format_ratio( - float(devicenode.gpu_time) / total_time)) + float(device_node.gpu_time) / total_time)) ] all_row_values.append(row_values) # Calculate the column width @@ -1123,14 +1207,14 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)), + item.min_general_gpu_time, unit=time_unit), + format_ratio(float(item.general_gpu_time) / total_time)), ] all_row_values.append(row_values) @@ -1207,20 +1291,22 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: sorted_items = sorted( - items.items(), key=lambda x: x[1].gpu_time, reverse=True) + items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: sorted_items = sorted( items.items(), - key=lambda x: x[1].avg_gpu_time, + key=lambda x: x[1].avg_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMax: sorted_items = sorted( items.items(), 
- key=lambda x: x[1].max_gpu_time, + key=lambda x: x[1].max_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( - items.items(), key=lambda x: x[1].min_gpu_time) + items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: row_values = [ @@ -1238,14 +1324,15 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)), + item.min_general_gpu_time, unit=time_unit), + format_ratio( + float(item.general_gpu_time) / total_time)), ] all_row_values.append(row_values) From acc25d0b5d3e351d524e3818db1ad5611f0735fa Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Fri, 8 Apr 2022 14:39:01 +0800 Subject: [PATCH 027/211] tensor fluid code transfer part1 (#41094) --- python/paddle/common_ops_import.py | 2 +- .../tests/unittests/test_multiplex_op.py | 8 +- python/paddle/framework/__init__.py | 3 + python/paddle/nn/functional/extension.py | 2 +- .../paddle/tensor/layer_function_generator.py | 382 +++++++++++++ python/paddle/tensor/logic.py | 2 +- python/paddle/tensor/math.py | 350 ++++++++++-- python/paddle/tensor/ops.py | 532 ++++++++++++++++++ 8 files changed, 1230 insertions(+), 51 deletions(-) create mode 100644 python/paddle/tensor/layer_function_generator.py create mode 100644 python/paddle/tensor/ops.py diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index 9897480858946..de8056f280a39 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -22,7 +22,7 @@ from paddle.fluid import core, dygraph_utils from paddle.fluid.data_feeder import check_type, check_dtype, check_variable_and_dtype, convert_dtype from paddle.fluid.layers import fill_constant, utils, scale -from paddle.fluid.layers.layer_function_generator import templatedoc +from paddle.tensor.layer_function_generator import templatedoc import paddle.fluid as fluid import numpy import warnings diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index a840586d78db0..a26eed12246e4 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -68,26 +68,26 @@ def test_errors(self): def test_list(): # the inputs type must be list - fluid.layers.multiplex(inputs=x1, index=index) + paddle.multiplex(inputs=x1, index=index) self.assertRaises(TypeError, test_list) def test_len(): - fluid.layers.multiplex(inputs=[x1], index=index) + paddle.multiplex(inputs=[x1], index=index) self.assertRaises(ValueError, test_len) def test_type(): y1 = fluid.data(name='y1', shape=[None, 2], dtype='int16') y2 = fluid.data(name='y2', shape=[None, 2], dtype='int16') - fluid.layers.multiplex(inputs=[y1, y2], index=index) + paddle.multiplex(inputs=[y1, y2], index=index) self.assertRaises(TypeError, test_type) def test_type2(): index2 = fluid.data( name='index2', shape=[None, 1], dtype='int16') - fluid.layers.multiplex(inputs=[x1, x2], index=index2) + paddle.multiplex(inputs=[x1, x2], index=index2) 
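# For contrast with the TypeError/ValueError cases exercised above, a minimal
# sketch of a valid call, lifted from the paddle.multiplex docstring example
# added later in this same patch: two float32 inputs of identical shape and an
# int32 index of shape [M, 1].
import paddle
import numpy as np

img1 = np.array([[1, 2], [3, 4]]).astype(np.float32)
img2 = np.array([[5, 6], [7, 8]]).astype(np.float32)
inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)]
index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32))
res = paddle.multiplex(inputs, index)  # [[5., 6.], [3., 4.]]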
self.assertRaises(TypeError, test_type2) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 2f8c23187e8d1..ffd1607fe87b4 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -53,4 +53,7 @@ from ..fluid.framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder # noqa: F401 from ..fluid.framework import _dygraph_tracer # noqa: F401 +from ..fluid.layer_helper import LayerHelper # noqa: F401 +from ..fluid.framework import in_dygraph_mode # noqa: F401 + __all__ = [] diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 6a8686b612e7f..2483eab6c053a 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -20,7 +20,7 @@ from ...static import Variable from ...tensor.creation import assign from ...fluid import dygraph_utils -from ...fluid.layers.layer_function_generator import templatedoc +from ...tensor.layer_function_generator import templatedoc from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 from paddle import in_dynamic_mode diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py new file mode 100644 index 0000000000000..ecb13613a125e --- /dev/null +++ b/python/paddle/tensor/layer_function_generator.py @@ -0,0 +1,382 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import re +import functools +import warnings +import string + +from six.moves import cStringIO +from ..static import Variable +from ..fluid.proto import framework_pb2 +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_ +from ..framework import LayerHelper +from ..fluid.data_feeder import check_variable_and_dtype +import paddle +from paddle import _C_ops + +__all__ = [] + + +def _convert_(name): + """ + Formatting. + + Args: + name: The name/alias + + This function takes in a name and converts it to a standard format of + group1_group2. Where as per the regular expression, group1 can have + alphabets and numbers and group2 has capital alphabets. 
+ + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + +_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$") +_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$") +_two_bang_pattern_ = re.compile(r"!!([^!]+)!!") + + +def escape_math(text): + #return _two_bang_pattern_.sub( + # r'$$\1$$', + # _single_dollar_pattern_.sub(r':math:\n`\1`', + # _two_dollar_pattern_.sub(r"!!\1!!", text))) + return _two_dollar_pattern_.sub(r':math:`\1`', text) + + +def _generate_doc_string_(op_proto, + additional_args_lines=None, + skip_attrs_set=None): + """ + Generate docstring by OpProto + + Args: + op_proto (framework_pb2.OpProto): a protobuf message typed OpProto + + Returns: + str: the document string + """ + + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("OpProto should be `framework_pb2.OpProto`") + + buf = cStringIO() + buf.write(escape_math(op_proto.comment)) + buf.write('\nArgs:\n') + for each_input in op_proto.inputs: + line_begin = ' {0}'.format(_convert_(each_input.name)) + buf.write(line_begin) + buf.write(" (Tensor): ") + buf.write(escape_math(each_input.comment)) + if each_input.duplicable: + buf.write(" Duplicatable.") + if each_input.dispensable: + buf.write(" Optional.") + buf.write('\n') + + skip_attrs = OpProtoHolder.generated_op_attr_names() + # attr use_mkldnn and is_test also should not be visible to users. + skip_attrs.add("use_mkldnn") + skip_attrs.add("is_test") + skip_attrs.add("use_cudnn") + + if skip_attrs_set: + for t in skip_attrs_set: + skip_attrs.add(t) + + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue + buf.write(' ') + buf.write(each_attr.name) + buf.write(' (') + buf.write(_type_to_str_(each_attr.type)) + buf.write('): ') + buf.write(escape_math(each_attr.comment)) + buf.write('\n') + + if additional_args_lines is not None: + for line in additional_args_lines: + line = line.strip() + buf.write(' ') + buf.write(line) + buf.write('\n') + + if len(op_proto.outputs) != 0: + buf.write('\nReturns:\n') + buf.write(' ') + for each_opt in op_proto.outputs: + if not each_opt.intermediate: + break + buf.write(_convert_(each_opt.name)) + buf.write(' (Tensor): ') + buf.write(escape_math(each_opt.comment)) + + return buf.getvalue() + + +def generate_layer_fn(op_type): + """Register the Python layer for an Operator. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, mean , average etc) and + creates the operator functionality. + + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + not_intermediate_outputs = \ + [output for output in op_proto.outputs if not output.intermediate] + intermediate_outputs = \ + [output for output in op_proto.outputs if output.intermediate] + + if len(not_intermediate_outputs) != 1: + raise ValueError("Only one non intermediate output operator can be", + "automatically generated. 
{0}".format(op_type)) + + if not_intermediate_outputs[0].duplicable: + raise ValueError( + "Only non duplicable op can be automatically generated.") + + for output in intermediate_outputs: + if output.duplicable: + raise ValueError("The op can be automatically generated only when ", + "all intermediate ops are not duplicable.") + + o_name = not_intermediate_outputs[0].name + intermediate_output_names = [output.name for output in intermediate_outputs] + + def infer_and_check_dtype(op_proto, *args, **kwargs): + """ + This function performs the sanity check for dtype and + instance type. + """ + dtype = None + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + if len(val) == 0: + if len(args) == 0: + continue + val = [args[0]] + args = args[1:] + + for each in val: + if not isinstance(each, Variable): + raise ValueError("input of {0} must be variable".format( + op_type)) + + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError( + "operator {0} must input same dtype. {1} vs {2}".format( + op_type, dtype, each.dtype)) + + if dtype is None: + arg_dtype = kwargs.get("dtype") + if arg_dtype: + if not isinstance(arg_dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(arg_dtype) + else: + dtype = arg_dtype + else: + dtype = core.VarDesc.VarType.FP32 + return dtype + + def func(*args, **kwargs): + helper = LayerHelper(op_type, **kwargs) + + dtype = infer_and_check_dtype(op_proto, *args, **kwargs) + + inputs = dict() + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + if len(val) == 0 and len(args) != 0: + val = args[0] + args = args[1:] + inputs[ipt.name] = val + + outputs = dict() + out = kwargs.pop(_convert_(o_name), []) + if out: + out_var = out[0] if (isinstance(out, list) or + isinstance(out, tuple)) else out + else: + out_var = helper.create_variable_for_type_inference(dtype=dtype) + outputs[o_name] = [out_var] + for name in intermediate_output_names: + outputs[name] = [ + helper.create_variable_for_type_inference(dtype=dtype) + ] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) + return helper.append_activation(out_var) + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + +def generate_activation_fn(op_type): + """Register the Python layer for an Operator without Attribute. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, exp , tanh etc) and + creates the operator functionality. 
+ + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + + def func(x, name=None): + if paddle.in_dynamic_mode(): + op = getattr(_C_ops, op_type) + return op(x) + + if op_type not in ["abs", "exp", "square"]: + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + op_type) + else: + # abs exp square ops support dtype(int32, int64, float16, float32, float64) + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], + op_type) + + helper = LayerHelper(op_type, **locals()) + + output = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output}) + return output + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_( + op_proto, + additional_args_lines=[ + "name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`." + ]) + return func + + +def generate_inplace_fn(inplace_op_type): + """Register the Python layer for an Inplace Operator without Attribute. + + Args: + inplace_op_type: The name of the inplace operator to be created. + + This function takes in the inplace operator type (exp_ , ceil_ etc) and + creates the operator functionality. + """ + origin_op_type = inplace_op_type[:-1] + + def func(x, name=None): + if paddle.in_dynamic_mode(): + op = getattr(_C_ops, inplace_op_type) + return op(x) + warnings.warn( + "In static mode, {}() is the same as {}() and does not perform inplace operation.". + format(inplace_op_type, origin_op_type)) + return generate_activation_fn(origin_op_type)(x, name) + + func.__name__ = inplace_op_type + func.__doc__ = """ +Inplace version of ``{0}`` API, the output Tensor will be inplaced with input ``x``. +Please refer to :ref:`api_fluid_layers_{1}`. +""".format(origin_op_type, origin_op_type) + + return func + + +def templatedoc(op_type=None): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. + * ${{name}_type}: The type of ${name}. + + Returns: + Decorated function. 
+ """ + + def trim_ending_dot(msg): + return msg.rstrip('.') + + def __impl__(func): + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) + tmpl = string.Template(func.__doc__) + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.strip() + if len(line) != 0: + comment += escape_math(line) + comment += " " + elif len(comment) != 0: + comment += "\n \n " + + args = {"comment": trim_ending_dot(comment)} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) + args["{0}_type".format(output_name)] = "Variable" + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ + + +def add_sample_code(func, sample_code): + """ + Append sample code for dynamically generated functions. + + Args: + func: The function of the function to be append sample code to. + sample_code: sample code session in rst format. + """ + func.__doc__ = func.__doc__ + sample_code diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index ffd827b0eb530..27aa333b1a546 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -14,7 +14,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_type, check_variable_and_dtype -from ..fluid.layers.layer_function_generator import templatedoc +from .layer_function_generator import templatedoc from ..static import Variable from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode # TODO: define logic functions of a tensor diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 298d7af96ea57..3a2d08af88ff8 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -23,56 +23,52 @@ from paddle.common_ops_import import templatedoc from paddle.common_ops_import import dygraph_utils -from paddle.tensor import cast -from paddle.tensor.attribute import _complex_to_real_dtype +from .manipulation import cast +from .creation import _complex_to_real_dtype +from .layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn + import paddle -from paddle.static import Variable -from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ..static import Variable +from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper +from ..fluid.framework import _in_legacy_dygraph from ..framework import _varbase_creator, convert_np_dtype_to_dtype_ -from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only # TODO: define math functions # yapf: disable -from ..fluid.layers import abs # noqa: F401 -from 
..fluid.layers import acos # noqa: F401 -from ..fluid.layers import asin # noqa: F401 -from ..fluid.layers import ceil # noqa: F401 -from ..fluid.layers import ceil_ # noqa: F401 -from ..fluid.layers import cos # noqa: F401 -from ..fluid.layers import tan # noqa: F401 -from ..fluid.layers import sinh # noqa: F401 -from ..fluid.layers import cosh # noqa: F401 -from ..fluid.layers import exp # noqa: F401 -from ..fluid.layers import exp_ # noqa: F401 -from ..fluid.layers import expm1 # noqa: F401 -from ..fluid.layers import floor # noqa: F401 -from ..fluid.layers import floor_ # noqa: F401 -from ..fluid.layers import log # noqa: F401 -from ..fluid.layers import reciprocal # noqa: F401 -from ..fluid.layers import reciprocal_ # noqa: F401 -from ..fluid.layers import round # noqa: F401 -from ..fluid.layers import round_ # noqa: F401 -from ..fluid.layers import rsqrt # noqa: F401 -from ..fluid.layers import rsqrt_ # noqa: F401 -from ..fluid.layers import scale # noqa: F401 -from ..fluid.layers import square # noqa: F401 -from ..fluid.layers import stanh # noqa: F401 -from ..fluid.layers import atan # noqa: F401 -from ..fluid.layers import erf # noqa: F401 -from ..fluid.layers import sqrt # noqa: F401 -from ..fluid.layers import sqrt_ # noqa: F401 -from ..fluid.layers import sin # noqa: F401 -from ..fluid.layers import lgamma # noqa: F401 -from ..fluid.layers import asinh # noqa: F401 -from ..fluid.layers import acosh # noqa: F401 -from ..fluid.layers import atanh # noqa: F401 - -from ..fluid.layers import multiplex # noqa: F401 -from ..fluid.layers import reduce_prod +from .ops import abs # noqa: F401 +from .ops import acos # noqa: F401 +from .ops import asin # noqa: F401 +from .ops import ceil # noqa: F401 +from .ops import ceil_ # noqa: F401 +from .ops import cos # noqa: F401 +from .ops import tan # noqa: F401 +from .ops import sinh # noqa: F401 +from .ops import cosh # noqa: F401 +from .ops import exp # noqa: F401 +from .ops import exp_ # noqa: F401 +from .ops import expm1 # noqa: F401 +from .ops import floor # noqa: F401 +from .ops import floor_ # noqa: F401 +from .ops import reciprocal # noqa: F401 +from .ops import reciprocal_ # noqa: F401 +from .ops import round # noqa: F401 +from .ops import round_ # noqa: F401 +from .ops import rsqrt # noqa: F401 +from .ops import rsqrt_ # noqa: F401 +from .ops import square # noqa: F401 +from .ops import atan # noqa: F401 +from .ops import erf # noqa: F401 +from .ops import sqrt # noqa: F401 +from .ops import sqrt_ # noqa: F401 +from .ops import sin # noqa: F401 +from .ops import lgamma # noqa: F401 +from .ops import asinh # noqa: F401 +from .ops import acosh # noqa: F401 +from .ops import atanh # noqa: F401 + + from ..fluid.layers import elementwise_sub from paddle import _C_ops @@ -92,6 +88,241 @@ ] +def log(x, name=None): + r""" + Calculates the natural log of the given input tensor, element-wise. + + .. math:: + + Out = \\ln(x) + + Args: + x (Tensor): Input Tensor. Must be one of the following types: float32, float64. + name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + + + Returns: + Tensor: The natural log of the input Tensor computed element-wise. + + Examples: + + .. 
code-block:: python + + import paddle + + x = [[2,3,4], [7,8,9]] + x = paddle.to_tensor(x, dtype='float32') + res = paddle.log(x) + # [[0.693147, 1.09861, 1.38629], [1.94591, 2.07944, 2.19722]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_log(x) + if _in_legacy_dygraph(): + return _C_ops.log(x) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log") + inputs = {'X': [x]} + helper = LayerHelper('log', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) + return out + + +def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): + """ + Scale operator. + + Putting scale and bias to the input Tensor as following: + + ``bias_after_scale`` is True: + + .. math:: + Out=scale*X+bias + + ``bias_after_scale`` is False: + + .. math:: + Out=scale*(X+bias) + + Args: + x(Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. + scale(float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32. + bias(float): The bias to be put on the input. + bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. + act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: Output tensor of scale operator, with shape and data type same as input. + + Examples: + .. code-block:: python + + # scale as a float32 number + import paddle + + data = paddle.randn(shape=[2,3], dtype='float32') + res = paddle.scale(data, scale=2.0, bias=1.0) + + .. code-block:: python + + # scale with parameter scale as a Tensor + import paddle + + data = paddle.randn(shape=[2, 3], dtype='float32') + factor = paddle.to_tensor([2], dtype='float32') + res = paddle.scale(data, scale=factor, bias=1.0) + + """ + + if in_dygraph_mode(): + out = _C_ops.final_state_scale(x, scale, float(bias), bias_after_scale) + return dygraph_utils._append_activation_in_dygraph(out) + if _non_static_mode(): + _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale + out = _C_ops.scale(x, 'scale', + float(_scale), 'bias', + float(bias), 'bias_after_scale', bias_after_scale) + return dygraph_utils._append_activation_in_dygraph(out) + + check_variable_and_dtype(x, "x", [ + 'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32', + 'int64', 'uint8' + ], "scale") + inputs = {'X': [x]} + attrs = { + 'bias': float(bias), + 'bias_after_scale': bias_after_scale, + } + if isinstance(scale, Variable): + inputs['ScaleTensor'] = [scale] + else: + attrs['scale'] = float(scale) + helper = LayerHelper('scale', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs) + return helper.append_activation(out) + + +def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): + """ + stanh activation. + + .. math:: + + out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + scale_a (float, optional): The scale factor a of the input. Default is 0.67. 
+ scale_b (float, optional): The scale factor b of the output. Default is 1.7159. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = paddle.stanh(x, scale_a=0.67, scale_b=1.72) # [1.00616539, 1.49927628, 1.65933108, 1.70390463] + + """ + + if _non_static_mode(): + return _C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'stanh') + + helper = LayerHelper('stanh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, + 'scale_b': scale_b}) + return out + +def multiplex(inputs, index, name=None): + """ + + Based on the given index parameter, the OP selects a specific row from each input Tensor to construct the output Tensor. + + If the input of this OP contains :math:`m` Tensors, where :math:`I_{i}` means the i-th input Tensor, :math:`i` between :math:`[0,m)` . + + And :math:`O` means the output, where :math:`O[i]` means the i-th row of the output, then the output satisfies that :math:`O[i] = I_{index[i]}[i]` . + + For Example: + + .. code-block:: text + + Given: + + inputs = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]], + [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]], + [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]] + + index = [[3],[0],[1],[2]] + + out = [[3,0,3,4], # out[0] = inputs[index[0]][0] = inputs[3][0] = [3,0,3,4] + [0,1,3,4], # out[1] = inputs[index[1]][1] = inputs[0][1] = [0,1,3,4] + [1,2,4,2], # out[2] = inputs[index[2]][2] = inputs[1][2] = [1,2,4,2] + [2,3,3,4]] # out[3] = inputs[index[3]][3] = inputs[2][3] = [2,3,3,4] + + + Args: + inputs (list): The input Tensor list. The list elements are N-D Tensors of data types float32, float64, int32, int64. All input Tensor shapes should be the same and rank must be at least 2. + index (Tensor): Used to select some rows in the input Tensor to construct an index of the output Tensor. It is a 2-D Tensor with data type int32 or int64 and shape [M, 1], where M is the number of input Tensors. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: Output of multiplex OP, with data type being float32, float64, int32, int64. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) + img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) + inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32)) + res = paddle.multiplex(inputs, index) + print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)] + + """ + if _non_static_mode(): + return _C_ops.multiplex(index, inputs) + helper = LayerHelper('multiplex', **locals()) + + check_type(inputs, 'inputs', (list), 'multiplex') + if len(inputs) < 2: + raise ValueError( + "inputs should be a list object with at least 2 elements.") + for id, x in enumerate(inputs): + check_variable_and_dtype(x, 'input[' + str(id) + ']', + ['float32', 'float64', 'int32', 'int64'], + 'multiplex') + check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex') + + out = helper.create_variable_for_type_inference(inputs[0].dtype) + helper.append_op( + type='multiplex', + inputs={'X': inputs, + 'Ids': index}, + outputs={'Out': [out]}) + return out + @inplace_apis_in_dygraph_only def scale_(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ @@ -2973,7 +3204,38 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): if x.dtype != convert_np_dtype_to_dtype_(dtype): x = cast(x, dtype) - return reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name) + input = x + dim = axis + keep_dim = keepdim + if dim is not None and not isinstance(dim, list): + if isinstance(dim, tuple): + dim = list(dim) + elif isinstance(dim, int): + dim = [dim] + else: + raise TypeError( + "The type of axis must be int, list or tuple, but received {}". + format(type(dim))) + if in_dygraph_mode(): + return _C_ops.final_state_reduce_prod( + input, dim if dim != None and dim != [] else [0], keep_dim, True if + dim == None or dim == [] or len(dim) == len(input.shape) else False) + + helper = LayerHelper('reduce_prod', **locals()) + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod') + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_prod', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None or dim == [] or + len(dim) == len(input.shape) else False + }) + return out def sign(x, name=None): diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py new file mode 100644 index 0000000000000..9ee59c6cfd843 --- /dev/null +++ b/python/paddle/tensor/ops.py @@ -0,0 +1,532 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
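# NOTE: overview comment (a sketch of how this new module is organized). The
# op-name lists declared below are expanded into public unary APIs at import
# time via generate_layer_fn / generate_activation_fn / generate_inplace_fn
# from layer_function_generator, and add_sample_code then attaches the example
# snippets to the generated functions' docstrings.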
+ +from __future__ import print_function +import os +from .layer_function_generator import generate_layer_fn, generate_activation_fn, generate_inplace_fn, add_sample_code +from ..framework import core +from ..framework import convert_np_dtype_to_dtype_ +from ..static import Variable +from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype + +__deprecated_func_name__ = { + 'tanh_shrink': 'tanhshrink', + 'logsigmoid': 'log_sigmoid' +} + +__activations_noattr__ = [ + 'sigmoid', + 'silu', + 'logsigmoid', + 'tanh_shrink', + 'softplus', + 'softsign', + 'tanh', +] + +__unary_func__ = [ + 'exp', + 'expm1', + 'atan', + 'sqrt', + 'rsqrt', + 'abs', + 'ceil', + 'floor', + 'cos', + 'tan', + 'acos', + 'sin', + 'sinh', + 'asin', + 'cosh', + 'round', + 'reciprocal', + 'square', + 'lgamma', + 'acosh', + 'asinh', + 'atanh', +] + +__inplace_unary_func__ = [ + 'exp_', + 'sqrt_', + 'rsqrt_', + 'ceil_', + 'floor_', + 'round_', + 'reciprocal_', +] + +__all__ = [] + +for _OP in set(__all__): + globals()[_OP] = generate_layer_fn(_OP) + +# It is a hot fix in some unittest using: +# fluid.layers.scale(x=x, scale=10.0, out=out_var) +# e.g.: test_program_code.py, test_dist_train.py +globals()['_scale'] = generate_layer_fn('scale') + +globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') + +__all__ += __activations_noattr__ +__all__ += __unary_func__ +__all__ += __inplace_unary_func__ + +for _OP in set(__activations_noattr__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_activation_fn(_OP) + globals()[_OP] = _func + +for _OP in set(__unary_func__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_activation_fn(_OP) + globals()[_OP] = _func + +for _OP in set(__inplace_unary_func__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_inplace_fn(_OP) + globals()[_OP] = _func + +add_sample_code(globals()["sigmoid"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.sigmoid(x) + print(out) + # [0.40131234 0.450166 0.52497919 0.57444252] + +""") + +add_sample_code(globals()["silu"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) + print(out) + # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] + +""") + +add_sample_code(globals()["logsigmoid"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.log_sigmoid(x) + print(out) + # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] + +""") + +add_sample_code(globals()["exp"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.exp(x) + print(out) + # [0.67032005 0.81873075 1.10517092 1.34985881] + +""") + +add_sample_code(globals()["expm1"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.expm1(x) + print(out) + # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] + +""") + +add_sample_code(globals()["tanh"], r""" +Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.tanh(x) + print(out) + # [-0.37994896 -0.19737532 0.09966799 0.29131261] + +""") + +add_sample_code(globals()["atan"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atan(x) + print(out) + # [-0.38050638 -0.19739556 0.09966865 0.29145679] + +""") + +add_sample_code(globals()["tanh_shrink"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.tanhshrink(x) + print(out) + # [-0.020051, -0.00262468, 0.000332005, 0.00868739] + +""") + +add_sample_code(globals()["sqrt"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) + out = paddle.sqrt(x) + print(out) + # [0.31622777 0.4472136 0.54772256 0.63245553] + +""") + +add_sample_code(globals()["rsqrt"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) + out = paddle.rsqrt(x) + print(out) + # [3.16227766 2.23606798 1.82574186 1.58113883] + +""") + +add_sample_code(globals()["abs"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.abs(x) + print(out) + # [0.4 0.2 0.1 0.3] + +""") + +add_sample_code(globals()["ceil"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.ceil(x) + print(out) + # [-0. -0. 1. 1.] + +""") + +add_sample_code(globals()["floor"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.floor(x) + print(out) + # [-1. -1. 0. 0.] + +""") + +add_sample_code(globals()["cos"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cos(x) + print(out) + # [0.92106099 0.98006658 0.99500417 0.95533649] + +""") + +add_sample_code(globals()["tan"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.tan(x) + print(out) + # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] + +""") + +add_sample_code(globals()["acos"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.acos(x) + print(out) + # [1.98231317 1.77215425 1.47062891 1.26610367] + +""") + +add_sample_code(globals()["sin"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sin(x) + print(out) + # [-0.38941834 -0.19866933 0.09983342 0.29552021] + +""") + +add_sample_code(globals()["asin"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asin(x) + print(out) + # [-0.41151685 -0.20135792 0.10016742 0.30469265] + +""") + +add_sample_code(globals()["cosh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cosh(x) + print(out) + # [1.08107237 1.02006676 1.00500417 1.04533851] + +""") + +add_sample_code(globals()["sinh"], r""" +Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sinh(x) + print(out) + # [-0.41075233 -0.201336 0.10016675 0.30452029] + +""") + +add_sample_code(globals()["asinh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asinh(x) + print(out) + # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] + +""") + +add_sample_code(globals()["acosh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1., 3., 4., 5.]) + out = paddle.acosh(x) + print(out) + # [0. , 1.76274729, 2.06343699, 2.29243159] + +""") + +add_sample_code(globals()["atanh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atanh(x) + print(out) + # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] + +""") + +add_sample_code(globals()["round"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) + out = paddle.round(x) + print(out) + # [-1. -0. 1. 2.] + +""") + +add_sample_code(globals()["reciprocal"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.reciprocal(x) + print(out) + # [-2.5 -5. 10. 3.33333333] + +""") + +add_sample_code(globals()["square"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.square(x) + print(out) + # [0.16 0.04 0.01 0.09] + +""") + +add_sample_code(globals()["lgamma"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.lgamma(x) + print(out) + # [1.31452441, 1.76149750, 2.25271273, 1.09579802] + +""") + +add_sample_code(globals()["softplus"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.softplus(x) + print(out) + # [0.513015, 0.598139, 0.744397, 0.854355] + +""") + +add_sample_code(globals()["softsign"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.softsign(x) + print(out) + # [-0.285714, -0.166667, 0.0909091, 0.230769] + +""") + +__all__ += ['erf'] + +_erf_ = generate_layer_fn('erf') + + +def erf(x, name=None): + locals_var = locals().copy() + kwargs = dict() + for name, val in locals_var.items(): + if val is not None: + kwargs[name] = val + return _erf_(**kwargs) + + +erf.__doc__ = r""" +:strong:`Erf Operator` +For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). + +Equation: + .. math:: + out = \\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x}e^{- \\eta^{2}}d\\eta + +Args: + + x (Tensor): The input tensor, it's data type should be float32, float64. + +Returns: + + Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. + +Examples: + + .. 
code-block:: python + + import paddle + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.erf(x) + print(out) + # [-0.42839236 -0.22270259 0.11246292 0.32862676] +""" From 70036d5d8324893cbb2655faa0b10e11b4e20e97 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 8 Apr 2022 15:21:22 +0800 Subject: [PATCH 028/211] Fix libmct.cmake tar ownership change (#41516) --- cmake/external/libmct.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 92c3165fbaa90..a166e43c7b95e 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -45,7 +45,7 @@ ExternalProject_Add( PREFIX ${LIBMCT_PREFIX_DIR} DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz - && tar zxvf ${LIBMCT_NAME}.tar.gz + && tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} From 0a6fe6994afcaff7b3c25ff122ce73cbad4a1fe5 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 8 Apr 2022 16:08:52 +0800 Subject: [PATCH 029/211] [Eager]Fix segment_pool/allclose/isclose/scale API bug (#41506) * [Eager]Fix segment_pool/allclose/isclose/scale API bug * fix kernel register problem --- paddle/fluid/operators/cast_op.cu | 22 +++++++++++----------- python/paddle/incubate/tensor/math.py | 2 +- python/paddle/tensor/logic.py | 14 ++++++++++++-- python/paddle/utils/code_gen/backward.yaml | 3 ++- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index eb51215790bbc..0afe09ec028e3 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -19,15 +19,15 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; using CUDA = paddle::platform::CUDADeviceContext; -#define REGISTER_CAST_CUDA_BASE(op_name, ...) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, \ - ops::CastOpKernel>, \ - ops::CastOpKernel>, ##__VA_ARGS__); - // See [ why register transfer_dtype_op alias with cast_op? 
] in cast_op.cc -REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) +REGISTER_OP_CUDA_KERNEL(transfer_dtype, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>, + ops::CastOpKernel); diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index da6eb4e17c7fb..07dc7c1581fc4 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -222,7 +222,7 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - out = _C_ops.final_state_segment_pool(data, segment_ids, "MAX")[0] + out, tmp = _C_ops.final_state_segment_pool(data, segment_ids, "MAX") return out if _non_static_mode(): diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 27aa333b1a546..636b2ef17c6a0 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -127,7 +127,12 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_allclose(x, y, rtol, atol, equal_nan) + # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because + # C++ backend will cast it into float32 if passing float from python. + as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu') + return _C_ops.final_state_allclose(x, y, + as_tensor(rtol), + as_tensor(atol), equal_nan) if _in_legacy_dygraph(): return _C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', @@ -689,7 +694,12 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_isclose(x, y, rtol, atol, equal_nan) + # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because + # C++ backend will cast it into float32 if passing float from python. 
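# (illustrative note: float32 keeps only about 7 significant decimal digits,
#  so a tolerance such as 1e-08 narrowed to float32 on the C++ side may differ
#  slightly from the Python float64 value; wrapping it in a float64 CPU tensor,
#  as below, hands the exact value to the kernel)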
+ as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu') + return _C_ops.final_state_isclose(x, y, + as_tensor(rtol), + as_tensor(atol), equal_nan) if _in_legacy_dygraph(): return _C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3456fe3260abc..602fecc83b8f7 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1217,7 +1217,7 @@ forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true) output : Tensor(x_grad) - invoke : scale(out_grad, scale, bias, bias_after_scale) + invoke : scale(out_grad, scale, 0.0, bias_after_scale) - backward_api : scatter_grad forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out) @@ -1250,6 +1250,7 @@ param : [x] kernel : func : segment_pool_grad + data_type : x optional : summed_ids - backward_api : selu_grad From d4710dfe882eda5fb44b1a0a5d46a129f27597d0 Mon Sep 17 00:00:00 2001 From: helen88 Date: Fri, 8 Apr 2022 17:11:32 +0800 Subject: [PATCH 030/211] modify unittest of lstm forward, *test=kunlun (#41534) * modify unittest of lstm forward, *test=kunlun * modify unittest of lstm forward, *test=kunlun --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/rnn_op_xpu.cc | 5 +- .../tests/unittests/xpu/test_rnn_op_xpu.py | 52 +++++++++---------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index e83bdef327891..2b84def46520f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220402") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220408") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 2dee4e889f739..c75c24ab0abc2 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -114,6 +115,9 @@ class RnnXPUKernel : public framework::OpKernel { if (dropout_mask->numel() != output->numel()) dropout_mask->clear(); } dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant ones; + ones(dev_ctx, dropout_mask, static_cast(1)); PADDLE_ENFORCE_EQ( mode, "LSTM", @@ -190,7 +194,6 @@ class RnnXPUKernel : public framework::OpKernel { seq_len_tensor = operators::GetDataFromTensor(sequence_length); } - auto& dev_ctx = ctx.template device_context(); int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; for (int i = 0; i < num_layers; i++) { diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py index e0d208644e79e..20a3fc69fe8d2 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py @@ -46,8 +46,9 @@ def setUp(self): self.init_dtype() self.op_type = "rnn" self.place = paddle.XPUPlace(0) - self.sequence_length = np.ones( - (self.batch_size, ), dtype=np.int32) * self.seq_length + self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.num_layers = 1 + self.is_bidirec = False self.set_attrs() self.mode = "LSTM" self.is_test = False @@ -61,6 +62,10 @@ def setUp(self): high=0.1, size=(self.seq_length, self.batch_size, self.input_size)).astype(self.dtype) + input[11][1:][:] = 0 + input[10][2:][:] = 0 + input[9][3:][:] = 0 + input[8][4:][:] = 0 rnn1 = LSTM( self.input_size, @@ -126,10 +131,10 @@ def test_check_output(self): no_check_set=['Reserve', 'DropoutState']) def init_size(self): - self.seq_length = 1 - self.batch_size = 1 - self.input_size = 5 - self.hidden_size = 16 + self.seq_length = 12 + self.batch_size = 5 + self.input_size = 3 + self.hidden_size = 2 def get_weight_names(self): weight_names = [] @@ -142,38 +147,18 @@ def get_weight_names(self): return weight_names def set_attrs(self): - self.num_layers = 1 - self.is_bidirec = False + pass class TestRNNOp1(TestRNNOp): - def init_size(self): - self.seq_length = 2 - self.batch_size = 4 - self.input_size = 10 - self.hidden_size = 32 - def set_attrs(self): - self.num_layers = 1 - self.is_bidirec = False + self.sequence_length = None class TestRNNOp2(TestRNNOp): - def init_size(self): - self.seq_length = 5 - self.batch_size = 16 - self.input_size = 30 - self.hidden_size = 64 - def set_attrs(self): self.num_layers = 1 self.is_bidirec = True class TestRNNOp3(TestRNNOp): - def init_size(self): - self.seq_length = 10 - self.batch_size = 64 - self.input_size = 50 - self.hidden_size = 64 - def set_attrs(self): self.num_layers = 2 self.is_bidirec = False @@ -188,6 +173,17 @@ def set_attrs(self): self.num_layers = 2 self.is_bidirec = True + class TestRNNOp6(TestRNNOp): + def set_attrs(self): + self.num_layers = 2 + self.is_bidirec = True + self.sequence_length = None + + class TestRNNOp7(TestRNNOp): + def set_attrs(self): + self.num_layers = 3 + self.is_bidirec = True + support_types = get_xpu_op_support_types('rnn') for stype in support_types: From 09203e46f754e2042b009c7c8eca1a5bb546efab Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Fri, 8 Apr 2022 18:00:57 +0800 Subject: [PATCH 031/211] Fix RNN OP 
multi-threads predict bug (#41529) --- paddle/phi/kernels/cpu/rnn_kernel.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 4d3976b0aba68..cae97eb076453 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -832,11 +832,13 @@ void RnnKernel(const Context& dev_ctx, DenseTensor* dropout_state, std::vector state, DenseTensor* reserve) { - if (dropout_state->IsInitialized()) { - if (dropout_state->numel() != out->numel()) dropout_state->clear(); + if (!is_test) { + if (dropout_state->IsInitialized()) { + if (dropout_state->numel() != out->numel()) dropout_state->clear(); + } + const auto& out_dim = out->dims(); + Full(dev_ctx, {out_dim.Get(), out_dim.size()}, 1, dropout_state); } - const auto& out_dim = out->dims(); - Full(dev_ctx, {out_dim.Get(), out_dim.size()}, 1, dropout_state); // init the output and allocate the memory dev_ctx.template Alloc(out); From 33abfbe6da4cc4fb50a14c56398317c2f3590606 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 8 Apr 2022 18:01:45 +0800 Subject: [PATCH 032/211] [Eager] Remove elementwise add in conv (#41515) * remove elementwise add in conv * use reshape --- python/paddle/nn/functional/conv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 086ae78919454..84aadbbac649b 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -127,8 +127,12 @@ def _conv_nd(x, x, weight, stride, padding, padding_algorithm, groups, dilation, data_format, False, -1, False) if bias is not None: - out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) - return out + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) else: return pre_bias if in_dynamic_mode(): From fcccb3f21ca88a3c196a9405378b7aa77f092b98 Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Fri, 8 Apr 2022 18:59:18 +0800 Subject: [PATCH 033/211] fix test_autograd_functional_dynamic random timeout (#41457) --- python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 1f69abac01ac6..46af5509d244b 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -6,5 +6,5 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach(TEST_OP) -set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 100) -set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 100) +set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160) +set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) From ab137a84a8fb2fcab4d361391e868b0db1aff7d7 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 8 Apr 2022 20:03:18 +0800 Subject: [PATCH 034/211] update (#41309) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt 
b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2e4259d2085c5..3f640a73a55c5 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1153,7 +1153,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 300) + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200) From c2e12949f08205404d328317e67918a3ef5923f7 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Fri, 8 Apr 2022 20:38:41 +0800 Subject: [PATCH 035/211] fix running error for ipu (#41481) --- paddle/phi/common/backend.h | 7 +++++++ paddle/phi/core/compat/convert_utils.cc | 2 ++ 2 files changed, 9 insertions(+) diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index a9e12f5d81ed0..5543bee144b3b 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -55,6 +55,8 @@ enum class Backend : uint8_t { // paddle kernel primitives backend KPS, + IPU, + // end of backend types NUM_BACKENDS, @@ -121,6 +123,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::KPS: os << "KPS"; break; + case Backend::IPU: + os << "IPU"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -155,6 +160,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { return Backend::KPS; + } else if (s == std::string("IPU")) { + return Backend::IPU; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index c08dfa64c7f1b..43febb2ac0430 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -38,6 +38,8 @@ Backend TransToPhiBackend(const phi::Place& place) { return Backend::XPU; } else if (allocation_type == phi::AllocationType::NPU) { return Backend::NPU; + } else if (allocation_type == phi::AllocationType::IPU) { + return Backend::IPU; } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + From 04a4bdf8822688f8290bbd27d936d59e66fb2f9e Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Fri, 8 Apr 2022 20:46:11 +0800 Subject: [PATCH 036/211] fix group_norm (#41531) fix group_norm vectorized address misalignment --- paddle/fluid/operators/group_norm_op.cu | 39 ++----------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index c08f1920205da..c93910bde5a2c 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -419,23 +419,6 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, } } -template -__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, - T* ds, T* db) { - int i = blockIdx.x; - AccT ds_sum = static_cast(0); - AccT db_sum = static_cast(0); - x += i * imsize; - const int input_offset = 
((uint64_t)x) % ALIGN_BYTES / sizeof(T); - - phi::Array ins; - ins[0] = x; - ins[1] = dy; - ThreadReduce(ins, imsize, input_offset, &db_sum, - &ds_sum); - ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); -} - template __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, T* ds, T* db) { @@ -622,25 +605,9 @@ class GroupNormGradKernel int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; if (data_layout == DataLayout::kNCHW) { - using AccT = typename details::MPTypeTrait::Type; - constexpr int vec_size = sizeof(float4) / sizeof(T); - const int max_num_threads = 1024; - int max_block_size = std::min(imsize / vec_size, max_num_threads); - int block_size_nchw = 1; - while (block_size_nchw < max_block_size) { - block_size_nchw *= 2; - } - block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); - dim3 blocks(block_size_nchw); - if (imsize < vec_size * block_size_nchw) { - ScalarGetDsDbCUDAKernel< - T><<>>( - imsize, x_data, dy_data, ds_data, db_data); - } else { - VectorizedGetDsDbCUDAKernel< - T, AccT, vec_size><<>>( - imsize, x_data, dy_data, ds_data, db_data); - } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); if (d_scale || d_bias) { const int block = 256; From 330582e24097f68525d4fb56ac76d9c29a0a0068 Mon Sep 17 00:00:00 2001 From: whs Date: Fri, 8 Apr 2022 22:11:17 +0800 Subject: [PATCH 037/211] Fix fake quant cuda kernel (#41305) --- paddle/fluid/operators/fake_quantize_op.cu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index d85d47f546131..ae448b7ff2c8b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -305,7 +305,7 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { T s = scale[(i / quant_stride) % nScale]; - T inv_s = 1.0 / s; + T inv_s = inverse(s); T x = in[i]; T v = x > s ? s : x; v = v < -s ? 
-s : v; From b3e7973157ecd827c2ec467fcd685b254ed73222 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Sat, 9 Apr 2022 07:56:04 +0800 Subject: [PATCH 038/211] [fleet executor] Add sink interceptor and test (#41497) --- .../distributed/fleet_executor/CMakeLists.txt | 3 +- .../distributed/fleet_executor/carrier.cc | 1 + .../fleet_executor/sink_interceptor.cc | 65 ++++++++++++++ .../fleet_executor/sink_interceptor.h | 41 +++++++++ .../fleet_executor/test/CMakeLists.txt | 3 + .../test/sink_interceptor_test.cc | 89 +++++++++++++++++++ 6 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/distributed/fleet_executor/sink_interceptor.cc create mode 100644 paddle/fluid/distributed/fleet_executor/sink_interceptor.h create mode 100644 paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 4a2dfcb554ad3..977a125627ba5 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -13,7 +13,7 @@ endif() cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc - compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc + compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc sink_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) @@ -26,6 +26,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 358393d97f071..2d2a3b688fefe 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -31,6 +31,7 @@ namespace distributed { USE_INTERCEPTOR(Source); USE_INTERCEPTOR(Compute); USE_INTERCEPTOR(Amplifier); +USE_INTERCEPTOR(Sink); void Carrier::Init( int64_t rank, diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc new file mode 100644 index 0000000000000..af707c28acd9e --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/sink_interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +SinkInterceptor::SinkInterceptor(int64_t interceptor_id, TaskNode* node) + : Interceptor(interceptor_id, node), max_run_times_(node->max_run_times()) { + // prepare the upstream running status + for (const auto& up : node->upstream()) { + upstream_step_.emplace(up.first, 0); + } + RegisterMsgHandle([this](const InterceptorMessage& msg) { Run(msg); }); +} + +void SinkInterceptor::StopCarrierIfComplete() { + bool flag = true; + for (const auto& up : upstream_step_) { + flag = flag & (up.second == max_run_times_); + } + if (flag) { + VLOG(3) << "Sink Interceptor is stopping carrier"; + StopCarrier(); + for (const auto& up : upstream_step_) { + upstream_step_.at(up.first) = 0; + } + } +} + +void SinkInterceptor::ReplyCompletedToUpStream(int64_t upstream_id) { + int64_t micro_step = upstream_step_.at(upstream_id); + int64_t scope_idx = micro_step % max_run_times_; + InterceptorMessage msg; + msg.set_message_type(DATA_IS_USELESS); + msg.set_scope_idx(scope_idx); + Send(upstream_id, msg); + upstream_step_.at(upstream_id) = micro_step + 1; + if (micro_step == max_run_times_ - 1) { + StopCarrierIfComplete(); + } +} + +void SinkInterceptor::Run(const InterceptorMessage& msg) { + if (msg.message_type() == DATA_IS_READY) { + ReplyCompletedToUpStream(msg.src_id()); + } +} + +REGISTER_INTERCEPTOR(Sink, SinkInterceptor); +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h new file mode 100644 index 0000000000000..cb1d698a78526 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" + +namespace paddle { +namespace distributed { + +/* + * Sink interceptor + * There is only one sink in the runtime graph + * Take charge of: + * 1. record the num of micro-step + * 2. 
check whether to notify carrier the current step is finished + */ +class SinkInterceptor : public Interceptor { + public: + SinkInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void ReplyCompletedToUpStream(int64_t up_id); + void Run(const InterceptorMessage& msg); + void StopCarrierIfComplete(); + int64_t max_run_times_; + // upstream_id->cur_step + std::map upstream_step_; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index 33c08acd4498d..e0db8a261b585 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -7,6 +7,9 @@ cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_exe set_source_files_properties(source_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(source_interceptor_test SRCS source_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties(sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(sink_interceptor_test SRCS sink_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) + set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc new file mode 100644 index 0000000000000..6b1a555e987a3 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +class FakeInterceptor : public Interceptor { + public: + FakeInterceptor(int64_t interceptor_id, TaskNode* node) + : Interceptor(interceptor_id, node) { + RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); }); + } + + void NOP(const InterceptorMessage& msg) { + if (msg.message_type() == DATA_IS_READY) { + std::cout << "FakeInterceptor run in scope " << msg.scope_idx() + << std::endl; + InterceptorMessage reply; + reply.set_message_type(DATA_IS_USELESS); + Send(-1, reply); + InterceptorMessage ready; + ready.set_message_type(DATA_IS_READY); + Send(-2, ready); + } else if (msg.message_type() == DATA_IS_USELESS) { + std::cout << "FakeInterceptor remove result in scope " << msg.scope_idx() + << std::endl; + } + } + + private: + int64_t step_; +}; + +TEST(SourceInterceptor, Source) { + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{-1, 0}, {0, 0}, {-2, 0}}); + + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* source = new TaskNode(0, -1, 0, 3, 0); // role, rank, task_id + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* sink = new TaskNode(0, -2, 0, 3, 0); // role, rank, task_id + + source->AddDownstreamTask(0, 1); + node_a->AddUpstreamTask(-1, 1); + node_a->AddDownstreamTask(-2, 1); + sink->AddUpstreamTask(0, 1); + carrier->SetInterceptor(-1, InterceptorFactory::Create("Source", -1, source)); + carrier->SetInterceptor(0, std::make_unique(0, node_a)); + carrier->SetInterceptor(-2, InterceptorFactory::Create("Sink", -2, sink)); + + // start + InterceptorMessage msg; + msg.set_message_type(START); + msg.set_dst_id(-1); + carrier->EnqueueInterceptorMessage(msg); + + carrier->Wait(); + carrier->Release(); +} + +} // namespace distributed +} // namespace paddle From f581f5bf69b929712c7df8513c399beda114005e Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 9 Apr 2022 09:46:11 +0800 Subject: [PATCH 039/211] [new-exec] fix bug that no thread is waked up when adding task to threadpool (#41567) * fix bug that no thread is waked up when adding task to threadpool * fix typo --- .../new_executor/interpretercore_util.cc | 1 + .../new_executor/workqueue/event_count.h | 7 ++++++- .../workqueue/nonblocking_threadpool.h | 18 ++++++++++++------ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index a704411f3bb71..59703332efe95 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -39,6 +39,7 @@ constexpr size_t kPrepareWorkQueueIdx = 2; void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, std::function fn) { + VLOG(4) << "Add task: " << static_cast(op_func_type) << " "; // NOTE(zhiqiu): use thhe second queue of size of, so only one thread is used. 
if (FLAGS_new_executor_sequential_run) { VLOG(4) << "FLAGS_new_executor_sequential_run:" diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index 893c6d2d54ac7..7a826c3990713 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -54,6 +54,7 @@ #include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { @@ -255,6 +256,7 @@ class EventCount { std::unique_lock lock(w->mu); while (w->state != Waiter::kSignaled) { w->state = Waiter::kWaiting; + VLOG(10) << "Go to wait " << &(w->cv); w->cv.wait(lock); } } @@ -270,7 +272,10 @@ class EventCount { w->state = Waiter::kSignaled; } // Avoid notifying if it wasn't waiting. - if (state == Waiter::kWaiting) w->cv.notify_one(); + if (state == Waiter::kWaiting) { + VLOG(10) << "Go to notify " << &(w->cv); + w->cv.notify_one(); + } } } }; diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 384498584c66a..44953fa192e27 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -53,7 +53,6 @@ class ThreadPoolTempl { all_coprimes_.reserve(num_threads_); for (int i = 1; i <= num_threads_; ++i) { all_coprimes_.emplace_back(); - all_coprimes_.back().push_back(i); ComputeCoprimes(i, &(all_coprimes_.back())); } for (int i = 0; i < num_threads_; i++) { @@ -130,8 +129,11 @@ class ThreadPoolTempl { // this. We expect that such scenario is prevented by program, that is, // this is kept alive while any threads can potentially be in Schedule. if (!t.f) { - if (num_tasks > num_threads_ - blocked_.load(std::memory_order_relaxed)) { + if (num_tasks > num_threads_ - blocked_) { + VLOG(6) << "Add task, Notify"; ec_.Notify(false); + } else { + VLOG(6) << "Add task, No Notify"; } } else { num_tasks_.fetch_sub(1, std::memory_order_relaxed); @@ -376,17 +378,21 @@ class ThreadPoolTempl { ec_.CancelWait(); return false; } + + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // that's we are done. + blocked_++; + // Now do a reliable emptiness check. int victim = NonEmptyQueueIndex(); if (victim != -1) { ec_.CancelWait(); *t = thread_data_[victim].queue.PopBack(); + blocked_--; return true; } - // Number of blocked threads is used as termination condition. - // If we are shutting down and all worker threads blocked without work, - // that's we are done. - blocked_++; + if (done_ && blocked_ == static_cast(num_threads_)) { ec_.CancelWait(); // Almost done, but need to re-check queues. 
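[Editor's note - illustrative material, not part of any patch above or below.] PATCH 039 adjusts two things in the new executor's work queue: AddTask notifies a worker only when pending tasks outnumber the non-blocked workers, and a worker now counts itself as blocked *before* the final emptiness re-check, so a task added in that window cannot be missed. The sketch below is a deliberately simplified illustration of that ordering using a plain std::mutex / std::condition_variable pool; the real code uses the lock-free Eigen-style EventCount shown in the diff, and SimplePool and all of its member names are made-up for this sketch, not Paddle or Eigen APIs.

#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class SimplePool {
 public:
  explicit SimplePool(int num_threads) : num_threads_(num_threads) {
    for (int i = 0; i < num_threads_; ++i) {
      workers_.emplace_back([this] { WorkLoop(); });
    }
  }

  ~SimplePool() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      done_ = true;
    }
    cv_.notify_all();
    for (auto& t : workers_) t.join();
  }

  void AddTask(std::function<void()> fn) {
    std::lock_guard<std::mutex> lock(mu_);
    tasks_.push(std::move(fn));
    // Mirrors the notify heuristic in AddTask: skip the wake-up only when
    // there is provably a non-blocked worker left that will find the task
    // while re-checking the queue on its own.
    if (static_cast<int>(tasks_.size()) > num_threads_ - blocked_) {
      cv_.notify_one();
    }
  }

 private:
  void WorkLoop() {
    std::unique_lock<std::mutex> lock(mu_);
    while (true) {
      // Register as blocked *before* the final emptiness re-check. This is the
      // ordering the patch establishes in WaitForWork: a concurrent AddTask
      // either sees blocked_ already incremented (and notifies) or this worker
      // sees the freshly pushed task.
      ++blocked_;
      if (tasks_.empty() && !done_) {
        cv_.wait(lock, [this] { return done_ || !tasks_.empty(); });
      }
      --blocked_;
      if (tasks_.empty()) {
        if (done_) return;  // shutting down and nothing left to run
        continue;
      }
      std::function<void()> fn = std::move(tasks_.front());
      tasks_.pop();
      lock.unlock();
      fn();  // run the task outside the lock
      lock.lock();
    }
  }

  const int num_threads_;
  int blocked_ = 0;  // workers waiting (or about to wait) on cv_
  bool done_ = false;
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  std::vector<std::thread> workers_;
};

int main() {
  SimplePool pool(2);
  for (int i = 0; i < 4; ++i) {
    pool.AddTask([i] { std::printf("task %d done\n", i); });
  }
  return 0;  // ~SimplePool drains the remaining tasks and joins the workers
}

In this mutex-based sketch the predicate wait already rules out a lost wake-up; in the real lock-free pool the same guarantee depends on the blocked_-before-re-check ordering that the patch restores, which is why the original code could leave a task queued with every worker asleep.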
From 0e048fc601ee33ea13303905b7dca3b0a58fb168 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Sat, 9 Apr 2022 10:59:32 +0800 Subject: [PATCH 040/211] fix cross entropy (#41541) --- python/paddle/nn/functional/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 593cea2d2cf64..fb9c22edc65ed 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1905,7 +1905,7 @@ def cross_entropy(input, if reduction == "sum": return paddle.sum(out, name=name) elif reduction == "mean": - if ignore_index != -100: + if ignore_index >= 0: out_sum = paddle.sum(out, name=name) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index From be11648af30db3599f930db58b57caf146a46a87 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Sat, 9 Apr 2022 11:11:37 +0800 Subject: [PATCH 041/211] fix pylayer mem leak, test=develop (#41559) --- paddle/fluid/eager/pylayer/py_layer_node.cc | 6 ++++++ paddle/fluid/pybind/eager_py_layer.cc | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index 5008e958c5f11..42036a28cfa15 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -154,6 +154,12 @@ operator()( } } + Py_XDECREF(backward_fn); + Py_XDECREF(backward_args); + if (!PyTuple_Check(outputs)) { + Py_XDECREF(outputs_tuple); + } + return grad_out; } } // namespace egr diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index cade856b3607a..605056e7af2b5 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -231,6 +231,10 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, auto outputs = PyObject_Call(forward_fn, forward_args, kwargs); egr::Controller::Instance().SetHasGrad(trace_backward); if (!outputs) { + Py_XDECREF(forward_args); + Py_XDECREF(kwargs_value_list); + Py_XDECREF(backward_function); + Py_XDECREF(forward_fn); return nullptr; } @@ -367,6 +371,14 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, VLOG(6) << "PyLayer construct backward node finish..."; } + if (!PyTuple_Check(outputs)) { + Py_XDECREF(outputs_tuple); + } + Py_XDECREF(forward_args); + Py_XDECREF(kwargs_value_list); + Py_XDECREF(backward_function); + Py_XDECREF(forward_fn); + return outputs; EAGER_CATCH_AND_THROW_RETURN_NULL } From 9cb2287cffe87eb4ba1cc880f87e52937ddd4d4f Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Sat, 9 Apr 2022 11:13:04 +0800 Subject: [PATCH 042/211] fix_ci_problem3 (#41484) * fix_ci_problem3 * support windows no default error --- paddle/fluid/eager/backward.cc | 5 +- .../unittests/test_graph_khop_sampler.py | 29 ++++-- .../tests/unittests/test_switch_autotune.py | 28 +++++- .../fluid/tests/unittests/test_zeropad2d.py | 43 +++++++-- python/paddle/nn/functional/common.py | 94 +++++++++++-------- 5 files changed, 141 insertions(+), 58 deletions(-) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index d5397e20e7d68..be425cf91bdef 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -22,10 +22,10 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include 
"paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" namespace egr { @@ -799,6 +799,7 @@ void Backward( paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::Operator, 1); RunBackward(tensors, grad_tensors, retain_graph); + phi::autotune::AutoTuneStatus::Instance().Update(); } std::vector Grad( diff --git a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py index b8071222ac772..6e6175d669515 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py @@ -46,7 +46,7 @@ def setUp(self): self.sample_sizes = [5, 5] self.dst_src_dict = dst_src_dict - def test_sample_result(self): + def func_sample_result(self): paddle.disable_static() row = paddle.to_tensor(self.row) colptr = paddle.to_tensor(self.colptr) @@ -79,13 +79,25 @@ def test_sample_result(self): # Ensure the correct sample neighbors. self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) - def test_uva_sample_result(self): + def test_sample_result(self): + with fluid.framework._test_eager_guard(): + self.func_sample_result() + self.func_sample_result() + + def func_uva_sample_result(self): paddle.disable_static() if paddle.fluid.core.is_compiled_with_cuda(): - row = paddle.fluid.core.to_uva_tensor( - self.row.astype(self.row.dtype)) - sorted_eid = paddle.fluid.core.to_uva_tensor( - self.sorted_eid.astype(self.sorted_eid.dtype)) + row = None + if fluid.framework.in_dygraph_mode(): + row = paddle.fluid.core.eager.to_uva_tensor( + self.row.astype(self.row.dtype), 0) + sorted_eid = paddle.fluid.core.eager.to_uva_tensor( + self.sorted_eid.astype(self.sorted_eid.dtype), 0) + else: + row = paddle.fluid.core.to_uva_tensor( + self.row.astype(self.row.dtype)) + sorted_eid = paddle.fluid.core.to_uva_tensor( + self.sorted_eid.astype(self.sorted_eid.dtype)) colptr = paddle.to_tensor(self.colptr) nodes = paddle.to_tensor(self.nodes) @@ -114,6 +126,11 @@ def test_uva_sample_result(self): in_neighbors = np.isin(edge_src_n.numpy(), self.dst_src_dict[n]) self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) + def test_uva_sample_result(self): + with fluid.framework._test_eager_guard(): + self.func_uva_sample_result() + self.func_uva_sample_result() + def test_sample_result_static_with_eids(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 08cf120a0366e..9fad1eeb5c247 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -87,12 +87,22 @@ def run_program(self, enable_autotune): } self.check_status(expected_res) - def test_enable_autotune(self): + def func_enable_autotune(self): self.run_program(enable_autotune=True) - def test_disable_autotune(self): + def test_enable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_enable_autotune() + self.func_enable_autotune() + + def func_disable_autotune(self): self.run_program(enable_autotune=False) + def test_disable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_disable_autotune() + self.func_disable_autotune() + class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): @@ 
-136,12 +146,22 @@ def run_program(self, enable_autotune): self.check_status(expected_res) paddle.disable_static() - def test_enable_autotune(self): + def func_enable_autotune(self): self.run_program(enable_autotune=True) - def test_disable_autotune(self): + def test_enable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_enable_autotune() + self.func_enable_autotune() + + def func_disable_autotune(self): self.run_program(enable_autotune=False) + def test_disable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_disable_autotune() + self.func_disable_autotune() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_zeropad2d.py b/python/paddle/fluid/tests/unittests/test_zeropad2d.py index 2849caf17c62d..e2913097ae1b1 100644 --- a/python/paddle/fluid/tests/unittests/test_zeropad2d.py +++ b/python/paddle/fluid/tests/unittests/test_zeropad2d.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle from paddle import to_tensor from paddle.nn.functional import zeropad2d from paddle.nn import ZeroPad2D @@ -33,7 +34,7 @@ def setUp(self): self.shape = [4, 3, 224, 224] self.unsupport_dtypes = ['bool', 'int8'] - def test_unsupport_dtypes(self): + def func_unsupport_dtypes(self): """ test unsupport dtypes. """ @@ -43,6 +44,11 @@ def test_unsupport_dtypes(self): x_tensor = to_tensor(x).astype(dtype) self.assertRaises(TypeError, zeropad2d, x=x_tensor, padding=pad) + def test_unsupport_dtypes(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_unsupport_dtypes() + self.func_unsupport_dtypes() + class TestZeroPad2dAPI(unittest.TestCase): """ @@ -56,7 +62,7 @@ def setUp(self): self.shape = [4, 3, 224, 224] self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] - def test_support_dtypes(self): + def func_support_dtypes(self): """ test support types """ @@ -69,7 +75,12 @@ def test_support_dtypes(self): ret_res = zeropad2d(x_tensor, [pad, pad, pad, pad]).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad2(self): + def test_support_dtypes(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_dtypes() + self.func_support_dtypes() + + def func_support_pad2(self): """ test the type of 'pad' is list. """ @@ -82,7 +93,12 @@ def test_support_pad2(self): ret_res = zeropad2d(x_tensor, pad).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad3(self): + def test_support_pad2(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad2() + self.func_support_pad2() + + def func_support_pad3(self): """ test the type of 'pad' is tuple. """ @@ -95,7 +111,12 @@ def test_support_pad3(self): ret_res = zeropad2d(x_tensor, pad).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad4(self): + def test_support_pad3(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad3() + self.func_support_pad3() + + def func_support_pad4(self): """ test the type of 'pad' is paddle.Tensor. 
""" @@ -109,6 +130,11 @@ def test_support_pad4(self): ret_res = zeropad2d(x_tensor, pad_tensor).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) + def test_support_pad4(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad4() + self.func_support_pad4() + class TestZeroPad2DLayer(unittest.TestCase): """ @@ -124,12 +150,17 @@ def setUp(self): [[0, 0], [0, 0], [self.pad[2], self.pad[3]], [self.pad[0], self.pad[1]]]) - def test_layer(self): + def func_layer(self): self.assertTrue( np.allclose( zeropad2d(to_tensor(self.x), self.pad).numpy(), self.padLayer(to_tensor(self.x)))) + def test_layer(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_layer() + self.func_layer() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5cbd66b7832d8..287dc7d67def8 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1356,29 +1356,31 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): unsqueezed_dim = [1] x = unsqueeze(x, axis=unsqueezed_dim) - if in_dynamic_mode(): + if in_dygraph_mode(): if isinstance(pad, Variable): - pad = pad.numpy() + pad = pad.numpy().tolist() + out = _C_ops.final_state_pad3d(x, pad, mode, value, data_format) + else: if _in_legacy_dygraph(): + if isinstance(pad, Variable): + pad = pad.numpy().tolist() out = _C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", value, "data_format", data_format, "name", name) else: - out = _C_ops.final_state_pad3d(x, pad, mode, value, data_format) - else: - attrs = {'mode': mode, 'value': value, 'data_format': data_format} - inputs = {'X': [x]} - if isinstance(pad, Variable): - inputs['Paddings'] = [pad] - attrs['paddings'] = [] - else: - attrs['paddings'] = pad + attrs = {'mode': mode, 'value': value, 'data_format': data_format} + inputs = {'X': [x]} + if isinstance(pad, Variable): + inputs['Paddings'] = [pad] + attrs['paddings'] = [] + else: + attrs['paddings'] = pad - helper = LayerHelper('pad3d', **locals()) + helper = LayerHelper('pad3d', **locals()) - dtype = helper.input_dtype(input_param_name='input') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs) + dtype = helper.input_dtype(input_param_name='input') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs) if len(unsqueezed_dim) != 0: out = squeeze(out, axis=unsqueezed_dim) @@ -1531,38 +1533,50 @@ def linear(x, weight, bias=None, name=None): # [0.9440598 0.9440598 0.9440598 0.9440598 ] # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ - if in_dynamic_mode(): - pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', - False) + if in_dygraph_mode(): + pre_bias = _C_ops.final_state_matmul(x, weight, False, False) if bias is None: return pre_bias - return _C_ops.elementwise_add(pre_bias, bias) + return _C_ops.final_state_add(pre_bias, bias) else: - helper = LayerHelper('linear', **locals()) - dtype = x.dtype + if _in_legacy_dygraph(): + pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', + False) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'linear') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + if bias is None: + return pre_bias - inputs = {'X': [x], 'Y': [weight]} - attrs = {'trans_x': False, 'trans_y': False} - tmp = 
helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='matmul_v2', inputs=inputs, outputs={'Out': tmp}, attrs=attrs) - if bias is not None: - res = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='elementwise_add', - inputs={'X': [tmp], - 'Y': [bias]}, - outputs={'Out': [res]}, - attrs={'axis': len(x.shape) - 1}) + return _C_ops.elementwise_add(pre_bias, bias) else: - res = tmp - return res + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = {'trans_x': False, 'trans_y': False} + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': tmp}, + attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], + 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): From e1792a31583a03b28536f4a31d96f71ab7d9fb46 Mon Sep 17 00:00:00 2001 From: chenjian Date: Sat, 9 Apr 2022 12:07:22 +0800 Subject: [PATCH 043/211] Add get profiler from config (#41532) * no * maintain old profiler * add get profiler from serialization config * add unit test * improve coverage * fix * Revert "improve coverage" This reverts commit 4a980bfda48adadee551d0e1c5740bc5b7389200. * fix unit * fix * fix --- .../fluid/tests/unittests/test_newprofiler.py | 141 ++++++++++++++++++ python/paddle/profiler/profiler.py | 72 +++++++++ python/paddle/profiler/profiler_statistic.py | 10 ++ 3 files changed, 223 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 0088687b12563..ac2b205e61128 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import tempfile import paddle import paddle.profiler as profiler @@ -138,6 +139,146 @@ def test_nvprof(self): y = x / 2.0 +class TestGetProfiler(unittest.TestCase): + def test_getprofiler(self): + config_content = ''' + { + "targets": ["CPU"], + "scheduler": [3,4], + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "dir_name": "testdebug/" + } + } + }, + "timer_only": false + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + with profiler: + for i in range(5): + y = x / 2.0 + ones_like_y = paddle.ones_like(y) + profiler.step() + + # below tests are just for coverage, wrong config + # test use_direct + config_content = ''' + { + "targets": ["Cpu", "Gpu"], + "scheduler": { + "make_scheduler":{ + "module": "paddle.profiler", + "use_direct": true, + "args": [], + "kwargs": {} + } + }, + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler1", + "use_direct": true, + "args": [], + "kwargs": { + } + } + }, + "timer_only": false + } + ''' + 
filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + try: + profiler = profiler.get_profiler(filehandle.name) + except: + pass + + # test scheduler + config_content = ''' + { + "targets": ["Cpu", "Gpu"], + "scheduler": { + "make_scheduler":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "closed": 1, + "ready": 1, + "record": 2 + } + } + }, + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler", + "use_direct": true, + "args": [], + "kwargs": { + } + } + }, + "timer_only": false + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + + # test exception + config_content = ''' + { + "targets": [1], + "scheduler": { + "make_scheduler1":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "closed": 1, + "ready": 1, + "record": 2 + } + } + }, + "on_trace_ready": { + "export_chrome_tracing1":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "dir_name": "testdebug/" + } + } + }, + "timer_only": 1 + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + # test path error + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler('nopath.json') + + class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index c1c4f4ff8c13c..2fae583397a8e 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -18,6 +18,8 @@ from enum import Enum from typing import Any, Callable, Iterable, Optional, Union from warnings import warn +import importlib +import json import paddle from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, @@ -741,3 +743,73 @@ def summary(self, op_detail=op_detail, thread_sep=thread_sep, time_unit=time_unit)) + + +def get_profiler(config_path): + try: + with open(config_path, 'r') as filehandle: + config_dict = json.load(filehandle) + except Exception as e: + print('Load config file for profiler error: {}'.format(e)) + print('Use default parameters instead.') + return Profiler() + translated_config_dict = {} + if "targets" in config_dict: + try: + translated_config_dict['targets'] = [] + for target in config_dict['targets']: + if target.lower() == "cpu": + translated_config_dict['targets'].append(ProfilerTarget.CPU) + elif target.lower() == 'gpu': + translated_config_dict['targets'].append(ProfilerTarget.GPU) + except: + print('Set targets parameter error, use default parameter instead.') + translated_config_dict['targets'] = None + if "scheduler" in config_dict: + try: + if isinstance(config_dict['scheduler'], dict): + for key, value in config_dict['scheduler'].items(): + module_path = value['module'] + use_direct = value['use_direct'] + module = importlib.import_module(module_path) + method = getattr(module, key) + if not use_direct: + translated_config_dict['scheduler'] = method( + *value['args'], **value['kwargs']) + else: + translated_config_dict['scheduler'] = method + else: + translated_config_dict['scheduler'] = [ + 
config_dict['scheduler'][0], config_dict['scheduler'][1] + ] + + except: + print( + 'Set scheduler parameter error, use default parameter instead.') + translated_config_dict['scheduler'] = None + if "on_trace_ready" in config_dict: + try: + if isinstance(config_dict['on_trace_ready'], dict): + for key, value in config_dict['on_trace_ready'].items(): + module_path = value['module'] + use_direct = value['use_direct'] + module = importlib.import_module(module_path) + method = getattr(module, key) + if not use_direct: + translated_config_dict['on_trace_ready'] = method( + *value['args'], **value['kwargs']) + else: + translated_config_dict['on_trace_ready'] = method + except: + print( + 'Set on_trace_ready parameter error, use default parameter instead.' + ) + translated_config_dict['on_trace_ready'] = None + if "timer_only" in config_dict: + if isinstance(config_dict['timer_only'], bool): + translated_config_dict['timer_only'] = config_dict['timer_only'] + else: + print( + 'Set timer_only parameter error, use default parameter instead.') + + return Profiler(**translated_config_dict) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 5fed51476132e..e4d4ff8c183bc 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -743,6 +743,16 @@ def format_ratio(ratio, indent=0): TracerEventType. Communication] = statistic_data.distributed_summary.cpu_calls + for event_type in [ + TracerEventType.Dataloader, TracerEventType.Forward, + TracerEventType.Backward, TracerEventType.Optimization + ]: + event_type_name = str(event_type).split('.')[1] + if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items: + cpu_call_times[ + event_type] = statistic_data.event_summary.model_perspective_items[ + event_type_name].call + gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( ): From b937cdc51794ee5112f9ec948c4518b9931b72c9 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Sat, 9 Apr 2022 13:19:42 +0800 Subject: [PATCH 044/211] Autotune the workspace_size_limit in conv. (#40338) * Using the maximum workspace_size of all alogirhms to limit the workspace size in exhaustive search mode. * Use the system cudaMalloc and cudaFree to allocate workspace during searching. * Enable switch of two kind of workspace setting methods. 
Co-authored-by: Liu Yiqun --- paddle/fluid/framework/conv_search_cache.h | 1 - paddle/fluid/operators/conv_base_helper.h | 99 +++ paddle/fluid/operators/conv_cudnn_helper.h | 613 +++++++++--------- paddle/fluid/operators/conv_cudnn_op_cache.h | 2 +- paddle/fluid/operators/conv_miopen_helper.h | 72 +- .../fused/fusion_conv_inception_op.cu | 2 - paddle/fluid/platform/device/gpu/gpu_info.cc | 4 + paddle/fluid/platform/device_context.cc | 7 +- paddle/fluid/platform/flags.cc | 7 +- paddle/phi/backends/gpu/gpu_context.cc | 40 +- paddle/phi/backends/gpu/gpu_context.h | 16 +- paddle/phi/kernels/autotune/CMakeLists.txt | 4 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 71 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 43 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 18 +- .../gpudnn/conv_transpose_grad_kernel.cu | 107 ++- .../kernels/gpudnn/conv_transpose_kernel.cu | 13 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 2 +- .../tests/unittests/test_switch_autotune.py | 12 + 19 files changed, 620 insertions(+), 513 deletions(-) create mode 100644 paddle/fluid/operators/conv_base_helper.h diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 51446f287e94b..4da2aeb4d0472 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" - #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h new file mode 100644 index 0000000000000..c664d1935fe2e --- /dev/null +++ b/paddle/fluid/operators/conv_base_helper.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/conv_search_cache.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; +using framework::ConvSearchCache; + +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +// As the basic for SearchAlgorithm struct. +template +struct SearchAlgorithm {}; + +// As the container of searchAlgorithm::Find() result. +template +struct SearchResult { + public: + AlgoT algo = static_cast(0); + float time = -1.f; + size_t workspace_size = 0; +}; + +// As the container of conv relevant descriptors. 
+template +struct ConvArgsBase { + HandleT handle; + platform::TensorDescriptor idesc, odesc; + platform::FilterDescriptor wdesc; + platform::ConvolutionDescriptor cdesc; + const framework::Tensor *x, *w, *o; + DataT cudnn_dtype; + + // strides + std::vector s; + // paddings + std::vector p; + // dilations + std::vector d; + + ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w, + const framework::Tensor* o, const std::vector s, + const std::vector p, const std::vector d, DataT dtype) + : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} +}; + +static inline void GetNCDHW(const framework::DDim& dims, + const DataLayout& layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +template +static std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4e6fda3d09a07..3c29c60b21565 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -14,44 +14,15 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 
0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -68,121 +39,103 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } start = std::max(start, 0); offsets[axes[i]] = start; } + auto in_t = framework::EigenTensor::From( *input); - auto out_t = framework::EigenTensor::From( *out, new_out_dims); - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); + + phi::funcs::EigenSlice, T, D>::Eval( + place, out_t, in_t, offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; +static inline double ToMegaBytes(size_t bytes) { + return static_cast(bytes) / (1 << 20); } -inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { - int max_algos = 0; -#if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( - cudnn_handle, &max_algos)); -#endif - return max_algos; +static inline bool UseFixedWorkspace() { + return FLAGS_conv_workspace_size_limit >= 0; } -template -void ChooseAlgoByWorkspace(PerfType* perf_results, size_t perf_num, - size_t workspace_byte, AlgoType* algo) { - for (size_t i = 0; i < perf_num; ++i) { - auto result = perf_results[i]; - if (result.status == CUDNN_STATUS_SUCCESS && - result.memory < workspace_byte) { - *algo = result.algo; - VLOG(3) << " algo: " << result.algo << ", time: " << result.time - << " ms, wksp = " << result.memory - << ", status = " << result.status; - return; - } +static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) { + if (!UseFixedWorkspace()) { + int device_id = platform::GetCurrentDeviceId(); + int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); + int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t availble = platform::GpuAvailableMemToAlloc(); + int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize(); + VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) + << " MB, reserved=" << ToMegaBytes(reserved) + << " MB, available_to_alloc=" << ToMegaBytes(availble) + << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size) + << " MB."; + return std::max(std::max(availble, cur_workspace_size), + reserved - allocated); + } else { + return FLAGS_conv_workspace_size_limit * 1024 * 1024; } - VLOG(3) << "Can not find alog that requires memory < " - << static_cast(workspace_byte) / (1 << 20) << " MB"; } -template -void ChooseAlgo(const std::vector& perf_results, - size_t workspace_byte, AlgoType* algo) { - VLOG(3) << "=========BwdFilterAlgo Perf result========="; - for (const auto& result : perf_results) { - auto math_type_str = "False"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "True"; - } - VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; +template +std::string GetPerfResultString(std::string prefix, + const std::vector& perf_results, + int actual_algo_count, size_t workspace_limit) { + 
std::ostringstream out; + out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit) + << " MB):\n"; + for (int i = 0; i < actual_algo_count; ++i) { + const auto& result = perf_results[i]; + auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F"; + out << " algo=" << result.algo << ": tensor_core=" << math_type_str + << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB, status=" << result.status << "\n"; } + return out.str(); +} - for (size_t i = 0; i != perf_results.size(); ++i) { - const auto& result = perf_results[i]; +template +void ChooseAlgoByWorkspace(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + for (size_t i = 0; i < perf_results.size(); ++i) { + auto result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && - (result.memory <= workspace_byte)) { - if ((result.mathType == CUDNN_TENSOR_OP_MATH) && - (i != perf_results.size() - 1)) { - const auto& next_result = perf_results[i + 1]; - if (next_result.status == CUDNN_STATUS_SUCCESS && - next_result.algo == result.algo && - next_result.memory == result.memory && - next_result.mathType != CUDNN_TENSOR_OP_MATH && - next_result.time < 1.01 * result.time) { - // Skip over this result- it's not really a Tensor Core algo. - // Because it is only 1% performance difference. - // Prefer to choose the next equivalent non-Tensor Core algo. - continue; - } - } - *algo = result.algo; - auto math_type_str = "0"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "1"; - } - VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; - break; + result.memory < workspace_limit) { + algo_result->algo = result.algo; + algo_result->time = result.time; + algo_result->workspace_size = result.memory; + VLOG(3) << " algo=" << result.algo << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB (limit=" << ToMegaBytes(workspace_limit) + << " MB), status=" << result.status; + return; } } + VLOG(3) << "Can not find an algorithm that requires memory < " + << ToMegaBytes(workspace_limit) << " MB"; } -using framework::ConvSearchCache; - static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx; - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) - } else if (dev_ctx.GetComputeCapability() >= 80 && - dtype == CUDNN_DATA_BFLOAT16) { + } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) @@ -198,76 +151,49 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, #endif } -struct ConvArgs { - cudnnHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - cudnnDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - 
std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - cudnnDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionFwdAlgoPerf_t; - using algo_t = cudnnConvolutionFwdAlgo_t; + using PerfT = cudnnConvolutionFwdAlgoPerf_t; + using AlgoT = cudnnConvolutionFwdAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_FWD_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" + VLOG(3) << "Fallback to non-v7 method to find conv algorithm " + "becasue the workspace size request(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -276,30 +202,30 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif - VLOG(3) << "choose algo " << algo; } else if (deterministic) { - algo = static_cast(1); + result.algo = static_cast(1); } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); 
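      // A minimal sketch, assuming SearchResult is the small aggregate declared
      // in conv_base_helper.h (not shown in this diff); its fields are inferred
      // from how Find() fills them in this file. Returning this struct instead
      // of a bare algo enum is what lets every Find() also report the measured
      // time and required workspace to its callers:
      //
      //   template <typename AlgoT>
      //   struct SearchResult {
      //     AlgoT algo = static_cast<AlgoT>(0);
      //     float time = -1.f;
      //     size_t workspace_size = 0;
      //   };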
auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetForward()); + + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -308,25 +234,28 @@ struct SearchAlgorithm { args.wdesc.desc(), args.w->data(), args.cdesc.desc(), args.odesc.desc(), const_cast(args.o->data()), kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] FwdAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionFwdAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( @@ -334,68 +263,84 @@ struct SearchAlgorithm { args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + static_cast(algo), &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; - using algo_t = cudnnConvolutionBwdDataAlgo_t; + using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdDataAlgo_t; template - static algo_t Find(const ConvArgs& args, bool 
exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; #if CUDNN_VERSION < 7500 int stride_dim = args.x->dims().size() - 2; bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, [=](int n) { return n != 1; }); - if (blacklist && (static_cast( - perf_results[best_algo_idx].algo) == + if (blacklist && (perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - static_cast( - perf_results[best_algo_idx].algo) == + perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } #endif - workspace_size = GetWorkspaceSize(args, algo); - if (workspace_size > workspace_size_limit) { - has_got_workspace_size = false; + result.workspace_size = GetWorkspaceSize(args, result.algo); + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_DATA_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -404,29 +349,29 @@ struct SearchAlgorithm { args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = 
phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardData()); + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -437,26 +382,28 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), const_cast(args.x->data()), kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdDataAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( @@ -464,57 +411,75 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - using algo_t = cudnnConvolutionBwdFilterAlgo_t; + using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdFilterAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const 
phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - algo_t algo; if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { - workspace_size = workspace_size_limit; + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_FILTER_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -523,28 +488,32 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; + + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardFilter()); + if (dtype != CUDNN_DATA_HALF) { - algo = algo_cache.GetAlgorithm( + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int 
returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" + << ToMegaBytes(max_workspace_size) << " MB"; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: @@ -554,29 +523,26 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), const_cast(args.w->data()), kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, - workspace_size_limit); - - VLOG(3) - << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } else { - auto max_algos = MaxBwdFilterAlgos(args.handle); - algo = algo_cache.GetAlgorithm( + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { - algo_t chosen_algo; - std::vector perf_results(max_algos); + SearchResult algo_result; int actual_algos = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( @@ -585,17 +551,21 @@ struct SearchAlgorithm { perf_results.size(), &actual_algos, perf_results.data())); perf_results.resize(actual_algos); - ChooseAlgo(perf_results, workspace_size_limit, - &chosen_algo); - return chosen_algo; + ChooseAlgo(perf_results, workspace_size_limit, &algo_result); + result.time = algo_result.time; + return algo_result.algo; }); } } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdFilterAlgo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( @@ -604,6 +574,69 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } 
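      // Hedged sketch: UseFixedWorkspace() and CaclWorkspaceLimitInBytes() are
      // declared in conv_base_helper.h, which this diff does not include.
      // Judging from FLAGS_conv_workspace_size_limit changing to int64 and the
      // Python test below that sets it to -1 when autotuning, a plausible shape
      // is that a negative flag means "no fixed limit":
      //
      //   static bool UseFixedWorkspace() {
      //     return FLAGS_conv_workspace_size_limit >= 0;
      //   }
      //
      // and CaclWorkspaceLimitInBytes() then either converts the MB flag to
      // bytes or falls back to the currently available device memory.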
else { + return workspace_size_limit; + } + } + + static void ChooseAlgo(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + perf_results.size(), workspace_limit); + + for (size_t i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + (result.memory <= workspace_limit)) { + if ((result.mathType == CUDNN_TENSOR_OP_MATH) && + (i != perf_results.size() - 1)) { + const auto& next_result = perf_results[i + 1]; + if (next_result.status == CUDNN_STATUS_SUCCESS && + next_result.algo == result.algo && + next_result.memory == result.memory && + next_result.mathType != CUDNN_TENSOR_OP_MATH && + next_result.time < 1.01 * result.time) { + // Skip over this result- it's not really a Tensor Core algo. + // Because it is only 1% performance difference. + // Prefer to choose the next equivalent non-Tensor Core algo. + continue; + } + } + algo_result->algo = result.algo; + algo_result->time = result.time; + auto math_type_str = "0"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "1"; + } + VLOG(3) << " choose algo: " << result.algo + << ", TC: " << math_type_str << ", time: " << result.time + << " ms, wksp = " << result.memory + << ", status = " << result.status; + break; + } + } + } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 291e5f92f322c..af67d857e0eb7 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); DECLARE_int64(cudnn_exhaustive_search_times); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 66f7186938478..abc7be7fb8b8a 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -14,42 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/operators/conv_base_helper.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 
0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } @@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context, out_t.device(place) = in_t.slice(offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; -} - -using framework::ConvSearchCache; - -struct ConvArgs { - miopenHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - miopenDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - miopenDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { using perf_t = miopenConvAlgoPerf_t; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 39b42ec194c3b..bd7134f2f3354 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -16,8 +16,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); - namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index a671381d07ff3..89e3b74bb3aca 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -188,6 +188,8 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB"; result = cudaMalloc(ptr, size); } #endif @@ -226,6 +228,8 @@ class RecordedGpuMallocHelper { if (err != hipErrorDeinitialized) { #else auto err = cudaFree(ptr); + VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) + << " MB"; if (err != cudaErrorCudartUnloading) { #endif PADDLE_ENFORCE_GPU_SUCCESS(err); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f3934c7d8713b..904e4854ba6b4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); auto& instance = memory::allocation::AllocatorFacade::Instance(); instance.SetDefaultStream(place, phi::GPUContext::stream()); - workspace_.reset( - new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get())); + workspace_.reset(new phi::DnnWorkspaceHandle( + instance.GetAllocator(place).get(), stream())); } CUDADeviceContext::~CUDADeviceContext() = default; @@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return phi::DnnWorkspaceHandle( memory::allocation::AllocatorFacade::Instance() .GetAllocator(GetPlace()) - .get()); + .get(), + stream()); } return phi::GPUContext::cudnn_workspace_handle(); } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index c3d3f6a4f6893..8209c0a5d6f8e 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -161,10 +161,9 @@ PADDLE_DEFINE_EXPORTED_bool( * increased. * Users need to balance memory and speed. */ -PADDLE_DEFINE_EXPORTED_uint64( - conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 0394835aa8b70..ff238b7997865 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include "paddle/phi/backends/gpu/gpu_context.h" #include #include @@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream, } // namespace internal +void DnnWorkspaceHandle::RunFuncSync( + const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation) { + bool need_realloc = required_workspace_bytes > WorkspaceSize(); + if (need_realloc && !use_cached_allocation) { + void* workspace_ptr = nullptr; + size_t size = ((required_workspace_bytes + 255) >> 8) << 8; + std::lock_guard guard(*mtx_); +#ifdef PADDLE_WITH_HIP + auto status = hipMalloc(&workspace_ptr, size); +#else + auto status = cudaMalloc(&workspace_ptr, size); +#endif + if (status == gpuSuccess) { + cudnn_func(workspace_ptr); + phi::backends::gpu::GpuStreamSync(stream_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); +#endif + return; + } + } + + RunFunc(cudnn_func, required_workspace_bytes); + if (need_realloc) { + // Release the workspace allocated in this running. + ResetWorkspace(); + } +} + void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; } void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { @@ -295,13 +329,13 @@ struct GPUContext::Impl { void InitDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - workspace_ = new DnnWorkspaceHandle(allocator_); + workspace_ = new DnnWorkspaceHandle(allocator_, stream_); } void DestoryInternalWorkspace() { if (owned_ && workspace_ != nullptr) { delete workspace_; - stream_ = nullptr; + workspace_ = nullptr; } } @@ -313,7 +347,7 @@ struct GPUContext::Impl { DnnWorkspaceHandle GetDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - return DnnWorkspaceHandle(allocator_); + return DnnWorkspaceHandle(allocator_, stream_); } void InitStream() { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index cd08da1c0f2f8..ffae1f1f1353e 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -28,8 +29,8 @@ namespace phi { class DnnWorkspaceHandle { public: - explicit inline DnnWorkspaceHandle(Allocator* allocator) - : allocator_(allocator) { + inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) + : allocator_(allocator), stream_(stream) { mtx_.reset(new std::mutex()); } @@ -48,11 +49,9 @@ class DnnWorkspaceHandle { * running the function. 
Currently this function is only used when cudnn * exhaustive searching and callers have to guarantee that the input function * is host blocking */ - inline void RunFuncSync(const std::function& cudnn_func, - size_t required_workspace_bytes) { - RunFunc(cudnn_func, required_workspace_bytes); - ResetWorkspace(); - } + void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation = true); inline size_t WorkspaceSize() { if (allocation_ == nullptr) { @@ -70,7 +69,8 @@ class DnnWorkspaceHandle { private: Allocator::AllocationPtr allocation_{nullptr}; - Allocator* allocator_{nullptr}; + Allocator* allocator_{nullptr}; // Not owned + gpuStream_t stream_{nullptr}; // Not owned std::unique_ptr mtx_; }; diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index b933e0993deef..f1702d883b9f0 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,6 +1,6 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 9c5e77d5fd846..74525e63f476b 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel( dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat( @@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find( + fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); + fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel( paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find( + fwd_result2.algo = search2::Find( args2, 
exhaustive_search, false, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); + fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif } } @@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( + filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( + data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + data_result = + search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); #endif } @@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, &beta, args1.odesc.desc(), transformed_ddy_channel, @@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, &beta, args2.odesc.desc(), transformed_ddy_channel, @@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel( args3.idesc.desc(), ddx, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dw, @@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel( args3.odesc.desc(), transformed_dy_channel + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel( args4.wdesc.desc(), ddw, args4.cdesc.desc(), - data_algo, + data_result.algo, &beta, args4.idesc.desc(), transformed_dx, @@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel( args4.odesc.desc(), transformed_dy_channel + i * group_offset_out, args4.cdesc.desc(), - data_algo, + data_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index e09c33380b307..985371ede9c5d 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx, int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel.numel() / groups; + // ------------------- cudnn backward algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #endif // input data workspace_size size_t workspace_size_d = 0; @@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_d = std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( + bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size_d, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); + bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = std::max( + workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo)); #endif } @@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_w = std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find( + filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size_w, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max(workspace_size_w, - search2::GetWorkspaceSize(args2, filter_algo)); + VLOG(3) << "filter algo: " << filter_result.algo << ", time " + << filter_result.time; + workspace_size_w = std::max( + workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), temp_tensor_data, @@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), transformed_input_grad_data, @@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.odesc.desc(), output_grad_data + i * group_offset_out, args1.cdesc.desc(), - data_algo, + bwd_result.algo, cudnn_workspace_ptr, workspace_size_d, &beta, @@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), filter_grad_data, @@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.odesc.desc(), output_grad_data + i * group_offset_out, args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace_ptr, workspace_size_w, &beta_filter, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu 
b/paddle/phi/kernels/gpudnn/conv_kernel.cu index c2970cc8cde75..37f66e0b25a61 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/fluid/framework/eigen.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/operators/conv_miopen_helper.h" #else @@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx, "FLAGS_cudnn_deterministic True at same time.")); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - auto dtype = paddle::platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP @@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx, size_t workspace_size = 0; // final workspace to allocate. // ------------------- cudnn conv algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = search::GetWorkspaceSize(args); - algo = search::Find( + fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); #else - cudnnConvolutionFwdAlgo_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); + fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); + workspace_size = search::GetWorkspaceSize(args, fwd_result.algo); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) @@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx, // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ // FWD_ALGO_IMPLICIT_GEMM manually. 
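  // For clarity, the override below relies on IMPLICIT_GEMM being enum value 0
  // of cudnnConvolutionFwdAlgo_t, so it is equivalent to writing:
  //
  //   if (groups > 1) {
  //     fwd_result.algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  //   }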
if (groups > 1) { - algo = static_cast(0); + fwd_result.algo = static_cast(0); } #endif @@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data, args.cdesc.desc(), - algo, + fwd_result.algo, &beta, args.odesc.desc(), output_data, @@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 2893bd74b1bce..601ac43eeefd3 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #endif auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); @@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using search1 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = + fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); + fwd_result = search1::Find(args1, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif } @@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = + filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); + filter_result = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args1.odesc.desc(), dx_data + x_offset * g, @@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, cudnn_workspace, workspace_size, &beta, @@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.idesc.desc(), dout_data + dout_offset * g, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), dfilter_data + filter_offset * g, @@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.odesc.desc(), x_data + x_offset * g, 
args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace, workspace_size, &beta, @@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); @@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = + bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); + bwd_result1 = search1::Find(args1, false, deterministic, ctx); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif ddfilter_ = ddfilter.data(); @@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = + bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); + bwd_result2 = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif } @@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.handle = handle; args3.idesc.set(transformed_dout, iwo_group); args3.wdesc.set(*dfilter, layout, iwo_group); - args3.odesc.set(transformed_ddx_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, @@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = + filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + filter_result = search3::Find(args3, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search4 = 
paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = + fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + fwd_result = search4::Find(args4, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif } @@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.wdesc.desc(), filter_ + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, &beta, args1.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out, @@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.odesc.desc(), ddx_ + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.wdesc.desc(), ddfilter_ + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, &beta, args2.idesc.desc(), conv_x_ddfilter_data + i * group_offset_out, @@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.odesc.desc(), x_ + i * group_offset_in, args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.idesc.desc(), transformed_dout_channel_ + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dfilter_ + i * group_offset_filter, @@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.odesc.desc(), ddx_ + i * group_offset_in, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args4.odesc.desc(), transformed_dx_ + i * group_offset_in, @@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 5de2df4a70c88..ce02a00162b57 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + bwd_result.algo = + search::Find(args, false, deterministic, workspace_size, ctx); #else + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(args, false, deterministic, ctx); workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); + std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif // ------------------- cudnn conv 
transpose forward --------------------- @@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.wdesc.desc(), filter_data + filter_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, &beta, args.idesc.desc(), transformed_out_data + out_offset * g, @@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.odesc.desc(), x_data + x_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, cudnn_workspace, workspace_size, &beta, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index 93bc5b64adc17..5cf59fe01920a 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -36,7 +36,7 @@ #include "paddle/phi/kernels/funcs/batch_norm_utils.h" DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 9fad1eeb5c247..1c08811d4b95c 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -43,6 +43,16 @@ def static_program(net, data): return loss +def set_flags(enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) + + class TestAutoTune(unittest.TestCase): def test_autotune(self): paddle.fluid.core.disable_autotune() @@ -61,6 +71,7 @@ def check_status(self, expected_res): class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): + set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: @@ -107,6 +118,7 @@ def test_disable_autotune(self): class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() + set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: From 96ced1a1dbacc02fee80291bce39253ca44db293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Sat, 9 Apr 2022 13:57:18 +0800 Subject: [PATCH 045/211] [infrt] opt support input valid places by commondline. 
(#41544) --- paddle/infrt/dialect/infrt/common/types.h | 11 +-- .../dialect/phi/pass/phi_op_convert_pass.cc | 78 ++++++++++++++----- .../dialect/phi/pass/phi_op_convert_pass.h | 2 - paddle/infrt/tests/dialect/phi/phi_pass.mlir | 2 +- .../infrt/tests/dialect/phi/resnet50.mlir.in | 2 +- paddle/phi/infermeta/unary.cc | 1 + tools/check_file_diff_approvals.sh | 2 +- tools/infrt/skipped_phi_api.json | 2 +- 8 files changed, 67 insertions(+), 33 deletions(-) diff --git a/paddle/infrt/dialect/infrt/common/types.h b/paddle/infrt/dialect/infrt/common/types.h index 2ebe2b8ccdba6..5bd1f40262b47 100644 --- a/paddle/infrt/dialect/infrt/common/types.h +++ b/paddle/infrt/dialect/infrt/common/types.h @@ -39,15 +39,12 @@ enum class PrecisionType : uint8_t { }; struct Place { - TargetType target; - PrecisionType precision; - LayoutType layout; + TargetType target = TargetType::UNK; + PrecisionType precision = PrecisionType::UNK; + LayoutType layout = LayoutType::UNK; Place(TargetType tar, PrecisionType pre, LayoutType lay) : target(tar), precision(pre), layout(lay) {} - Place() - : target(TargetType::UNK), - precision(PrecisionType::UNK), - layout(LayoutType::UNK) {} + Place() = default; }; llvm::Optional GetTargetType(llvm::StringRef key); diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index e3fdd5ae5bb9f..e9b426a5088fc 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -36,12 +36,35 @@ #include "paddle/phi/ops/compat/signatures.h" namespace { + +infrt::Place ParsePlaceFromStr(const std::string &key) { + size_t first_index = key.find_first_of('-'); + size_t second_index = key.find_last_of('-'); + if (first_index != second_index) { + llvm::Optional tar = + infrt::GetTargetType(key.substr(0, first_index)); + llvm::Optional pre = infrt::GetPrecisionType( + key.substr(first_index + 1, second_index - first_index - 1)); + llvm::Optional lay = + infrt::GetLayoutType(key.substr(second_index + 1)); + if (tar && pre && lay) { + return infrt::Place(tar.getValue(), pre.getValue(), lay.getValue()); + } + } + LOG(FATAL) << "Can't parse infrt::Place from string:" << key; + return infrt::Place(); +} + class PhiOpConvertPass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } void runOnFunction() override; - PhiOpConvertPass(); + + /// Initialize the valid_places_ by the valid_places_options_ while + /// valid_places_options_ has values. + mlir::LogicalResult initialize(mlir::MLIRContext *context) override; + PhiOpConvertPass() {} explicit PhiOpConvertPass(const std::vector &valid_places) : valid_places_(valid_places) {} @@ -56,14 +79,35 @@ class PhiOpConvertPass void convertStage(); void dispatchStage(); - // Force a specified data format for all layout sensitive operations. - Option valid_places_options_{ + ListOption valid_places_options_{ *this, "valid-targets", - llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + llvm::cl::desc( + "Set the valids target, such as: CPU-FP32-NCHW,GPU-FP32-NCHW"), + llvm::cl::MiscFlags::CommaSeparated}; std::vector valid_places_; }; + +/// Initialize the canonicalizer by building the set of patterns used during +/// execution. 
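+/// In this pass, initialize() parses the comma-separated "valid-targets"
+/// option (for example CPU-FP32-NCHW,GPU-FP32-NCHW) into valid_places_,
+/// overriding any places that were passed to the constructor.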
+mlir::LogicalResult PhiOpConvertPass::initialize(mlir::MLIRContext *context) { + if (valid_places_options_.hasValue()) { + VLOG(4) << "Start parse valid_places from commond line:"; + if (!valid_places_.empty()) { + LOG(WARNING) << "Find valid place from commandline, current value will " + "be overwrittern."; + valid_places_.clear(); + } + for (auto &val : *valid_places_options_) { + VLOG(4) << "place string:" << val; + valid_places_.emplace_back(ParsePlaceFromStr(val)); + } + VLOG(4) << "End parse valid_places from commond line:"; + } + return mlir::success(); +} + // Implementation of the PhiOpConvertPass. void PhiOpConvertPass::runOnFunction() { convertStage(); @@ -191,7 +235,16 @@ void PhiOpConvertPass::dispatchStage() { .output(); phi_context[infrt::TargetType::CPU] = context_value; } break; - case infrt::TargetType::GPU: + case infrt::TargetType::GPU: { + auto context_value = + builder + .create( + kernel_op.getLoc(), + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::GPU)) + .output(); + phi_context[infrt::TargetType::GPU] = context_value; + } break; case infrt::TargetType::UNK: default: LOG(FATAL) << "Unsupported TargetType"; @@ -237,17 +290,6 @@ void PhiOpConvertPass::dispatchStage() { } } -PhiOpConvertPass::PhiOpConvertPass() { - if (!valid_places_options_.hasValue()) { - valid_places_.emplace_back(infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW); - return; - } - - LOG(FATAL) << "To be done for specifying places in command line"; -} - void PhiOpConvertPass::getDependentDialects( mlir::DialectRegistry ®istry) const { registry.insert(); @@ -265,7 +307,3 @@ std::unique_ptr infrt::CreatePhiOpCvtPass( std::vector valid_places) { return std::make_unique(valid_places); } - -std::unique_ptr infrt::CreatePhiOpCvtPass() { - return std::make_unique(); -} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index c426bbf11518b..a0e74426a4097 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -23,6 +23,4 @@ namespace infrt { */ std::unique_ptr CreatePhiOpCvtPass(std::vector valid_places); -std::unique_ptr CreatePhiOpCvtPass(); - } // namespace infrt diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 784ead5b2a0e3..0d9e312ce0bfd 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s +// RUN: infrtopt -phi-op-convert=valid-targets=CPU-FP32-NCHW -infrt-op-fuse %s // CHECK-LABEL: @ops func @ops(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor) { diff --git a/paddle/infrt/tests/dialect/phi/resnet50.mlir.in b/paddle/infrt/tests/dialect/phi/resnet50.mlir.in index 2803ebb41cfd7..3591a62e88ed0 100644 --- a/paddle/infrt/tests/dialect/phi/resnet50.mlir.in +++ b/paddle/infrt/tests/dialect/phi/resnet50.mlir.in @@ -444,7 +444,7 @@ module { %387 = "pd.flatten_contiguous_range"(%386) {start_axis = 1 : si32, stop_axis = 3 : si32} : (!infrt.dense_tensor) -> !infrt.dense_tensor %388 = "pd.matmul_v2"(%387, %245) {trans_x = false, trans_y = false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor %389 = "pd.elementwise_add"(%388, %30) {axis = 1 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %270 : !infrt.dense_tensor + infrt.return %389 : !infrt.dense_tensor } func @main() { diff 
--git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c6e2cb761911e..a47fc698777f7 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2931,4 +2931,5 @@ void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0598112c822a..49b84da01b9bb 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -201,7 +201,7 @@ fi # infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future. NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true` HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` -if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then +if [ ${NO_INFRT_FILES} ] && [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n" check_approval 1 6836917 47554610 22561442 fi diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index 64fc4c618aebc..8e2dd0f65d7d5 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "dropout", "expand_as", "flatten", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth"], +"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth"], "phi_kernels":["equal_all"] } From b3b8d345288211cac35eced2cbcb32493c5bd1b7 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sat, 9 Apr 2022 14:05:15 +0800 Subject: [PATCH 046/211] add depthwise conv hip support (#41537) --- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 40 +++++++++++++++++++ paddle/phi/kernels/gpudnn/conv_kernel.cu | 37 +++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 985371ede9c5d..3696ab08ea83e 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -627,6 +627,39 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, filter_grad); } +template +void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradKernel(dev_ctx, + input, + filter, + out_grad, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -643,6 +676,13 @@ PD_REGISTER_KERNEL(conv3d_grad, phi::Conv3DCudnnGradKernel, 
float, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradKernel, + float, + phi::dtype::float16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 37f66e0b25a61..d40cbecaee6d5 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -416,6 +416,35 @@ void Conv3DCudnnKernel(const Context& dev_ctx, out); } +template +void DepthwiseConvCudnnKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvCudnnKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -432,6 +461,14 @@ PD_REGISTER_KERNEL(conv3d, phi::Conv3DCudnnKernel, float, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d, + GPUDNN, + ALL_LAYOUT, + phi::DepthwiseConvCudnnKernel, + float, + phi::dtype::float16) {} + #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d, From 9872da00c40ced8ca6df97ba6493126c520efdd4 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Sat, 9 Apr 2022 19:12:22 +0800 Subject: [PATCH 047/211] [Eager] Support allclose and linalg_cond to eager mode (#41545) --- .../tests/unittests/test_allclose_layer.py | 10 +++++++-- .../fluid/tests/unittests/test_linalg_cond.py | 22 ++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_allclose_layer.py b/python/paddle/fluid/tests/unittests/test_allclose_layer.py index c376a5c95c393..1e080c80367f0 100644 --- a/python/paddle/fluid/tests/unittests/test_allclose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_allclose_layer.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import unittest import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestAllcloseLayer(unittest.TestCase): @@ -95,7 +96,7 @@ def test_allclose_gpu_fp64(self): with fluid.program_guard(main, startup): self.allclose_check(use_cuda=True, dtype='float64') - def test_dygraph_mode(self): + def func_dygraph_mode(self): x_1 = np.array([10000., 1e-07]).astype("float32") y_1 = np.array([10000.1, 1e-08]).astype("float32") x_2 = np.array([10000., 1e-08]).astype("float32") @@ -171,9 +172,14 @@ def test_dygraph_mode(self): x_v_5 = paddle.to_tensor(x_5) y_v_5 = paddle.to_tensor(y_5) ret_5 = paddle.allclose( - x_v_5, y_v_5, rtol=0.01, atol=0.0, name='test_8') + x_v_5, y_v_5, rtol=0.015, atol=0.0, name='test_8') self.assertEqual(ret_5.numpy()[0], True) + def test_dygraph_mode(self): + with _test_eager_guard(): + self.func_dygraph_mode() + self.func_dygraph_mode() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 9e3edd82681bc..42fb2fbc578bf 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -18,6 +18,7 @@ import numpy as np import paddle import paddle.static as static +from 
paddle.fluid.framework import _test_eager_guard p_list_n_n = ("fro", "nuc", 1, -1, np.inf, -np.inf) p_list_m_n = (None, 2, -2) @@ -89,16 +90,21 @@ def test_out(self): class API_TestDygraphCond(unittest.TestCase): - def test_out(self): + def func_out(self): paddle.disable_static() # test calling results of 'cond' in dynamic mode x_list_n_n, x_list_m_n = gen_input() test_dygraph_assert_true(self, x_list_n_n, p_list_n_n + p_list_m_n) test_dygraph_assert_true(self, x_list_m_n, p_list_m_n) + def test_out(self): + with _test_eager_guard(): + self.func_out() + self.func_out() + class TestCondAPIError(unittest.TestCase): - def test_dygraph_api_error(self): + def func_dygraph_api_error(self): paddle.disable_static() # test raising errors when 'cond' is called in dygraph mode p_list_error = ('fro_', '_nuc', -0.7, 0, 1.5, 3) @@ -113,6 +119,11 @@ def test_dygraph_api_error(self): x_tensor = paddle.to_tensor(x) self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) + def test_dygraph_api_error(self): + with _test_eager_guard(): + self.func_dygraph_api_error() + self.func_dygraph_api_error() + def test_static_api_error(self): paddle.enable_static() # test raising errors when 'cond' is called in static mode @@ -149,13 +160,18 @@ def test_static_empty_input_error(self): class TestCondEmptyTensorInput(unittest.TestCase): - def test_dygraph_empty_tensor_input(self): + def func_dygraph_empty_tensor_input(self): paddle.disable_static() # test calling results of 'cond' when input is an empty tensor in dynamic mode x_list_n_n, x_list_m_n = gen_empty_input() test_dygraph_assert_true(self, x_list_n_n, p_list_n_n + p_list_m_n) test_dygraph_assert_true(self, x_list_m_n, p_list_m_n) + def test_dygraph_empty_tensor_input(self): + with _test_eager_guard(): + self.func_dygraph_empty_tensor_input() + self.func_dygraph_empty_tensor_input() + if __name__ == "__main__": paddle.enable_static() From ff2fba3987ef9eac3bf240b92564e87832a331b4 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Sat, 9 Apr 2022 19:33:07 +0800 Subject: [PATCH 048/211] modify the block size of the group_norm backward (#41570) --- paddle/fluid/operators/group_norm_op.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index c93910bde5a2c..bb8031b0cc4e6 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -605,8 +605,16 @@ class GroupNormGradKernel int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; if (data_layout == DataLayout::kNCHW) { + const int max_num_threads = 1024; + int max_block_size = std::min(imsize, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); ScalarGetDsDbCUDAKernel< - T><<>>( + T><<>>( imsize, x_data, dy_data, ds_data, db_data); if (d_scale || d_bias) { From 7a07c4a5a46249813cf813d6b3792f80249ba336 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Sat, 9 Apr 2022 22:54:23 +0800 Subject: [PATCH 049/211] Unittest recover (#41431) * update name * update name * fix test * fix fleet bind * update name * update name * fix test * fix gpups wrapper * remove Push/Pull/Load/Save with context in client and wrapper base class * fix * fix * remove some interface * fix * remove * code style * 
recover * fix * remove code unused * remove some unused table & accessor & CommonDenseTable => MemoryDenseTable * fix * fix * fix * recover * remove unused code * recover unittest * fix * remove * fix * remove code unuseful * remove * fix * recover * remove Co-authored-by: esythan --- .../common/sparse_sharding_merge.h | 310 --------------- paddle/fluid/distributed/ps/README.md | 38 +- .../ps/table/common_sparse_table.h | 203 ---------- .../fluid/distributed/ps/table/ctr_accessor.h | 1 + .../ps/table/depends/large_scale_kv.h | 353 ------------------ paddle/fluid/framework/dist_multi_trainer.cc | 3 + paddle/fluid/pybind/fleet_py.cc | 8 - paddle/fluid/pybind/fleet_py.h | 1 - paddle/fluid/pybind/pybind.cc | 1 - python/paddle/distributed/ps/utils/public.py | 2 +- .../tests/unittests/test_dist_fleet_ctr.py | 15 +- .../tests/unittests/test_dist_fleet_ctr2.py | 10 +- 12 files changed, 52 insertions(+), 893 deletions(-) delete mode 100644 paddle/fluid/distributed/common/sparse_sharding_merge.h delete mode 100644 paddle/fluid/distributed/ps/table/common_sparse_table.h delete mode 100644 paddle/fluid/distributed/ps/table/depends/large_scale_kv.h diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h deleted file mode 100644 index 147403d08e6be..0000000000000 --- a/paddle/fluid/distributed/common/sparse_sharding_merge.h +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once -#include - -#include -#include -#include -#include // NOLINT -#include - -#include -#include "glog/logging.h" -#include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/split.h" -#include "paddle/phi/core/utils/dim.h" - -constexpr int FG = 256 * 1024 * 1024; -constexpr int Q_SIZE = 10000; -constexpr int BUCKET = 10; -constexpr char XEOF[] = "EOF"; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -namespace paddle { -namespace distributed { - -class ShardingMerge { - public: - ShardingMerge() {} - ~ShardingMerge() {} - - void Merge(const std::vector &inputs, - const std::vector &feasigns, const std::string &output, - const int embedding_dim) { - pool_.reset(new ::ThreadPool(inputs.size())); - - std::vector> tasks(inputs.size()); - std::vector> rows; - rows.resize(inputs.size()); - - auto begin = GetCurrentUS(); - for (int x = 0; x < inputs.size(); ++x) { - tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int { - DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]); - return 0; - }); - } - - for (size_t x = 0; x < tasks.size(); ++x) { - tasks[x].wait(); - } - - int64_t total_rows = 0; - for (auto x = 0; x < rows.size(); x++) { - total_rows += rows[x].size(); - } - - auto end = GetCurrentUS(); - - VLOG(0) << "got " << total_rows - << " feasigin ids from sparse embedding using " << end - begin; - - std::vector total_dims = {total_rows, - static_cast(embedding_dim)}; - - std::vector> batch_buckets; - batch_buckets.resize(inputs.size()); - - for (int x = 0; x < rows.size(); ++x) { - batch_buckets[x] = bucket(rows[x].size(), BUCKET); - } - - std::ofstream out(output, std::ios::binary); - - begin = GetCurrentUS(); - SerializeRowsToStream(out, rows, batch_buckets, total_rows); - end = GetCurrentUS(); - VLOG(0) << "write rows to oostrream using " << end - begin; - - begin = GetCurrentUS(); - SerializePreTensorToStream(out, total_dims); - end = GetCurrentUS(); - VLOG(0) << "write pretensor to oostrream using " << end - begin; - - begin = GetCurrentUS(); - SerializeValueToStream(out, inputs, batch_buckets, embedding_dim); - end = GetCurrentUS(); - VLOG(0) << "write values to oostrream using " << end - begin; - } - - private: - void SerializeRowsToStream(std::ostream &os, - const std::vector> &rows, - const std::vector> &batch_buckets, - int64_t total_rows) { - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); - } - - { - // the 2st field, rows information - os.write(reinterpret_cast(&total_rows), sizeof(total_rows)); - - for (int b = 0; b < BUCKET; ++b) { - for (int x = 0; x < batch_buckets.size(); ++x) { - auto begin = batch_buckets[x][b]; - auto end = batch_buckets[x][b + 1]; - - if (end - begin == 0) continue; - - os.write(reinterpret_cast(rows[x].data() + begin), - sizeof(int64_t) * (end - begin)); - } - } - - // the 3st field, the height of SelectedRows - int64_t height = total_rows; - os.write(reinterpret_cast(&height), sizeof(height)); - } - } - - void SerializePreTensorToStream(std::ostream &os, - const std::vector &dims) { - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); - } - { // the 2nd field, 
tensor description - // int32_t size - framework::proto::VarType::TensorDesc desc; - desc.set_data_type(framework::proto::VarType::FP32); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - os.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - os.write(out.data(), size); - } - } - - void SerializeValueToVec(std::ifstream &in, const int batch, - const int embedding_dim, std::vector *out) { - auto queue = - std::make_shared>>(); - - auto read = [batch, &in, &queue]() { - std::string line; - std::vector columns; - std::vector values_str; - - int count = 0; - - while (std::getline(in, line)) { - ++count; - columns = string::Split(line, '\t'); - - if (columns.size() != 5) { - VLOG(0) << "unexpected line: " << line << ", skip it"; - continue; - } - - values_str = string::Split(columns[4], ','); - queue->Push(values_str); - - if (count >= batch) { - break; - } - } - queue->Push({}); - }; - - auto write = [embedding_dim, &out, &queue]() { - std::vector values_str; - std::string line; - - while (true) { - queue->Pop(&values_str); - - if (values_str.size() == 0) { - break; - } - - for (int x = 0; x < embedding_dim; ++x) { - float v = 0.0; - try { - v = std::stof(values_str[x]); - } catch (std::invalid_argument &e) { - VLOG(0) << " get unexpected line: " << line; - } catch (std::out_of_range &e) { - VLOG(0) << " get unexpected line: " << line; - } - out->push_back(v); - } - } - }; - - std::thread p_read(read); - std::thread p_write(write); - p_read.join(); - p_write.join(); - } - - void SerializeVecToStream(std::ostream &out, - const std::vector &value) { - out.write(reinterpret_cast(value.data()), - static_cast(sizeof(float) * value.size())); - } - - void SerializeValueToStream( - std::ostream &out, const std::vector &ins, - const std::vector> &batch_buckets, - const int embedding_dim) { - std::vector> in_streams; - - for (int x = 0; x < ins.size(); ++x) { - in_streams.emplace_back(std::make_shared(ins[x])); - } - - std::vector> tasks(ins.size()); - - for (int b = 0; b < BUCKET; ++b) { - std::vector> values; - values.resize(tasks.size()); - - auto begin = GetCurrentUS(); - - for (int x = 0; x < tasks.size(); ++x) { - auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; - values[x].clear(); - values[x].reserve(batch * embedding_dim); - } - - for (int x = 0; x < tasks.size(); ++x) { - tasks[x] = - pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, - &values, embedding_dim]() -> int { - auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; - if (batch == 0) return 0; - SerializeValueToVec(*(in_streams[x].get()), batch, embedding_dim, - &values[x]); - return 0; - }); - } - - for (size_t x = 0; x < tasks.size(); ++x) { - tasks[x].wait(); - } - - auto end = GetCurrentUS(); - - auto begin1 = GetCurrentUS(); - for (size_t x = 0; x < tasks.size(); ++x) { - SerializeVecToStream(out, values[x]); - } - auto end1 = GetCurrentUS(); - - VLOG(0) << "serialize buckets " << b << " read using " << end - begin - << ", to oostream using " << end1 - begin1; - } - } - - void DeserializeRowsFromFile(const std::string &input_file, - const int64_t feasigns, - std::vector *rows) { - std::string line; - std::vector columns; - std::ifstream file(input_file); - - rows->reserve(feasigns); - - while (std::getline(file, line)) { - columns = string::Split(line, '\t'); - if (columns.size() != 5) { - VLOG(0) << "unexpected line: " << line << ", skip 
it"; - continue; - } - rows->push_back(std::stoull(columns[0])); - } - - VLOG(0) << "parse " << rows->size() << " embedding rows from " - << input_file; - } - - private: - std::unique_ptr<::ThreadPool> pool_; -}; -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/ps/README.md b/paddle/fluid/distributed/ps/README.md index d287dcd111198..afa6d60a4e0bb 100755 --- a/paddle/fluid/distributed/ps/README.md +++ b/paddle/fluid/distributed/ps/README.md @@ -1,3 +1,39 @@ # 目录说明 -> 成型之后,上级目录的 table、thirdparty、table、service 目录可以删除,communicator_common.h 、fleet.cc、fleet.h 删除 +Table: for param storage and update +-----MemorySparseTable: table for sparse param, used in cpu async mode +-----MemoryDenseTable: table for dense param, used in cpu async/geo mode +-----MemorySparseGeoTable: table for sparse param, used in cpu async mode +-----CommonGraphTable: table used for graph learning +-----BarrierTable: table for barrier function, used in cpu sync mode +-----TensorTable: table which run program, used for learning rate decay only + +ValueAccessor: for pull param and push gradient +-----CtrCommonAccessor: pull/push value with show/click, float type +-----DownpourCtrDoubleAccessor: same as CtrCommonAccessor, other than show/click with double type +-----SparseAccessor: used for common embedding, pull value without show/click, push value with show/click +-----CommMergeAccessor: used for dense table only, for get param dim + +PsService(proto): for server to handle request +-----PsBaseService +----------BrpcPsService: for cpu dnn training task +----------GraphBrpcService: for graph learning +-----HeterService: for dnn training task with heterogeneous computing resources + +PSServer: recv request from trainer and handle it by service +-----BrpcPsServer: for cpu dnn training task +-----GraphBrpcServer: for graph learning +-----PsLocalServer: for GpuPS + +HeterServer: for HeterPS + +PSClient: pull param and push gradient for trainer +-----BrpcPsClient: for cpu dnn training task +----------GraphBrpcClient: for graph learning +-----PsLocalClient: for GpuPS + +HeterClient: for HeterPS + +PSCore: Wrapper for InitServer + +GraphPyService: for graph learning diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h deleted file mode 100644 index 2673e8dfae3c6..0000000000000 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "Eigen/Dense" -#include "paddle/fluid/distributed/ps/table/accessor.h" -#include "paddle/fluid/distributed/ps/table/common_table.h" -#include "paddle/fluid/distributed/ps/table/depends/initializers.h" -#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/ps/table/depends/sparse.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/core/utils/rw_lock.h" - -#define PSERVER_SAVE_SUFFIX ".shard" - -namespace paddle { -namespace distributed { - -class SparseOptimizer; - -enum SaveMode { all, base, delta }; - -struct Meta { - std::string param; - int shard_id; - std::vector names; - std::vector dims; - uint64_t count; - std::unordered_map dims_map; - - explicit Meta(const std::string& metapath) { - std::ifstream file(metapath); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - if (StartWith(line, "#")) { - continue; - } - auto pairs = paddle::string::split_string(line, "="); - PADDLE_ENFORCE_EQ( - pairs.size(), 2, - paddle::platform::errors::InvalidArgument( - "info in %s except k=v, but got %s", metapath, line)); - - if (pairs[0] == "param") { - param = pairs[1]; - } - if (pairs[0] == "shard_id") { - shard_id = std::stoi(pairs[1]); - } - if (pairs[0] == "row_names") { - names = paddle::string::split_string(pairs[1], ","); - } - if (pairs[0] == "row_dims") { - auto dims_strs = - paddle::string::split_string(pairs[1], ","); - for (auto& str : dims_strs) { - dims.push_back(std::stoi(str)); - } - } - if (pairs[0] == "count") { - count = std::stoull(pairs[1]); - } - } - for (int x = 0; x < names.size(); ++x) { - dims_map[names[x]] = dims[x]; - } - } - - Meta(std::string param, int shard_id, std::vector row_names, - std::vector dims, uint64_t count) { - this->param = param; - this->shard_id = shard_id; - this->names = row_names; - this->dims = dims; - this->count = count; - } - - std::string ToString() { - std::stringstream ss; - ss << "param=" << param << "\n"; - ss << "shard_id=" << shard_id << "\n"; - ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n"; - ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n"; - ss << "count=" << count << "\n"; - return ss.str(); - } -}; - -class CommonSparseTable : public Table { - public: - CommonSparseTable() { rwlock_.reset(new phi::RWLock); } - virtual ~CommonSparseTable() {} - - // unused method begin - // virtual int32_t PullDense(float* pull_values, size_t num) { return 0; } - // virtual int32_t PushDenseParam(const float* values, size_t num) { return - // 0; } - // virtual int32_t PushDense(const float* values, size_t num) { return 0; } - // unused method end - - virtual int32_t Pull(TableContext& context); - virtual int32_t Push(TableContext& context); - - virtual int32_t Initialize(); - virtual int32_t InitializeShard() { return 0; } - virtual int32_t InitializeValue(); - virtual int32_t InitializeOptimizer(); - virtual int32_t InitializeRecorder(); - - virtual int32_t Load(const std::string& path, const std::string& param); - - virtual int32_t Save(const std::string& path, const std::string& param); - - void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, - const size_t shard_idx, const int64_t total); - - int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, - std::shared_ptr<::ThreadPool> pool, const int mode, - int shard_id); - - virtual void 
ProcessALine(const std::vector& columns, - const Meta& meta, const int64_t id, - std::vector>* values); - - virtual int64_t LoadFromText( - const std::string& valuepath, const std::string& metapath, - const int pserver_id, const int pserver_num, const int local_shard_num, - std::vector>* blocks); - - virtual std::pair PrintTableStat(); - virtual int32_t PullSparse(float* values, const PullSparseValue& pull_value); - - virtual int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, - size_t num); - - virtual int32_t PushSparse(const uint64_t* keys, const float* values, - size_t num); - - virtual int32_t PushSparse(const uint64_t* keys, const float** values, - size_t num); - - // only for sparse geo table - virtual int32_t PushSparseParam(const uint64_t* keys, const float* values, - size_t num); - virtual int32_t SetGlobalLR(float* lr); - - virtual int32_t Pour(); - virtual int32_t Flush(); - virtual int32_t Shrink(const std::string& param); - virtual void Clear(); - - virtual void* GetShard(size_t shard_idx) { return 0; } - - protected: - virtual int32_t _PushSparse(const uint64_t* keys, const float* values, - size_t num); - virtual int32_t _PushSparse(const uint64_t* keys, const float** values, - size_t num); - - protected: - const int task_pool_size_ = 11; - std::vector> _shards_task_pool; - - bool sync = false; - int param_dim_ = 0; - int param_offset_ = 0; - - std::unordered_map value_idx_; - std::vector value_names_; - std::vector value_dims_; - std::vector value_offsets_; - std::vector initializer_attrs_; - - std::shared_ptr optimizer_; - std::vector> shard_values_; - std::unordered_map> pull_reservoir_; - std::unique_ptr rwlock_{nullptr}; -}; - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index b8895e74d1d09..a599bfca7f6d2 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -186,6 +186,7 @@ class CtrCommonAccessor : public ValueAccessor { // CtrCommonFeatureValue common_feature_value; float _show_click_decay_rate; int32_t _ssd_unseenday_threshold; + bool _show_scale = false; public: // TODO(zhaocaibei123): it should be private, but we make it public // for unit test diff --git a/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h deleted file mode 100644 index 68c80ad737ec4..0000000000000 --- a/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "butil/object_pool.h" -#include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/ps/table/depends/initializers.h" -#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/backends/dynload/port.h" -#include "paddle/phi/core/utils/rw_lock.h" - -namespace paddle { -namespace distributed { - -enum Mode { training, infer }; - -static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; -static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 - << SPARSE_SHARD_BUCKET_NUM_BITS; - -struct VALUE { - explicit VALUE(size_t length) - : length_(length), - count_(0), - unseen_days_(0), - need_save_(false), - is_entry_(false) { - data_.resize(length); - memset(data_.data(), 0, sizeof(float) * length); - } - - size_t length_; - std::vector data_; - int count_; - int unseen_days_; // use to check knock-out - bool need_save_; // whether need to save - bool is_entry_; // whether knock-in -}; - -inline bool count_entry(VALUE *value, int threshold) { - return value->count_ >= threshold; -} - -inline bool probility_entry(VALUE *value, float threshold) { - UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class ValueBlock { - public: - typedef typename robin_hood::unordered_map map_type; - explicit ValueBlock(const std::vector &value_names, - const std::vector &value_dims, - const std::vector &value_offsets, - const std::unordered_map &value_idx, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), - value_dims_(value_dims), - value_offsets_(value_offsets), - value_idx_(value_idx) { - for (size_t x = 0; x < value_dims.size(); ++x) { - value_length_ += value_dims[x]; - } - - // for Entry - { - auto slices = string::split_string(entry_attr, ":"); - if (slices[0] == "none") { - entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); - threshold_ = 0; - } else if (slices[0] == "count_filter_entry") { - threshold_ = std::stoi(slices[1]); - entry_func_ = - std::bind(&count_entry, std::placeholders::_1, threshold_); - } else if (slices[0] == "probability_entry") { - threshold_ = std::stof(slices[1]); - entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold_); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Not supported Entry Type : %s, Only support [CountFilterEntry, " - "ProbabilityEntry]", - slices[0])); - } - } - - // for Initializer - { - for (auto &attr : init_attrs) { - auto slices = string::split_string(attr, "&"); - - if (slices[0] == "gaussian_random") { - initializers_.emplace_back( - std::make_shared(slices)); - } else if (slices[0] == "fill_constant") { - initializers_.emplace_back( - std::make_shared(slices)); - } else if (slices[0] == "uniform_random") { - initializers_.emplace_back( - std::make_shared(slices)); 
- } else if (slices[0] == "truncated_gaussian_random") { - initializers_.emplace_back( - std::make_shared(slices)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s can not be supported", attr)); - } - } - } - } - - ~ValueBlock() {} - - std::vector Get(const uint64_t &id, - const std::vector &value_names, - const std::vector &value_dims) { - auto pts = std::vector(); - pts.reserve(value_names.size()); - auto values = GetValue(id); - for (int i = 0; i < static_cast(value_names.size()); i++) { - PADDLE_ENFORCE_EQ( - value_dims[i], value_dims_[i], - platform::errors::InvalidArgument("value dims is not match")); - pts.push_back(values->data_.data() + - value_offsets_.at(value_idx_.at(value_names[i]))); - } - return pts; - } - - // pull - float *Init(const uint64_t &id, const bool with_update = true, - const int counter = 1) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - - VALUE *value = nullptr; - if (res == table.end()) { - value = butil::get_object(value_length_); - - table[id] = value; - - } else { - value = res->second; - } - - if (with_update) { - AttrUpdate(value, counter); - } - return value->data_.data(); - } - - VALUE *InitGet(const uint64_t &id, const bool with_update = true, - const int counter = 1) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - - VALUE *value = nullptr; - if (res == table.end()) { - value = butil::get_object(value_length_); - // value = _alloc.acquire(value_length_); - table[id] = value; - } else { - value = (VALUE *)(void *)(res->second); // NOLINT - } - return value; - } - - void AttrUpdate(VALUE *value, const int counter) { - // update state - value->unseen_days_ = 0; - value->count_ += counter; - - if (!value->is_entry_) { - value->is_entry_ = entry_func_(value); - if (value->is_entry_) { - // initialize - for (size_t x = 0; x < value_names_.size(); ++x) { - initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], - value_dims_[x]); - } - value->need_save_ = true; - } - } else { - value->need_save_ = true; - } - - return; - } - - // dont jude if (has(id)) - float *Get(const uint64_t &id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - // auto &value = table.at(id); - // return value->data_.data(); - auto res = table.find(id); - VALUE *value = res->second; - return value->data_.data(); - } - - // for load, to reset count, unseen_days - VALUE *GetValue(const uint64_t &id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - return res->second; - } - - bool GetEntry(const uint64_t &id) { - auto value = GetValue(id); - return value->is_entry_; - } - - void SetEntry(const uint64_t &id, const bool state) { - auto value = GetValue(id); - value->is_entry_ = state; - } - - void erase(uint64_t feasign) { - size_t hash = _hasher(feasign); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto iter = table.find(feasign); - if (iter != table.end()) { - butil::return_object(iter->second); - iter = table.erase(iter); - } - } - - void Shrink(const int threshold) { - for (auto &table : values_) { - for (auto iter = table.begin(); iter != table.end();) { - // VALUE* value = (VALUE*)(void*)(iter->second); - VALUE *value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - 
butil::return_object(iter->second); - // _alloc.release(iter->second); - // _alloc.release(value); - iter = table.erase(iter); - } else { - ++iter; - } - } - } - return; - } - - float GetThreshold() { return threshold_; } - size_t compute_bucket(size_t hash) { - if (SPARSE_SHARD_BUCKET_NUM == 1) { - return 0; - } else { - return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); - } - } - - map_type::iterator end() { - return values_[SPARSE_SHARD_BUCKET_NUM - 1].end(); - } - - map_type::iterator Find(uint64_t id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return end(); - } else { - return got; - } - } - - private: - bool Has(const uint64_t id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return false; - } else { - return true; - } - } - - public: - map_type values_[SPARSE_SHARD_BUCKET_NUM]; - size_t value_length_ = 0; - std::hash _hasher; - - private: - const std::vector &value_names_; - const std::vector &value_dims_; - const std::vector &value_offsets_; - const std::unordered_map &value_idx_; - - std::function entry_func_; - std::vector> initializers_; - float threshold_; -}; - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index d16469e265e2e..7f9aac4d3f1d3 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -117,6 +117,9 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) { InitDumpEnv(); } pull_dense_worker_->SetRootScope(root_scope_); +#if defined(PADDLE_WITH_PSCORE) && defined(PADDLE_WITH_CUDA) + pull_dense_worker_->CreatePinVar(); +#endif pull_dense_worker_->Start(); #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) for (int i = 0; i < thread_num_; ++i) { diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 330719762ae08..8d8301689521b 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -49,7 +48,6 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; -using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -93,12 +91,6 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::ToString); } -void BindSparseShardingTools(py::module* m) { - py::class_(*m, "ShardingMerge") - .def(py::init<>()) - .def("merge", &ShardingMerge::Merge); -} - void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 4dc0f002ad3c1..206a69f5a8019 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,6 +36,5 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); -void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 396c6c5e42d37..0427fcece0b8b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4544,7 +4544,6 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index b76484a3ebc11..e7edc6fd859a6 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -58,7 +58,7 @@ def logger_config(log_path, logging_name): logger = logging.getLogger(logging_name) - logger.setLevel(level=logging.DEBUG) + logger.setLevel(level=logging.WARNING) handler = logging.FileHandler( log_path, mode='a', encoding='UTF-8', delay=True) handler.setLevel(logging.INFO) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 8ec3fecceb960..59d196fdf55e5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -51,9 +51,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) class TestDistMnistAsync2x2(TestFleetBase): @@ -86,9 +85,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) class TestDistCtrHalfAsync2x2(TestFleetBase): @@ -124,9 +122,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, 
check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py index e5e486d706845..e73eff2acc967 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py @@ -52,9 +52,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # @unittest.skip(reason="Skip unstable ut, reader need to be rewrite") @@ -92,9 +91,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) if __name__ == "__main__": From e68da187fd59a476828b723510139ac9af85cc35 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Sun, 10 Apr 2022 10:28:17 +0800 Subject: [PATCH 050/211] add mkldnn int8 pass [step1] (#41579) * add mkldnn int8 pass * add mkldnn int8 pass * update pass --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/mkldnn/mkldnn_pass_util.h | 77 +++ .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 582 ++++++++++++++++++ .../ir/mkldnn/quant_dequant_mkldnn_pass.h | 91 +++ 4 files changed, 751 insertions(+) create mode 100644 paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h create mode 100644 paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 16a95b2ccf7f1..4ee0b08375d2e 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -140,6 +140,7 @@ if(WITH_MKLDNN) pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) pass_library(multi_gru_fuse_pass inference DIR mkldnn) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) + pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) endif() if(WITH_IPU) diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h new file mode 100644 index 0000000000000..505bb2739e1d4 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
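+// Helpers for the oneDNN quantization passes: SaveInfoInTheFirstOp stashes a
+// map of per-variable float vectors (e.g. quantization scales) as attributes
+// on the first non-feed/fetch op of the graph, and GetInfoFromTheFirstOp
+// reads that map back and strips the attributes again.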
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void SaveInfoInTheFirstOp( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + op_node->Op()->SetAttr(iter->first + suffix, iter->second); + } + break; + } +} + +static void GetInfoFromTheFirstOp( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(3) << "get variables from the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + auto scales_vector = + BOOST_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); + info_map->insert(std::make_pair(name, scales_vector)); + op_desc->RemoveAttr(fake_name); + } + } + break; + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc new file mode 100644 index 0000000000000..808d043a4b226 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -0,0 +1,582 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
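+// QuantDequantMkldnnPass rewrites a model that carries fake quantize/dequantize
+// ops so it can run with oneDNN INT8 kernels: it marks ops that should skip
+// quantization, collects weight thresholds from the fake dequantize ops and
+// activation scales from the fake quantize ops (and from "out_threshold"
+// attributes), and unlinks the fake quantize ops from the graph, keeping only
+// the collected scales.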
+ +#include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void QuantDequantMkldnnPass::MarkSkipQuantizedOps( + ir::Graph* graph, const std::unordered_set& skip_ops) const { + VLOG(3) << "mark skip quantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (skip_ops.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + if (!op_desc->HasAttr("quantization_type")) { + bool is_quantized_op = true; + for (auto* node_input : op_node->inputs) { + for (auto* node_input_input : node_input->inputs) { + if (!node_input_input->IsOp()) continue; + if (node_input_input->Name().find("quantize_dequantize") == + std::string::npos) { + is_quantized_op = false; + break; + } + } + if (!is_quantized_op) break; + } + + if (!is_quantized_op) { + op_node->Op()->SetAttr("skip_quant", 1); + } + } + } + } +} + +void QuantDequantMkldnnPass::MarkSkipQuantizedPool2d(ir::Graph* graph) const { + VLOG(3) << "mark avg pool2d as skip quantized op"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "pool2d") { + auto* op_desc = op_node->Op(); + auto pool_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("pooling_type")); + if (pool_type == "avg") { + op_node->Op()->SetAttr("skip_quant", 1); + } + } + } +} + +void QuantDequantMkldnnPass::CollectInfoFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_dequantize_types, + std::unordered_map>* weight_thresholds) + const { + VLOG(3) << "gather weight_thresholds from fake dequantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (fake_dequantize_types.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + + if (op_desc->HasAttr("max_range")) { + const float max_range = + BOOST_GET_CONST(float, op_desc->GetAttr("max_range")); + std::vector thresholds = {127 * 127 / max_range}; + weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); + } else { + auto scale_name = op_desc->Input("Scales")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The Scales variable [%s] of dequantize op is not found.", + var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + std::vector thresholds{}; + for (int i = 0; i < scale_tensor->numel(); i++) { + thresholds.push_back(scale_data[i]); + } + weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); + } + } + } +} + +void QuantDequantMkldnnPass::CollectInputScalesFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_quantize_types, + std::unordered_map>* var_quant_scales) + const { + VLOG(3) << "gather input scales from fake quantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "fake_quantize_dequantize_moving_average_abs_max" || + fake_quantize_types.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + const int bit_length = + BOOST_GET_CONST(int, op_desc->GetAttr("bit_length")); + PADDLE_ENFORCE_EQ(bit_length, 8, 
platform::errors::InvalidArgument( + "Unsupported number quantization " + "bits: %d, only 8 is supported now.", + bit_length)); + + auto x_var_name = op_desc->Input("X")[0]; + auto scale_name = op_desc->Input("InScale")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "The InScale variable [%s] of quantize op is not found.", var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + float scale = 1.0 / scale_data[0]; + if (std::isinf(scale) || std::isnan(scale)) { + scale = 0.0; + } + + if (!var_quant_scales->count(x_var_name)) { + std::vector scale_v = {scale}; + var_quant_scales->insert(std::make_pair(x_var_name, scale_v)); + } + + if (!var_quant_scales->count(out_var_name)) { + std::vector scale_v = {scale}; + var_quant_scales->insert(std::make_pair(out_var_name, scale_v)); + } + } + } +} + +void QuantDequantMkldnnPass::CollectOutputScalesFromAttr( + ir::Graph* graph, + std::unordered_map>* var_quant_scales) + const { + VLOG(3) << "gather output scales from op's attr"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->HasAttr("out_threshold")) { + const float attr_scale = + BOOST_GET_CONST(float, op_desc->GetAttr("out_threshold")); + if (attr_scale == 0.0) continue; + float scale = 1.0 / attr_scale; + std::vector scale_v = {scale}; + + auto var_name_map = op_desc->Outputs(); + for (auto iter = var_name_map.begin(); iter != var_name_map.end(); + ++iter) { + for (auto var_name : iter->second) { + var_quant_scales->insert(std::make_pair(var_name, scale_v)); + } + } + } + } +} + +void QuantDequantMkldnnPass::CollectFakeQuantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + auto in_scale_name = op_desc->Input("InScale")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + auto out_scale_name = op_desc->Output("OutScale")[0]; + + Node* fake_quant_in = nullptr; + Node* fake_quant_in_scale = nullptr; + for (auto* node_input : op_node->inputs) { + if (node_input->Name() == x_var_name) { + fake_quant_in = node_input; + break; + } else if (node_input->Name() == in_scale_name) { + fake_quant_in_scale = node_input; + break; + } + } + + Node* fake_quant_out = nullptr; + Node* fake_quant_out_scale = nullptr; + for (auto* node_output : op_node->outputs) { + if (node_output->Name() == out_var_name) { + fake_quant_out = node_output; + break; + } else if (node_output->Name() == out_scale_name) { + fake_quant_out_scale = node_output; + break; + } + } + + PADDLE_ENFORCE_NOT_NULL( + fake_quant_in, + platform::errors::NotFound( + "The input var [%s] of quantize op is not found.", x_var_name)); + PADDLE_ENFORCE_NOT_NULL( + fake_quant_out, + platform::errors::NotFound( + "The output var [%s] of quantize op is not found.", out_var_name)); + + std::string input_act_name = fake_quant_in->Var()->Name(); + std::string output_act_name = fake_quant_out->Var()->Name(); + auto outlinks = fake_quant_out->outputs; + for (auto* next_node : outlinks) { + if (!next_node->IsOp()) continue; + next_node->Op()->RenameInput(output_act_name, input_act_name); + IR_NODE_LINK_TO(fake_quant_in, next_node); + } + + nodes2rm->insert(op_node); + nodes2rm->insert(fake_quant_in_scale); + nodes2rm->insert(fake_quant_out); + nodes2rm->insert(fake_quant_out_scale); +} + +void 
QuantDequantMkldnnPass::CollectFakeDequantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + + Node* fake_dequant_in = nullptr; + for (auto* node_input : op_node->inputs) { + if (node_input->Name() == x_var_name) { + fake_dequant_in = node_input; + break; + } + } + + Node* fake_dequant_out = nullptr; + for (auto* node_output : op_node->outputs) { + if (node_output->Name() == out_var_name) { + fake_dequant_out = node_output; + break; + } + } + + PADDLE_ENFORCE_NOT_NULL( + fake_dequant_in, + platform::errors::NotFound( + "The input var [%s] of dequantize op is not found.", x_var_name)); + PADDLE_ENFORCE_NOT_NULL( + fake_dequant_out, + platform::errors::NotFound( + "The output var [%s] of dequantize op is not found.", out_var_name)); + + std::string input_act_name = fake_dequant_in->Var()->Name(); + std::string output_act_name = fake_dequant_out->Var()->Name(); + auto outlinks = fake_dequant_out->outputs; + for (auto* next_node : outlinks) { + next_node->Op()->RenameInput(output_act_name, input_act_name); + IR_NODE_LINK_TO(fake_dequant_in, next_node); + } + + nodes2rm->insert(op_node); + nodes2rm->insert(fake_dequant_out); +} + +void QuantDequantMkldnnPass::RemoveFakeOps( + ir::Graph* graph, + const std::unordered_set& fake_quantize_types, + const std::unordered_set& fake_dequantize_types, + const std::unordered_set& fake_quantize_dequantize_types) + const { + VLOG(3) << "remove fake quantize and dequantize ops"; + + std::unordered_set nodes2rm = {}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (fake_quantize_types.count(op_node->Name())) { + CollectFakeQuantizeOps(graph, op_node, &nodes2rm); + } else if (fake_dequantize_types.count(op_node->Name())) { + CollectFakeDequantizeOps(graph, op_node, &nodes2rm); + } else if (fake_quantize_dequantize_types.count(op_node->Name())) { + CollectFakeDequantizeOps(graph, op_node, &nodes2rm); + } + } + + GraphSafeRemoveNodes(graph, nodes2rm); +} + +void QuantDequantMkldnnPass::TransposeWeight(Tensor* input) const { + const auto in_dims = input->dims(); + std::vector out_dim_v; + std::vector axis; + for (int i = in_dims.size() - 1; i >= 0; i--) { + axis.push_back(i); + out_dim_v.push_back(in_dims[i]); + } + + const auto out_dims = phi::make_ddim(out_dim_v); + const int rank = axis.size(); + auto in_stride = phi::stride(in_dims); + auto out_stride = phi::stride(out_dims); + const int count = input->numel(); + + Tensor trans_tensor; + trans_tensor.Resize(out_dims); + float* trans_data = trans_tensor.mutable_data(platform::CPUPlace()); + float* in_data = input->mutable_data(platform::CPUPlace()); + + for (int64_t out_idx = 0; out_idx < count; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + trans_data[out_idx] = in_data[in_idx]; + } + + input->Resize(out_dims); + for (int i = 0; i < input->numel(); i++) { + in_data[i] = trans_data[i]; + } +} + +bool QuantDequantMkldnnPass::IsInt8Weight( + Node* op_node, Scope* scope, const std::string& weight_name) const { + auto* op_desc = op_node->Op(); + auto var_name = op_desc->Input(weight_name)[0]; + auto* var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + 
"The input persistable [%s] var of [%s] op is not found.", + var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + auto* weight_data = weight_tensor->data(); + bool is_int8 = true; + for (int i = 0; i < weight_tensor->numel(); i++) { + if (weight_data[i] - static_cast(weight_data[i]) != 0) { + is_int8 = false; + break; + } + } + return is_int8; +} + +void QuantDequantMkldnnPass::DequantizeOpWeights( + Node* op_node, Scope* scope, const std::string& weight_name, + const std::string& output_name, + const std::unordered_map>& + weight_thresholds) const { + auto* op_desc = op_node->Op(); + std::string weight_var_name = op_desc->Input(weight_name)[0]; + std::string output_var_name = op_desc->Output(output_name)[0]; + + std::vector scales; + auto iter = weight_thresholds.find(output_var_name); + if (iter != weight_thresholds.end()) { + scales = iter->second; + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Could not find threshold information for [%s] var, please check if " + "the model is correct.", + output_var_name)); + } + + auto* var = scope->FindVar(weight_var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The input persistable [%s] var of [%s] op is not found.", + weight_var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + const auto weight_dims = weight_tensor->dims(); + + const int size = scales.size(); + if (size == 1 || size == weight_dims[0]) { + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] /= 127; + } + + TransposeWeight(weight_tensor); + + if (size == 1) { + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] *= scales[0]; + } + } else { + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] *= scales[i % size]; + } + } + + TransposeWeight(weight_tensor); + } else if (weight_dims.size() > 1 && size == weight_dims[1]) { + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] /= 127; + } + + int step_n = 1; + for (int i = 1; i < weight_dims.size(); i++) { + step_n *= weight_dims[i]; + } + int step_c = step_n / size; + for (int i = 0; i < weight_dims[0]; i++) { + int begin_n = i * step_n; + for (int j = begin_n; j < begin_n + step_n; j++) { + for (int k = 0; k < size; k++) { + int begin_c = k * step_c; + for (int m = begin_c; m < begin_c + step_c; m++) { + weight_data[m] *= scales[k]; + } + } + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The size of weight scales vector (%d) does not " + "match the dimensions (%d) of the weights tensor %s.", + size, weight_tensor->dims().size(), weight_var_name)); + } + + weight_tensor->Resize(weight_dims); +} + +void QuantDequantMkldnnPass::DequantizeWeights( + ir::Graph* graph, Scope* scope, + const std::unordered_map>& + weight_thresholds) const { + VLOG(3) << "dequantize weight for ops which has weight"; + + if (weight_thresholds.empty()) { + VLOG(3) + << "No need to dequantize weights because weight_thresholds is empty."; + return; + } + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { + if (IsInt8Weight(op_node, scope, "Filter")) { + DequantizeOpWeights(op_node, scope, "Filter", "Output", + weight_thresholds); + } + } else if (op_node->Name() == "mul" || op_node->Name() == "matmul" || + 
op_node->Name() == "matmul_v2") { + if (IsInt8Weight(op_node, scope, "Y")) { + DequantizeOpWeights(op_node, scope, "Y", "Out", weight_thresholds); + } + } + } +} + +void QuantDequantMkldnnPass::UpdateActivations(ir::Graph* graph) const { + VLOG(3) << "update conv2d or depthwise_conv2d fused activation"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { + auto* op_desc = op_node->Op(); + if (!op_desc->HasAttr("fuse_activation")) { + std::string activation; + if (op_desc->GetAttrIfExists("fuse_relu")) { + activation = "relu"; + } else if (op_desc->GetAttrIfExists("fuse_brelu")) { + activation = "relu6"; + float alpha = 6.0; + if (op_desc->HasAttr("fuse_brelu_threshold")) { + alpha = BOOST_GET_CONST(float, + op_desc->GetAttr("fuse_brelu_threshold")); + } + op_node->Op()->SetAttr("fuse_alpha", alpha); + } + op_node->Op()->SetAttr("fuse_activation", activation); + } + } + } +} + +void QuantDequantMkldnnPass::RemoveCtrlVars(ir::Graph* graph) const { + VLOG(3) << "remove control flow variable"; + std::unordered_set nodes2rm = {}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (op_node->IsCtrlVar()) { + nodes2rm.insert(op_node); + } + } + + GraphSafeRemoveNodes(graph, nodes2rm); +} + +void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Convert paddle slim quantized model to mkldnn quantized model."; + const std::string pattern_name = "quant_dequant_mkldnn_pass"; + FusePassBase::Init(pattern_name, graph); + + const std::unordered_set skip_ops = { + "conv2d", "depthwise_conv2d", "mul", "matmul", "matmul_v2"}; + + const std::unordered_set fake_quantize_types = { + "fake_quantize_moving_average_abs_max", "fake_quantize_range_abs_max"}; + + const std::unordered_set fake_dequantize_types = { + "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; + + const std::unordered_set fake_quantize_dequantize_types = { + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_channel_wise_quantize_dequantize_abs_max"}; + + std::unordered_map> weight_thresholds{}; + std::unordered_map> var_quant_scales{}; + + auto* scope = param_scope(); + MarkSkipQuantizedOps(graph, skip_ops); + MarkSkipQuantizedPool2d(graph); + CollectInfoFromFake(graph, scope, fake_dequantize_types, &weight_thresholds); + CollectInputScalesFromFake(graph, scope, fake_quantize_types, + &var_quant_scales); + CollectOutputScalesFromAttr(graph, &var_quant_scales); + RemoveFakeOps(graph, fake_quantize_types, fake_dequantize_types, + fake_quantize_dequantize_types); + DequantizeWeights(graph, scope, weight_thresholds); + UpdateActivations(graph); + RemoveCtrlVars(graph); + + // save var_quant_scales in the first op's attr + // for compute_propagate_scales_mkldnn_pass + SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", + var_quant_scales); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(quant_dequant_mkldnn_pass, + paddle::framework::ir::QuantDequantMkldnnPass); + +REGISTER_PASS_CAPABILITY(quant_dequant_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("fc", 0) + .LE("conv2d_transpose", 2) + .EQ("fake_quantize_abs_max", 0) + .EQ("fake_quantize_range_abs_max", 0) + .EQ("fake_quantize_moving_average_abs_max", 0) + .LE("fake_channel_wise_quantize_abs_max", 1) + 
.EQ("fake_dequantize_max_abs", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h new file mode 100644 index 0000000000000..a9442f707402d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class QuantDequantMkldnnPass : public FusePassBase { + public: + QuantDequantMkldnnPass() = default; + virtual ~QuantDequantMkldnnPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void MarkSkipQuantizedOps( + ir::Graph* graph, const std::unordered_set& skip_ops) const; + + void MarkSkipQuantizedPool2d(ir::Graph* graph) const; + + void CollectInfoFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_dequantize_types, + std::unordered_map>* weight_thresholds) + const; + + void CollectInputScalesFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_quantize_types, + std::unordered_map>* var_quant_scales) + const; + + void CollectOutputScalesFromAttr( + ir::Graph* graph, + std::unordered_map>* var_quant_scales) + const; + + void CollectFakeQuantizeOps(ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const; + + void CollectFakeDequantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const; + + void RemoveFakeOps( + ir::Graph* graph, + const std::unordered_set& fake_quantize_types, + const std::unordered_set& fake_dequantize_types, + const std::unordered_set& fake_quantize_dequantize_types) + const; + + bool IsInt8Weight(Node* op_node, Scope* scope, + const std::string& weight_name) const; + + void TransposeWeight(Tensor* input) const; + + void DequantizeOpWeights( + Node* op_node, Scope* scope, const std::string& weight_name, + const std::string& output_name, + const std::unordered_map>& + weight_thresholds) const; + + void DequantizeWeights( + ir::Graph* graph, Scope* scope, + const std::unordered_map>& + weight_thresholds) const; + + void UpdateActivations(ir::Graph* graph) const; + + void RemoveCtrlVars(ir::Graph* graph) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle From a78ca1cf87079a04591b536237c0415ba8526679 Mon Sep 17 00:00:00 2001 From: Wilber Date: Sun, 10 Apr 2022 12:40:37 +0800 Subject: [PATCH 051/211] predictor support trt (#41556) --- paddle/infrt/api/CMakeLists.txt | 2 + paddle/infrt/api/infrt_api.cc | 45 ++++++++++--- paddle/infrt/api/infrt_api.h | 8 +++ paddle/infrt/api/infrt_api_test.cc.in | 43 +++++++++++++ paddle/infrt/backends/tensorrt/trt_utils.h | 3 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 3 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 2 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 5 ++ 
.../dialect/tensorrt/trt_graph_fuse_pass.h | 3 + .../dialect/tensorrt/trt_graph_split_pass.cc | 5 ++ .../dialect/tensorrt/trt_graph_split_pass.h | 3 + .../dialect/tensorrt/trt_op_converter_pass.cc | 4 ++ .../dialect/tensorrt/trt_op_converter_pass.h | 3 + .../dialect/tensorrt/trt_op_teller_pass.cc | 5 ++ .../dialect/tensorrt/trt_op_teller_pass.h | 3 + .../dialect/tensorrt/trt_type_convert_pass.cc | 2 +- .../dialect/tensorrt/trt_type_convert_pass.h | 2 +- paddle/infrt/host_context/paddle_mlir.cc | 63 +++++++++++++++---- paddle/infrt/host_context/paddle_mlir.h | 20 ++++-- paddle/infrt/kernel/phi/registry.cc | 2 +- 20 files changed, 193 insertions(+), 33 deletions(-) diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt index 27d736cfdf7aa..6d4604edee6a0 100644 --- a/paddle/infrt/api/CMakeLists.txt +++ b/paddle/infrt/api/CMakeLists.txt @@ -7,3 +7,5 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in ${CMAKE_CURRENT_ # Disable temporarily for the external-kernel's mkldnn is outdate cc_test_tiny(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) +# TODO(inference): remove after optimize weight unfold. +set_tests_properties(test_infrt_api PROPERTIES TIMEOUT 200) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 2e8b64f768f13..8b4b14a3ca08b 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -17,12 +17,14 @@ #include #include #include +#include #include +#include +#include #include #include -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" @@ -48,8 +50,16 @@ #include "paddle/infrt/kernel/test_kernels.h" #include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" + #if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) #include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #endif using namespace infrt::host_context; // NOLINT @@ -233,17 +243,34 @@ int InfRtPredictor::Init(const InfRtConfig& config) { #endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif - auto module_op = impl_->module_gen_.ImportPaddleModel(config.model_dir(), - config.param_dir()); + mlir::ModuleOp module_op; + if (config.tensorrt_enabled()) { + module_op = impl_->module_gen_.ImportPaddleModel( + config.model_dir(), config.param_dir(), false); + } else { + module_op = impl_->module_gen_.ImportPaddleModel(config.model_dir(), + config.param_dir()); + } context->loadAllAvailableDialects(); ::mlir::PassManager pm(context); - ::mlir::OpPassManager& phi_pass_manager = pm.nest<::mlir::FuncOp>(); - std::vector<::infrt::Place> valid_places = {{::infrt::TargetType::CPU, - ::infrt::PrecisionType::FLOAT32, - ::infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(CreatePhiOpCvtPass(valid_places)); - phi_pass_manager.addPass(CreateInfrtOpFusePass()); + ::mlir::OpPassManager& pass_manager = pm.nest<::mlir::FuncOp>(); + if (config.tensorrt_enabled()) { + pass_manager.addPass(::infrt::CreateInfrtWeightsUnfoldPass()); + pass_manager.addPass(::infrt::trt::CreateTrtOpTellerPass()); + 
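    // Rough shape of the TensorRT pipeline assembled in this branch, going by
    // the pass names and the pass sources later in this patch: weights are
    // unfolded first, the teller pass above marks which pd ops TensorRT can
    // handle, and the passes added below then fuse the marked ops into graph
    // regions, give sub-graphs smaller than min_subgraph_size (1 here) back to
    // the non-TRT path, convert the surviving regions to trt dialect ops,
    // rewrite tensor types for the engine, and finally run the canonicalizer.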
pass_manager.addPass(::infrt::trt::CreateTrtGraphFusePass()); + pass_manager.addPass(::infrt::trt::CreateTrtGraphSplitPass(1)); + pass_manager.addPass(::infrt::trt::CreateTrtOpConverterPass()); + pass_manager.addPass(::infrt::trt::CreateTrtTypeConvertPass()); + pass_manager.addPass(::mlir::createCanonicalizerPass()); + } else { + std::vector<::infrt::Place> valid_places = { + {::infrt::TargetType::CPU, + ::infrt::PrecisionType::FLOAT32, + ::infrt::LayoutType::NCHW}}; + pass_manager.addPass(CreatePhiOpCvtPass(valid_places)); + pass_manager.addPass(CreateInfrtOpFusePass()); + } if (mlir::failed(pm.run(module_op))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h index cf14cab3c066e..231f496bb89d1 100644 --- a/paddle/infrt/api/infrt_api.h +++ b/paddle/infrt/api/infrt_api.h @@ -26,6 +26,9 @@ class InfRtConfig { std::string param_dir_; std::vector shared_libs_; + // TODO(wilber): Design an easy-to-use interface. + bool tensorrt_enabled_{false}; + public: InfRtConfig() = default; void set_model_dir(const std::string& model_dir) { model_dir_ = model_dir; } @@ -39,6 +42,11 @@ class InfRtConfig { } const std::vector& shared_libs() const { return shared_libs_; } + // TODO(wilber): Design an easy-to-use interface. + void enable_tensorrt() { tensorrt_enabled_ = true; } + void disable_tensorrt() { tensorrt_enabled_ = false; } + bool tensorrt_enabled() const { return tensorrt_enabled_; } + virtual ~InfRtConfig() = default; }; diff --git a/paddle/infrt/api/infrt_api_test.cc.in b/paddle/infrt/api/infrt_api_test.cc.in index 6323b6a540a31..13635ddaaab2f 100644 --- a/paddle/infrt/api/infrt_api_test.cc.in +++ b/paddle/infrt/api/infrt_api_test.cc.in @@ -57,4 +57,47 @@ TEST(InfRtPredictor, predictor) { ASSERT_EQ(output->dims(), ::phi::DDim({16, 10})); } +#ifdef INFRT_WITH_TRT +TEST(InfRtPredictor, trt_predictor) { + std::vector shared_libs; + + InfRtConfig config; + config.enable_tensorrt(); + + config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel"); + config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams"); + + std::unique_ptr predictor = CreateInfRtPredictor(config); + + ::infrt::backends::CpuPhiAllocator cpu_allocator; + ::phi::DenseTensor* input = predictor->GetInput(0); + input->Resize({2, 3, 256, 256}); + input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); + auto* input_data = reinterpret_cast(input->data()); + for (int i = 0; i < input->numel(); i++) input_data[i] = 1.0; + predictor->Run(); + + // get and print output tensor + auto* output = predictor->GetOutput(0); + + ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000})); + const std::vector true_vals { + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02, + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02 + }; + + for (size_t i = 0; i < true_vals.size(); i+=100) { + CHECK_NEAR(output->data()[i*100], true_vals[i], 1e-5); + } +} +#endif + } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h index 
c23d4608bb33f..b2d5659fd2520 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -50,7 +50,8 @@ inline nvinfer1::Dims VecToDims(const std::vector& vec) { assert(false); } // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + nvinfer1::Dims dims; + dims.nbDims = std::min(static_cast(vec.size()), limit); std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); return dims; } diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index c4707c367bc80..2078ebb1442ff 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,7 +34,8 @@ def CreateHostInitedDenseTensorOp : PDT_Op<"create_host_inited_dense_tensor.f32" I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, - F32ArrayAttr:$values + F32ArrayAttr:$values, + DefaultValuedAttr:$run_once ); let results = (outs DenseTensor:$output); } diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 837ca2093747c..2682a744bb056 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -81,7 +81,7 @@ int main(int argc, char** argv) { trt_pass_manager.addPass(std::make_unique()); trt_pass_manager.addPass(std::make_unique(1)); trt_pass_manager.addPass(std::make_unique()); - trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass()); + trt_pass_manager.addPass(infrt::trt::CreateTrtTypeConvertPass()); trt_pass_manager.addPass(::mlir::createCanonicalizerPass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 55964b77e21ca..bbe9a76e87b00 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -181,5 +181,10 @@ void TRTGraphFusePass::runOnFunction() { // TODO(wilber): Implement a toposort for efficiency. // topoSortBlock(body); } + +std::unique_ptr CreateTrtGraphFusePass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index 4c7214762303c..515e73df85480 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtGraphFusePass(); + /* * trtGraphFusePass. 
* diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2136f19fd1af5..d5ce871edd1a3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -44,5 +44,10 @@ void TRTGraphSplitPass::runOnFunction() { graph_op.erase(); } } + +std::unique_ptr CreateTrtGraphSplitPass(size_t min_subgraph_size) { + return std::make_unique(min_subgraph_size); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index a71b9cb6536c5..fa101a8db027a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtGraphSplitPass(size_t min_subgraph_size); + /* * trtGraphSplitPass. * diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index e40bbd67c0b5e..6776f01e36d19 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -260,5 +260,9 @@ void TRTOpConverterPass::runOnOperation() { signalPassFailure(); } +std::unique_ptr CreateTrtOpConverterPass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 685686493c9ab..84bc719463638 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -20,6 +20,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtOpConverterPass(); + /* * trtOpConverterPass. * diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 77c22c12854c6..d7b917385cf14 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -58,5 +58,10 @@ void TRTOpTellerPass::runOnFunction() { builder.create<::infrt::ReturnOp>(loc, op->getResults()); } } + +std::unique_ptr CreateTrtOpTellerPass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index 47375d838a987..566c5a45da03a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtOpTellerPass(); + /* * trtOpTellerPass. 
* diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc index 0ed79c79db6a2..35c81d0230161 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc @@ -175,7 +175,7 @@ void TrtTypeConvertPass::runOnFunction() { namespace infrt { namespace trt { -std::unique_ptr createTrtTypeConvertPass() { +std::unique_ptr CreateTrtTypeConvertPass() { return std::make_unique(); } diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h index fbc30cdbeb767..68a15696b3e69 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h @@ -19,7 +19,7 @@ namespace infrt { namespace trt { -std::unique_ptr createTrtTypeConvertPass(); +std::unique_ptr CreateTrtTypeConvertPass(); } // namespace trt } // namespace infrt diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 8b7bbe13260ff..0264920a600ff 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -15,11 +15,13 @@ #include "paddle/infrt/host_context/paddle_mlir.h" #include +#include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { @@ -35,32 +37,40 @@ MLIRModelGenImpl::MLIRModelGenImpl() infrt::paddle::framework_proto::ProgramDesc MLIRModelGenImpl::ParsePaddleModel( const std::string &model_file) { + model_file_ = model_file; infrt::paddle::framework_proto::ProgramDesc program_proto = *infrt::paddle::LoadProgram(model_file); return program_proto; } -mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const std::string &model_dir) { +mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel(const std::string &model_dir, + bool arg_has_map) { + model_dir_ = model_dir; infrt::paddle::framework_proto::ProgramDesc program_proto = ParsePaddleModel(model_dir + "/__model__"); - return ImportPaddleModel(program_proto); + return ImportPaddleModel(program_proto, arg_has_map); } mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const std::string &model_file, const std::string ¶m_file) { + const std::string &model_file, + const std::string ¶m_file, + bool arg_has_map) { + model_file_ = model_file; + params_file_ = param_file; infrt::paddle::framework_proto::ProgramDesc program_proto = ParsePaddleModel(model_file); - return ImportPaddleModel(program_proto); + return ImportPaddleModel(program_proto, arg_has_map); } mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const infrt::paddle::framework_proto::ProgramDesc &program) { + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map) { main_block_ = program.blocks(0); - llvm::SmallVector operandTypes = GetModelInputsType(program); + llvm::SmallVector operandTypes = + GetModelInputsType(program, arg_has_map); llvm::SmallVector resultTypes = GetModelOutputsType(program); mlir::FuncOp mainFunc = UpdateModelModule(operandTypes, resultTypes); - UpdateModelParams(program, &mainFunc); + UpdateModelParams(program, &mainFunc, arg_has_map); UpdateModelOps(program); UpdateModelOutputs(program); return module_; @@ -83,9 
+93,12 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule( } llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( - const infrt::paddle::framework_proto::ProgramDesc &program) { + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map) { llvm::SmallVector operandTypes; - operandTypes.push_back(infrt::phi::DenseTensorMapType::get(context_)); + if (arg_has_map) { + operandTypes.push_back(infrt::phi::DenseTensorMapType::get(context_)); + } for (auto &op_desc : main_block_.ops()) { if (op_desc.type() != "feed") continue; for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { @@ -155,9 +168,14 @@ void MLIRModelGenImpl::UpdateModelOps( void MLIRModelGenImpl::UpdateModelParams( const infrt::paddle::framework_proto::ProgramDesc &program, - mlir::FuncOp *mainFunc) { + mlir::FuncOp *mainFunc, + bool arg_has_map) { // update input vars - int input_index = 1; + int input_index; + if (arg_has_map) + input_index = 1; + else + input_index = 0; for (auto &op_desc : main_block_.ops()) { if (op_desc.type() == "feed") { for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { @@ -170,9 +188,28 @@ void MLIRModelGenImpl::UpdateModelParams( } } } + ::mlir::Value map; + if (arg_has_map) { + map = mainFunc->getArgument(0); + } else { + builder_.setInsertionPointToStart(&mainFunc->body().front()); + if (!model_dir_.empty()) { + auto load_op = builder_.create<::infrt::phi::LoadParamsOp>( + mlir::UnknownLoc::get(context_), + ::infrt::phi::DenseTensorMapType::get(context_), + builder_.getStringAttr(model_dir_)); + map = load_op.out(); + } else if (!model_file_.empty()) { + auto load_op = builder_.create<::infrt::phi::LoadCombinedParamsOp>( + mlir::UnknownLoc::get(context_), + ::infrt::phi::DenseTensorMapType::get(context_), + builder_.getStringAttr(model_file_), + builder_.getStringAttr(params_file_)); + map = load_op.out(); + } + } // update persistable tensors - ::mlir::Value map = mainFunc->getArgument(0); for (int i = 0; i < main_block_.vars_size(); i++) { auto var_desc = main_block_.vars(i); if (params_map_.find(var_desc.name()) != params_map_.end()) continue; diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 3d79d608e702d..57bdc1b48578b 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -37,8 +37,10 @@ class MLIRModelGenImpl { public: MLIRModelGenImpl(); mlir::ModuleOp ImportPaddleModel(const std::string &model_file, - const std::string ¶m_file); - mlir::ModuleOp ImportPaddleModel(const std::string &model_dir); + const std::string ¶m_file, + bool arg_has_map = true); + mlir::ModuleOp ImportPaddleModel(const std::string &model_dir, + bool arg_has_map = true); private: // parse paddle model file @@ -47,11 +49,13 @@ class MLIRModelGenImpl { // convert paddle model proto into paddle dialect module mlir::ModuleOp ImportPaddleModel( - const infrt::paddle::framework_proto::ProgramDesc &program); + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map); // get inputs and outputs info from program_desc llvm::SmallVector GetModelInputsType( - const infrt::paddle::framework_proto::ProgramDesc &program); + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map); llvm::SmallVector GetModelOutputsType( const infrt::paddle::framework_proto::ProgramDesc &program); // create main function module @@ -63,7 +67,8 @@ class MLIRModelGenImpl { // convert persistable params and inputs variable into mlir domain void UpdateModelParams( 
const infrt::paddle::framework_proto::ProgramDesc &program, - mlir::FuncOp *mainFunc); + mlir::FuncOp *mainFunc, + bool arg_has_map); // register model outpus into params_map_ void UpdateModelOutputs( const infrt::paddle::framework_proto::ProgramDesc &program); @@ -80,11 +85,16 @@ class MLIRModelGenImpl { void RegisterOpOutputVars(const infrt::paddle::framework_proto::OpDesc &op_, mlir::Operation *mlir_op_); + private: mlir::MLIRContext *context_; mlir::OpBuilder builder_; mlir::ModuleOp module_; infrt::paddle::framework_proto::BlockDesc main_block_; + std::string model_dir_{}; + std::string model_file_{}; + std::string params_file_{}; + std::map params_map_; }; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 928209ab182e6..848ff28faffc7 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -46,7 +46,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel( "phi_dt.create_host_inited_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::CreateHostInitedDenseTensorF32), - {"dims", "lod", "layout", "values"}); + {"dims", "lod", "layout", "values", "run_once"}); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32), From c00d869b48847900d79eed0d4209c7ab7b43f92a Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Sun, 10 Apr 2022 14:14:17 +0800 Subject: [PATCH 052/211] add mkldnn compute_propagate_scales int8 pass (#41592) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../compute_propagate_scales_mkldnn_pass.cc | 438 ++++++++++++++++++ .../compute_propagate_scales_mkldnn_pass.h | 92 ++++ 3 files changed, 531 insertions(+) create mode 100644 paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4ee0b08375d2e..834a2c953eab8 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -141,6 +141,7 @@ if(WITH_MKLDNN) pass_library(multi_gru_fuse_pass inference DIR mkldnn) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) + pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn) endif() if(WITH_IPU) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc new file mode 100644 index 0000000000000..d7d0b988b551e --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -0,0 +1,438 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
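Every scale this pass computes reduces to a per-row or per-column max-abs over a 2-D view of a weight, with the reciprocal used as the quantization scale and inf/NaN results clamped to zero. Below is a framework-free sketch of that kernel, mirroring the GetScales method later in this file but written against a plain std::vector so it can be compiled and tested in isolation; it is an illustration, not code from the patch.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// data is a rows x cols matrix stored row-major.  axis == 0 yields one scale
// per column, any other axis one scale per row, matching the convention of
// ComputePropagateScalesMkldnnPass::GetScales.
std::vector<float> MaxAbsScalesSketch(const std::vector<float>& data,
                                      int rows, int cols, int axis) {
  const int n = (axis == 0) ? cols : rows;
  const int m = (axis == 0) ? rows : cols;
  std::vector<float> scales;
  scales.reserve(n);
  for (int i = 0; i < n; ++i) {
    float max_abs = FLT_MIN;
    for (int j = 0; j < m; ++j) {
      const int idx = (axis == 0) ? j * cols + i : i * cols + j;
      max_abs = std::max(max_abs, std::abs(data[idx]));
    }
    float scale = 1.0f / max_abs;
    if (std::isinf(scale) || std::isnan(scale)) scale = 0.0f;
    scales.push_back(scale);
  }
  return scales;
}

The real GetScales additionally checks that the input tensor is rank 2 and that axis < 2 before reducing.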
+ +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ComputePropagateScalesMkldnnPass::GetTensorFromVector( + const std::vector& data_v, Tensor* tensor) const { + const int size = static_cast(data_v.size()); + auto* data = tensor->mutable_data({size}, platform::CPUPlace()); + for (int i = 0; i < size; i++) { + data[i] = data_v[i]; + } +} + +void ComputePropagateScalesMkldnnPass::GetQuantInfo( + ir::Graph* graph, StringPairMap* var_quant_scales) const { + std::unordered_map> info_map{}; + GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); + + for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { + Tensor tensor; + GetTensorFromVector(iter->second, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(iter->first, pair)); + } +} + +std::vector ComputePropagateScalesMkldnnPass::GetScales(Tensor* tensor, + int axis) const { + PADDLE_ENFORCE_LT(axis, 2, + platform::errors::InvalidArgument( + "The input axis is required to be less than 2.")); + auto* data = tensor->data(); + const auto dims = tensor->dims(); + PADDLE_ENFORCE_EQ(dims.size(), 2, + platform::errors::InvalidArgument( + "The input tensor's rank is required to be 2.")); + + const int rows = dims.at(0); + const int columns = dims.at(1); + std::vector scales; + if (axis == 0) { + for (int i = 0; i < columns; i++) { + float max_value = FLT_MIN; + for (int j = 0; j < rows; j++) { + max_value = std::max(max_value, std::abs(data[i + j * columns])); + } + max_value = 1.0 / max_value; + if (std::isinf(max_value) || std::isnan(max_value)) { + max_value = 0.0; + } + scales.push_back(max_value); + } + } else { + for (int i = 0; i < rows; i++) { + float max_value = FLT_MIN; + for (int j = i * columns; j < (i + 1) * columns; j++) { + max_value = std::max(max_value, std::abs(data[j])); + } + max_value = 1.0 / max_value; + if (std::isinf(max_value) || std::isnan(max_value)) { + max_value = 0.0; + } + scales.push_back(max_value); + } + } + return scales; +} + +void ComputePropagateScalesMkldnnPass::ComputeVarScales( + ir::Graph* graph, Scope* scope, const std::unordered_set& ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (ops.count(op_desc->Type())) { + auto var_name = op_desc->Input(weight_name)[0]; + auto* var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The input persistable var [%s] of [%s] op is not found.", + var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + const auto dims = weight_tensor->dims(); + int volume = 1; + for (int i = 1; i < dims.size(); i++) { + volume *= dims[i]; + } + + Tensor tmp_tensor; + std::vector reshape_dims = {dims[0], volume}; + tmp_tensor.Resize(phi::make_ddim(reshape_dims)); + auto* weight_data = weight_tensor->data(); + auto* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + tmp_data[i] = std::abs(weight_data[i]); + } + + auto scales_v = GetScales(&tmp_tensor, axis); + Tensor tensor; + 
GetTensorFromVector(scales_v, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(var_name, pair)); + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeSingleGruWeightScales( + Scope* scope, const std::string& wx_var_name, + const std::string& wh_var_name, Tensor* tensor) const { + auto* wx_var = scope->FindVar(wx_var_name); + PADDLE_ENFORCE_NOT_NULL( + wx_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wx_var_name)); + auto* wh_var = scope->FindVar(wh_var_name); + PADDLE_ENFORCE_NOT_NULL( + wh_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wh_var_name)); + + const auto* wx_tensor = wx_var->GetMutable(); + const auto* wh_tensor = wh_var->GetMutable(); + const int OC = wh_tensor->dims()[0]; + std::vector scale_ur(2 * OC); + std::vector scale_o(OC); + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < 2 * OC; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale_ur[col_id] = abs_value; + } else { + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + } + } + + for (int i = 0; i < 2 * OC * OC; i++) { + int col_id = i % (2 * OC); + auto abs_value = std::abs(wh_tensor->data()[i]); + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 2 * OC; col_id < wx_tensor->dims()[1]; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale_o[col_id % OC] = abs_value; + } else { + if (abs_value > scale_o[col_id]) scale_o[col_id % OC] = abs_value; + } + } + } + + for (int i = 2 * OC * OC; i < OC * wh_tensor->dims()[1]; i++) { + int col_id = i % OC; + auto abs_value = std::abs(wh_tensor->data()[i]); + if (abs_value > scale_o[col_id]) scale_o[col_id] = abs_value; + } + + scale_ur.insert(scale_ur.end(), scale_o.begin(), scale_o.end()); + transform(scale_ur.begin(), scale_ur.end(), scale_ur.begin(), + [](float c) { return 1 / c; }); + GetTensorFromVector(scale_ur, tensor); +} + +void ComputePropagateScalesMkldnnPass::ComputeGruWeightScales( + ir::Graph* graph, Scope* scope, const std::string& wx_name, + const std::string& wh_name, StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->Type() == "fusion_gru" || op_desc->Type() == "multi_gru") { + auto wx_var_names = op_desc->Input(wx_name); + auto wh_var_names = op_desc->Input(wh_name); + const int wx_names_size = static_cast(wx_var_names.size()); + const int wh_names_size = static_cast(wh_var_names.size()); + PADDLE_ENFORCE_EQ( + wx_names_size, wh_names_size, + platform::errors::Fatal("Mismatch in number of weights inputs (%d " + "for WeightX vs. 
%d for WeightH).", + wx_names_size, wh_names_size)); + for (int i = 0; i < wx_names_size; i++) { + auto wh_var_name = wh_var_names[i]; + auto wx_var_name = wx_var_names[i]; + Tensor tensor; + ComputeSingleGruWeightScales(scope, wx_var_name, wh_var_name, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(wx_var_name, pair)); + } + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeSingleLstmWeightScales( + Scope* scope, const std::string& wx_var_name, + const std::string& wh_var_name, Tensor* tensor) const { + auto* wx_var = scope->FindVar(wx_var_name); + PADDLE_ENFORCE_NOT_NULL( + wx_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wx_var_name)); + auto* wh_var = scope->FindVar(wh_var_name); + PADDLE_ENFORCE_NOT_NULL( + wh_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wh_var_name)); + + const auto* wx_tensor = wx_var->GetMutable(); + const auto* wh_tensor = wh_var->GetMutable(); + std::vector scale(wx_tensor->dims()[1]); + + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < wx_tensor->dims()[1]; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale[col_id] = abs_value; + } else { + if (abs_value > scale[col_id]) scale[col_id] = abs_value; + } + } + } + for (int row_id = 0; row_id < wh_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < wh_tensor->dims()[1]; col_id++) { + int idx = (row_id * wh_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wh_tensor->data()[idx]); + if (abs_value > scale[col_id]) scale[col_id] = abs_value; + } + } + transform(scale.begin(), scale.end(), scale.begin(), + [](float c) { return 1 / c; }); + GetTensorFromVector(scale, tensor); +} + +void ComputePropagateScalesMkldnnPass::ComputeLstmWeightScales( + ir::Graph* graph, Scope* scope, const std::string& wx_name, + const std::string& wh_name, StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->Type() == "fusion_lstm") { + auto wx_var_names = op_desc->Input(wx_name); + auto wh_var_names = op_desc->Input(wh_name); + const int wx_names_size = static_cast(wx_var_names.size()); + const int wh_names_size = static_cast(wh_var_names.size()); + PADDLE_ENFORCE_EQ( + wx_names_size, wh_names_size, + platform::errors::Fatal("Mismatch in number of weights inputs (%d " + "for WeightX vs. 
%d for WeightH).", + wx_names_size, wh_names_size)); + + for (int i = 0; i < wx_names_size; i++) { + auto wh_var_name = wh_var_names[i]; + auto wx_var_name = wx_var_names[i]; + Tensor tensor; + ComputeSingleLstmWeightScales(scope, wx_var_name, wh_var_name, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(wx_var_name, pair)); + } + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeWeightScales( + ir::Graph* graph, Scope* scope, StringPairMap* var_quant_scales) const { + ComputeVarScales(graph, scope, {"conv2d", "depthwise_conv2d"}, "Filter", 1, + var_quant_scales); + ComputeVarScales(graph, scope, {"fc"}, "W", 0, var_quant_scales); + ComputeVarScales(graph, scope, {"fusion_gru", "multi_gru"}, "WeightH", 0, + var_quant_scales); + ComputeVarScales(graph, scope, {"fusion_lstm"}, "WeightH", 0, + var_quant_scales); + ComputeGruWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales); + ComputeLstmWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales); +} + +void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale( + Node* op_node, const std::string& input_name, + const std::string& output_name, StringPairMap* var_quant_scales) const { + auto iter = var_quant_scales->find(output_name); + if (iter != var_quant_scales->end()) { + auto pair = iter->second; + const auto tensor = pair.second; + + const auto scale = BOOST_GET_CONST(float, op_node->Op()->GetAttr("scale")); + Tensor tmp_tensor; + tmp_tensor.Resize(tensor.dims()); + auto* data = tmp_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < tensor.numel(); i++) { + data[i] = data[i] * scale; + } + + auto new_pair = std::make_pair(pair.first, tmp_tensor); + var_quant_scales->insert(std::make_pair(input_name, new_pair)); + } +} + +std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const { + std::unordered_set waiting_for_scale{}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + const auto op_name = op_node->Name(); + if (scale_immutable_ops.count(op_name)) { + std::string input_name; + if (op_name == "slice") { + input_name = op_node->Op()->Input("Input")[0]; + } else { + input_name = op_node->Op()->Input("X")[0]; + } + + const std::string output_name = op_node->Op()->Output("Out")[0]; + auto in_iter = var_quant_scales->find(input_name); + auto out_iter = var_quant_scales->find(output_name); + if (in_iter == var_quant_scales->end() && + out_iter == var_quant_scales->end()) { + waiting_for_scale.insert(input_name); + waiting_for_scale.insert(output_name); + } else if (in_iter != var_quant_scales->end()) { + out_iter->second = in_iter->second; + } else if (out_iter != var_quant_scales->end()) { + in_iter->second = out_iter->second; + } + } else if (op_name == "scale") { + const std::string output_name = op_node->Op()->Output("Out")[0]; + auto out_iter = var_quant_scales->find(output_name); + if (out_iter != var_quant_scales->end()) { + const std::string input_name = op_node->Op()->Input("X")[0]; + UpdateScaleOpInScale(op_node, input_name, output_name, + var_quant_scales); + } + } + } + return waiting_for_scale; +} + +void ComputePropagateScalesMkldnnPass::PropagateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const { + auto waiting_for_scale = + UpdateScales(graph, var_quant_scales, scale_immutable_ops); + 
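  // UpdateScales walks the graph once: for each scale-immutable op
  // (transpose2, reshape2, pool2d, slice, nearest_interp, nearest_interp_v2,
  // per scale_immutable_ops in ApplyImpl) it copies a known scale from input
  // to output or the other way round, handles "scale" ops via
  // UpdateScaleOpInScale, and returns the variable names still waiting for a
  // scale.  The loop below simply re-runs it until that set is empty or stops
  // changing, i.e. until the propagation reaches a fixed point.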
std::unordered_set waiting_for_scale_prev{}; + while (waiting_for_scale.size() != 0 && + waiting_for_scale != waiting_for_scale_prev) { + waiting_for_scale_prev.clear(); + waiting_for_scale_prev.insert(waiting_for_scale.begin(), + waiting_for_scale.end()); + waiting_for_scale = + UpdateScales(graph, var_quant_scales, scale_immutable_ops); + } +} + +void ComputePropagateScalesMkldnnPass::ConvertStringPairMap( + const StringPairMap& var_quant_scales, + std::unordered_map>* info_map) const { + for (auto iter = var_quant_scales.begin(); iter != var_quant_scales.end(); + iter++) { + auto* data = iter->second.second.data(); + std::vector data_v; + for (int i = 0; i < iter->second.second.numel(); i++) { + data_v.push_back(data[i]); + } + + info_map->insert(std::make_pair(iter->first, data_v)); + } +} + +void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Convert paddle model to mkldnn quantized model."; + const std::string pattern_name = "compute_propagate_scales_mkldnn_pass"; + FusePassBase::Init(pattern_name, graph); + + const std::unordered_set scale_immutable_ops = { + "transpose2", "reshape2", "pool2d", + "slice", "nearest_interp", "nearest_interp_v2"}; + + StringPairMap var_quant_scales{}; + + auto* scope = param_scope(); + GetQuantInfo(graph, &var_quant_scales); + ComputeWeightScales(graph, scope, &var_quant_scales); + PropagateScales(graph, &var_quant_scales, scale_immutable_ops); + + // save var_quant_scales in the first op's attr + // for cpu_quantize_pass + std::unordered_map> info_map; + ConvertStringPairMap(var_quant_scales, &info_map); + SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", info_map); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(compute_propagate_scales_mkldnn_pass, + paddle::framework::ir::ComputePropagateScalesMkldnnPass); + +REGISTER_PASS_CAPABILITY(compute_propagate_scales_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("fc", 0) + .LE("conv2d_transpose", 2) + .EQ("fake_quantize_abs_max", 0) + .EQ("fake_quantize_range_abs_max", 0) + .EQ("fake_quantize_moving_average_abs_max", 0) + .LE("fake_channel_wise_quantize_abs_max", 1) + .EQ("fake_dequantize_max_abs", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h new file mode 100644 index 0000000000000..b0076c1b38cd4 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
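A note on the axis convention used by ComputeWeightScales/ComputeVarScales in the implementation above: each weight is first flattened to [dims[0], product of the remaining dims] before GetScales runs, so conv2d/depthwise_conv2d filters ("Filter", axis 1) get one scale per output channel, while fc weights ("W", axis 0) and the recurrent "WeightH" inputs get one scale per column. A small worked example with made-up values:

// conv2d Filter of shape [2, 3, 1, 1]  ->  flattened to a [2, 3] matrix:
//   row 0: { 1.0, -2.0,   4.0  }  -> max |w| = 4.0 -> scale = 1 / 4.0 = 0.25
//   row 1: { 0.5,  0.25, -0.125}  -> max |w| = 0.5 -> scale = 1 / 0.5 = 2.0
// axis = 1 (one scale per row, i.e. per output channel)  =>  {0.25, 2.0}
// axis = 0 on the same matrix would instead give one scale per column:
//   {1 / 1.0, 1 / 2.0, 1 / 4.0} = {1.0, 0.5, 0.25}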
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringPairMap = std::unordered_map>; + +class ComputePropagateScalesMkldnnPass : public FusePassBase { + public: + ComputePropagateScalesMkldnnPass() = default; + virtual ~ComputePropagateScalesMkldnnPass() {} + +#ifdef PADDLE_WITH_TESTING + friend class ComputePropagateScalesMkldnnPassTest; +#endif + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void GetTensorFromVector(const std::vector& data_v, + Tensor* tensor) const; + + void GetQuantInfo(ir::Graph* graph, StringPairMap* var_quant_scales) const; + + std::vector GetScales(Tensor* tensor, int axis) const; + + void ComputeVarScales(ir::Graph* graph, Scope* scope, + const std::unordered_set& ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const; + + void ComputeSingleGruWeightScales(Scope* scope, + const std::string& wx_var_name, + const std::string& wh_var_name, + Tensor* tensor) const; + + void ComputeGruWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const; + + void ComputeSingleLstmWeightScales(Scope* scope, + const std::string& wx_var_name, + const std::string& wh_var_name, + Tensor* tensor) const; + + void ComputeLstmWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const; + + void ComputeWeightScales(ir::Graph* graph, Scope* scope, + StringPairMap* var_quant_scales) const; + + void UpdateScaleOpInScale(Node* op_node, const std::string& input_name, + const std::string& output_name, + StringPairMap* var_quant_scales) const; + + std::unordered_set UpdateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const; + + void PropagateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const; + + void ConvertStringPairMap( + const StringPairMap& var_quant_scales, + std::unordered_map>* info_map) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle From 81c40722869935d6e897f4b1aeb6e6f67606188a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Sun, 10 Apr 2022 17:36:18 +0800 Subject: [PATCH 053/211] [Yaml] Modify api and add unittests for full api final state. (#41437) * full api fix * when out is None, go old dygraph mode * fix * add name for buffer * fix by code review * fix * by static check --- python/paddle/fluid/dygraph/layers.py | 2 + python/paddle/fluid/layers/tensor.py | 48 ++++++++++------ .../fluid/tests/unittests/test_full_op.py | 56 +++++++++++++++++++ 3 files changed, 89 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 54a245aab81c9..193025b1864ab 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1169,6 +1169,8 @@ def _remove_if_exist(*dicts): # add a persistable buffer. 
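                # The two lines added below give any buffer registered without
                # a name a generated unique one ('_buffers_<attr-name>' plus a
                # numeric suffix, via unique_name.generate), which is the
                # "add name for buffer" item in this change's description.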
if name not in self._buffers: self._non_persistable_buffer_names_set.add(name) + if not value.name: + value.name = unique_name.generate('_buffers_' + name) _buffers[name] = value elif _buffers is not None and name in _buffers: # Note(Aurelius84): In Dy2stat, the value of the Buffer may be modified in diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index e302371988739..28e0d4eff377f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -21,7 +21,7 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..initializer import Initializer -from ..framework import _current_expected_place, convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, device_guard, _in_legacy_dygraph, in_dygraph_mode +from ..framework import _current_expected_place, convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, device_guard, _in_legacy_dygraph, in_dygraph_mode, _get_paddle_place from ..framework import Variable from ..initializer import Constant from ..core import VarDesc @@ -751,22 +751,36 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs['value'] = float(value) if _non_static_mode(): - shape = utils.convert_shape_to_list(shape) - if out is None: - out = _varbase_creator(dtype=dtype) - - if isinstance(value, Variable): - if dtype in ['uint8', 'int16', 'int32', 'int64']: - attrs['str_value'] = str(int(value.numpy().item(0))) - else: - attrs['str_value'] = str(float(value.numpy().item(0))) - - _C_ops.fill_constant(out, 'value', - float(value), 'force_cpu', force_cpu, 'dtype', - out.dtype, 'str_value', attrs['str_value'], - 'shape', shape) - out.stop_gradient = True - return out + if out is None and in_dygraph_mode(): + #Currently, final state mode don't support out is None. 
+ place = _current_expected_place() + if force_cpu: + place = core.CPUPlace() + + shape = utils.convert_shape_to_list(shape) + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + out = _C_ops.final_state_full(shape, float(value), dtype, place) + out.stop_gradient = True + return out + + else: + shape = utils.convert_shape_to_list(shape) + if out is None: + out = _varbase_creator(dtype=dtype) + + if isinstance(value, Variable): + if dtype in ['uint8', 'int16', 'int32', 'int64']: + attrs['str_value'] = str(int(value.numpy().item(0))) + else: + attrs['str_value'] = str(float(value.numpy().item(0))) + + _C_ops.fill_constant(out, 'value', + float(value), 'force_cpu', force_cpu, 'dtype', + out.dtype, 'str_value', attrs['str_value'], + 'shape', shape) + out.stop_gradient = True + return out helper = LayerHelper("fill_constant", **locals()) inputs = {} diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index c508d56c29a43..2442f2b681554 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid import paddle from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import _test_eager_guard # Test python API @@ -75,6 +76,61 @@ def test_api(self): assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32")) assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32")) + def test_api_eager(self): + with fluid.dygraph.base.guard(): + with _test_eager_guard(): + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) + + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + out_1 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.1) + + out_2 = paddle.full( + shape=[1, positive_2_int32.item()], + dtype="float32", + fill_value=1.1) + + out_3 = paddle.full( + shape=[1, positive_2_int64.item()], + dtype="float32", + fill_value=1.1) + + out_4 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.2) + + out_5 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.1) + + out_6 = paddle.full( + shape=[1, 2], dtype=np.float32, fill_value=1.1) + + val = fluid.layers.fill_constant( + shape=[1], dtype=np.float32, value=1.1) + out_7 = paddle.full( + shape=[1, 2], dtype=np.float32, fill_value=val) + + assert np.array_equal( + out_1, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_2, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_3, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_4, np.full( + [1, 2], 1.2, dtype="float32")) + assert np.array_equal( + out_5, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_6, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_7, np.full( + [1, 2], 1.1, dtype="float32")) + class TestFullOpError(unittest.TestCase): def test_errors(self): From 91d6f47ac5df33b9158a4047f0896a06bd3d2490 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 10 Apr 2022 20:11:46 +0800 Subject: [PATCH 054/211] fix warpctc grad kernel dep eror (#41598) --- paddle/phi/kernels/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d4b832cef0bd2..937024d450a36 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -62,7 +62,7 @@ 
kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_re kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) -kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} sequence_padding sequence_scale) +kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) From c1394c6ae1a0293ca7800d6614f955f09c52785e Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Sun, 10 Apr 2022 21:59:09 +0800 Subject: [PATCH 055/211] [KP]fix bug when TruncatedNormal cannot fall back in cpu (#41565) * [KP]fix bug when TruncatedNormal cannot fall back in cpu * delete useless comment * delete useless comment --- paddle/fluid/framework/operator.cc | 7 +++++-- paddle/fluid/imperative/prepared_operator.cc | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6af07caaf88b2..e6577f662ae7b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1333,7 +1333,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // NOTE(Liu-xiandong): Determine whether the selected kernel is valid // If not, use the kernel registered in fluid. And if the fluid do not // contains the related heterogeneous kernel, use phi CPU kernel. 
-#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) +#if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(kernel_type_->place_) && !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || @@ -1373,7 +1373,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + || (is_xpu_unsupport && !is_xpu_kp_support) +#endif + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index b56d113937d69..0ad5e808b1d1a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -263,7 +263,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + || (is_xpu_unsupport && !is_xpu_kp_support) +#endif + ) { if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); From 795d7121b19da034a5cc3358b23097338fd2d6d1 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 11 Apr 2022 10:23:37 +0800 Subject: [PATCH 056/211] fix some ops (#41577) --- paddle/phi/kernels/cpu/size_kernel.cc | 1 + paddle/phi/kernels/gpu/cumsum_kernel.cu | 23 +++++++++++++---------- paddle/phi/kernels/gpu/size_kernel.cu | 1 + python/paddle/nn/functional/loss.py | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc index ff34ef26f6bd3..71ebf9cdc09f7 100644 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size, CPU, ALL_LAYOUT, phi::SizeKernel, + int16_t, int, int64_t, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu index a253e6f4ad290..e04f2b5f87658 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -222,25 +222,28 @@ void CumsumKernel(const Context& dev_ctx, // Use thrust for parallel acceleration when the input size is equal to the // length of the ‘axis’ dimension. 
if (size == out_dims[axis]) { +#ifdef __HIPCC__ + const auto& policy = thrust::hip::par.on(dev_ctx.stream()); +#else + const auto& policy = thrust::cuda::par.on(dev_ctx.stream()); +#endif if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); + thrust::reverse_iterator> reversed_in( + thrust::device_pointer_cast(in_data) + size); + thrust::reverse_iterator> reversed_out( + thrust::device_pointer_cast(out_data) + size); if (exclusive) { thrust::exclusive_scan( - thrust::device, vec.rbegin(), vec.rend(), out_data); + policy, reversed_in, reversed_in + size, reversed_out); } else { thrust::inclusive_scan( - thrust::device, vec.rbegin(), vec.rend(), out_data); + policy, reversed_in, reversed_in + size, reversed_out); } - thrust::reverse(thrust::device, out_data, out_data + size); } else { if (exclusive) { - thrust::exclusive_scan( - thrust::device, in_data, in_data + size, out_data); + thrust::exclusive_scan(policy, in_data, in_data + size, out_data); } else { - thrust::inclusive_scan( - thrust::device, in_data, in_data + size, out_data); + thrust::inclusive_scan(policy, in_data, in_data + size, out_data); } } return; diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu index 17a39944eb04f..7051fb78c7587 100644 --- a/paddle/phi/kernels/gpu/size_kernel.cu +++ b/paddle/phi/kernels/gpu/size_kernel.cu @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size, GPU, ALL_LAYOUT, phi::SizeKernel, + int16_t, int, int64_t, phi::dtype::float16, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index fb9c22edc65ed..62f034c7b4149 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1795,7 +1795,7 @@ def cross_entropy(input, # 2. 
else # numerator: loss's weighted sum # denominator: cal the sum of weight where the sample's class_index!=ignore_index - if ignore_index != -100: + if ignore_index >= 0: out_sum = _C_ops.reduce_sum(out, 'reduce_all', True) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index From 9107dc67fd9a8aabdafecb14796df406309e6178 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 11 Apr 2022 10:23:58 +0800 Subject: [PATCH 057/211] Switch test_transformer to eager mode and fix roll error (#41548) --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- python/paddle/tensor/manipulation.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index f046c7b73927e..ddc959a29a2ef 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -6,7 +6,7 @@ set(DY2ST_EAGER_TEST_ENVS ${GC_ENVS} FLAGS_enable_eager_mode=1) set(TEST_EAGER_OPS test_bmn test_break_continue test_ifelse test_loop test_mnist_amp test_mnist_pure_fp16 test_mobile_net test_program_translator test_ptb_lm test_reinforcement_learning test_resnet test_resnet_amp test_resnet_pure_fp16 test_se_resnet test_sentiment test_seq2seq - test_tsm test_word2vec test_yolov3 test_bert test_cycle_gan test_lstm test_simnet) + test_tsm test_word2vec test_yolov3 test_bert test_cycle_gan test_lstm test_simnet test_transformer) list(REMOVE_ITEM TEST_OPS test_lac) # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will # be removed and will cause some random failed in multi-thread. diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index d8021f36c211c..389b5dbd7dbec 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -784,6 +784,8 @@ def roll(x, shifts, axis=None, name=None): axis = [] if in_dygraph_mode(): + if isinstance(shifts, paddle.Tensor): + shifts = shifts.cpu() return _C_ops.final_state_roll(x, shifts, axis) if _in_legacy_dygraph(): From cd2a4cdf4762751431f3469c72922c1b6ff326c8 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 11 Apr 2022 10:49:14 +0800 Subject: [PATCH 058/211] [Yaml] add yaml for Uniform random and add unit test. (#41517) * gather op * add mod * [Yaml] final state for uniform and uniform_random --- paddle/fluid/operators/uniform_random_op.cc | 76 ++----------------- paddle/phi/infermeta/nullary.cc | 12 +++ paddle/phi/infermeta/nullary.h | 7 ++ .../tests/unittests/test_uniform_random_op.py | 18 +++++ python/paddle/tensor/random.py | 9 ++- python/paddle/utils/code_gen/api.yaml | 12 +++ 6 files changed, 64 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 1c22e60fa87aa..55c24e213d58b 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -16,9 +16,11 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -122,74 +124,6 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "UniformRandomOp"); - - PADDLE_ENFORCE_LT( - ctx->Attrs().Get("min"), ctx->Attrs().Get("max"), - platform::errors::InvalidArgument( - "The uniform_random's min must less then max. But received min = " - "%f great than or equal max = %f.", - ctx->Attrs().Get("min"), ctx->Attrs().Get("max"))); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_num"), 0, - platform::errors::InvalidArgument( - "The uniform_random's diag_num must greater than or " - "equal 0. But recevied diag_num (%d) < 0.", - ctx->Attrs().Get("diag_num"))); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_step"), 0, - platform::errors::InvalidArgument( - "The uniform_random's diag_step must greater than or " - "equal 0. But recevied diag_step (%d) < 0.", - ctx->Attrs().Get("diag_step"))); - - if (ctx->HasInputs("ShapeTensorList")) { - // top prority shape - auto inputs_name = ctx->Inputs("ShapeTensorList"); - PADDLE_ENFORCE_GT(inputs_name.size(), 0, - platform::errors::InvalidArgument( - "Input(ShapeTensorList)'size of " - "Op(uniform_random) can't be zero." - "Please check the Attr(shape)'s size of" - "Op(fluid.layers.uniform_random).)")); - auto out_dims = std::vector(inputs_name.size(), -1); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - - return; - } - auto &shape = ctx->Attrs().Get>("shape"); - if (ctx->HasInput("ShapeTensor") && shape.empty()) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - PADDLE_ENFORCE_EQ( - shape_dims.size(), 1, - platform::errors::InvalidArgument( - "ShapeError: Input(ShapeTensor)' dimension size of " - "Op(uniform_random) must be 1." 
- "But received ShapeTensor's dimensions = %d, shape = [%s]", - shape_dims.size(), shape_dims)); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - auto out_dims = phi::make_ddim(vec_dims); - ctx->SetOutputDim("Out", out_dims); - return; - } - - PADDLE_ENFORCE_EQ(shape.empty(), false, - platform::errors::InvalidArgument( - "if there is no Input(ShapeTensorList) and no " - "Input(ShapeTensor),the " - "attr(shape) information must " - "be set by Attr(shape).")); - std::vector tensor_shape; - tensor_shape.reserve(shape.size()); - for (auto dim : shape) { - tensor_shape.push_back(static_cast(dim)); - } - ctx->SetOutputDim("Out", phi::make_ddim(tensor_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -274,12 +208,16 @@ class UniformRandomOpVarTypeInference : public framework::VarTypeInference { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(uniform_random, UniformRandomInferShapeFunctor, + PD_INFER_META(phi::UniformRandomInferMeta)); + REGISTER_OPERATOR( uniform_random, paddle::operators::UniformRandomOp, paddle::operators::UniformRandomOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::operators::UniformRandomOpVarTypeInference); + paddle::operators::UniformRandomOpVarTypeInference, + UniformRandomInferShapeFunctor); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index f76e7910d77b5..3a99103eda5c2 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -63,6 +63,18 @@ void RandpermInferMeta(int n, DataType dtype, MetaTensor* out) { out->set_dtype(dtype); } +void UniformRandomInferMeta(const IntArray& shape, + DataType dtype, + float min, + float max, + int seed, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape.GetData()); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + void RandintInferMeta( int low, int high, const IntArray& shape, DataType dtype, MetaTensor* out) { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index f84ac01d002d3..8d952d842c0c4 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -65,4 +65,11 @@ void TruncatedGaussianRandomInferMeta(const std::vector& shape, DataType dtype, MetaTensor* out); +void UniformRandomInferMeta(const IntArray& shape, + DataType dtype, + float min, + float max, + int seed, + MetaTensor* out); + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 683cc2fdf867e..0b27c61623089 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -26,6 +26,7 @@ from paddle.fluid.op import Operator import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard def output_hist(out): @@ -52,6 +53,7 @@ def output_hist_diag(out): class TestUniformRandomOp_attr_tensorlist(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.new_shape = (1000, 784) shape_tensor = [] for index, ele in enumerate(self.new_shape): @@ -84,6 +86,7 @@ def init_attrs(self): class 
TestUniformRandomOp_attr_tensorlist_int32(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.new_shape = (1000, 784) shape_tensor = [] for index, ele in enumerate(self.new_shape): @@ -110,6 +113,7 @@ def verify_output(self, outs): class TestUniformRandomOp_attr_tensor(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {"ShapeTensor": np.array([1000, 784]).astype("int64")} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -131,6 +135,7 @@ def verify_output(self, outs): class TestUniformRandomOp_attr_tensor_int32(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {"ShapeTensor": np.array([1000, 784]).astype("int32")} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -152,6 +157,7 @@ def verify_output(self, outs): class TestUniformRandomOp(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -174,6 +180,18 @@ def verify_output(self, outs): np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + def test_check_api(self): + places = self._get_places() + for place in places: + with fluid.dygraph.base.guard(place=place): + out = self.python_api(self.attrs['shape'], 'float32', + self.attrs['min'], self.attrs['max'], + self.attrs['seed']) + + def test_check_api_eager(self): + with _test_eager_guard(): + self.test_check_api() + class TestUniformRandomOpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index d2e4363443720..82818d50510c9 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -548,7 +548,14 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + return _C_ops.final_state_uniform_random(shape, dtype, + float(min), + float(max), seed, + _current_expected_place()) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 3a76e89bbb727..892577a46f4d2 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -2035,6 +2035,18 @@ func : unfold backward : unfold_grad +- api : uniform_random + args : (IntArray shape, DataType dtype, float min, float max, int seed, Place place={}) + output : Tensor(out) + infer_meta : + func : UniformRandomInferMeta + param: [shape, dtype, min, max, seed] + kernel : + func : uniform_random + param: [shape, dtype, min, max, seed] + data_type : dtype + backend : place + # The `axis` argument of Python API paddle.unique is not vector - api : unique args : (Tensor x, bool return_index, bool return_inverse, bool return_counts, int[] axis, DataType dtype=DataType::INT64) From 437bebdab84426c427780c56898af85d698d64ac Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 11 Apr 2022 11:22:31 +0800 Subject: [PATCH 059/211] [Yaml] Add assign yaml (#41428) * add assign yaml * add assign api * add assign backward api * add assign * add assign yaml * add assign * assign 
yaml * add assign raw kernel and use assign_raw in yaml * merge develop branch * add missing python_api --- paddle/phi/kernels/assign_kernel.cc | 24 ++++++++++++++++--- paddle/phi/kernels/assign_kernel.h | 5 ++++ .../fluid/dygraph/varbase_patch_methods.py | 5 +++- python/paddle/fluid/layers/tensor.py | 15 +++++++----- .../fluid/tests/unittests/test_assign_op.py | 12 ++++++---- python/paddle/utils/code_gen/api.yaml | 10 ++++++++ python/paddle/utils/code_gen/backward.yaml | 10 ++++++++ 7 files changed, 67 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index a330227fcfafd..5eafc869fa551 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -22,16 +22,23 @@ namespace phi { +template +void AssignRawKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + Copy(dev_ctx, x, x.place(), false, out); +} + template void AssignKernel(const Context& dev_ctx, paddle::optional x, DenseTensor* out) { - if (x.get_ptr()) { - if (!x.is_initialized()) { + if (x) { + if (!x->IsInitialized()) { return; } auto& x_tensor = *x.get_ptr(); - Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); + AssignRawKernel(dev_ctx, x_tensor, out); } } @@ -104,6 +111,12 @@ void AssignValueKernel(const Context& dev_ctx, } // namespace phi +PD_REGISTER_GENERAL_KERNEL(assign_raw, + CPU, + ALL_LAYOUT, + phi::AssignRawKernel, + ALL_DTYPE) {} + PD_REGISTER_GENERAL_KERNEL( assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); @@ -123,6 +136,11 @@ PD_REGISTER_KERNEL(assign_value, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_raw, + GPU, + ALL_LAYOUT, + phi::AssignRawKernel, + ALL_DTYPE) {} PD_REGISTER_GENERAL_KERNEL( assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index f1f3f024205a1..437a2a0c189e8 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -21,6 +21,11 @@ namespace phi { +template +void AssignRawKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // this looks weird diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 72aee0ba87e58..9bf245ff388b4 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -23,7 +23,7 @@ from ..framework import convert_np_dtype_to_dtype_, _in_legacy_dygraph from .. import core from .. 
import unique_name -from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, EagerParamBase +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, EagerParamBase, in_dygraph_mode from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -798,6 +798,9 @@ def _set_grad_ivar(self, value): @framework.dygraph_only def clone(self): + if in_dygraph_mode(): + return _C_ops.final_state_assign(self) + if _in_legacy_dygraph(): output = core.VarBase() else: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 28e0d4eff377f..3a8dfdc858079 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -622,12 +622,15 @@ def assign(input, output=None): # after this api. if isinstance(input, (Variable, core.VarBase)): if _non_static_mode(): - if output is None: - if _in_legacy_dygraph(): - output = core.VarBase() - else: - output = core.eager.Tensor() - _C_ops.assign(input, output) + if in_dygraph_mode() and output is None: + output = _C_ops.final_state_assign(input) + else: + if output is None: + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + _C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 3dbd9311a71ed..bfe23c621270d 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -27,30 +27,32 @@ class TestAssignOp(op_test.OpTest): def setUp(self): + self.python_api = paddle.assign self.op_type = "assign" x = np.random.random(size=(100, 10)).astype('float64') self.inputs = {'X': x} self.outputs = {'Out': x} def test_forward(self): - self.check_output() + self.check_output(check_eager=True) def test_backward(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestAssignFP16Op(op_test.OpTest): def setUp(self): + self.python_api = paddle.assign self.op_type = "assign" x = np.random.random(size=(100, 10)).astype('float16') self.inputs = {'X': x} self.outputs = {'Out': x} def test_forward(self): - self.check_output() + self.check_output(check_eager=True) def test_backward(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestAssignOpWithLoDTensorArray(unittest.TestCase): @@ -171,6 +173,8 @@ def test_assign_BasicTypes(self): def test_clone(self): paddle.disable_static() + self.python_api = paddle.clone + x = paddle.ones([2]) x.stop_gradient = False clone_x = paddle.clone(x) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 892577a46f4d2..3d0a6ae7b0988 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -167,6 +167,16 @@ func : asinh backward : asinh_grad +# assign +- api : assign + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign_raw + backward : assign_grad + # atan - api : atan args : (Tensor x) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 602fecc83b8f7..6ef11ca2b3df9 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -89,6 +89,16 
@@ kernel : func : asinh_grad +- backward_api : assign_grad + forward : assign (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : assign_raw + - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) From 89bfa9644e781619ef828f786748da23c4efe60d Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 11 Apr 2022 11:23:53 +0800 Subject: [PATCH 060/211] Modify op-benchamrk script (#41470) * Modify op-benchamrk script * fix --- tools/ci_op_benchmark.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 0937ebe5343fc..8e84eccc083f2 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -135,6 +135,8 @@ function load_CHANGE_OP_MAP { for change_file in ${CHANGE_OP_FILES[@]} do change_file_name=${change_file#*paddle/fluid/operators/} + change_file_name=${change_file_name#*paddle/phi/kernels/gpu/} + change_file_name=${change_file_name#*paddle/phi/kernels/gpudnn/} if [ -n "${PADDLE_FILENAME_OP_MAP[$change_file_name]}" ] then for op_name in ${PADDLE_FILENAME_OP_MAP[$change_file_name]} From 36d76840acf06b6b7f95803001dce9952cc43b77 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 11 Apr 2022 11:26:52 +0800 Subject: [PATCH 061/211] [Phi]Add multi_dot/maxout/multiplex op yaml (#41550) * add multi_dot,maxout,multiplex yaml * add code converage --- paddle/phi/api/lib/api_custom_impl.cc | 130 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 10 +- paddle/phi/infermeta/backward.cc | 32 +++++ paddle/phi/infermeta/backward.h | 8 ++ .../phi/kernels/impl/multi_dot_kernel_impl.h | 2 +- paddle/phi/kernels/multi_dot_grad_kernel.h | 2 +- paddle/phi/ops/compat/multi_dot_sig.cc | 2 +- python/paddle/fluid/layers/nn.py | 5 +- .../fluid/tests/unittests/test_maxout_op.py | 10 +- .../tests/unittests/test_multi_dot_op.py | 38 ++--- .../tests/unittests/test_multiplex_op.py | 25 ++++ python/paddle/nn/functional/activation.py | 6 +- python/paddle/tensor/linalg.py | 4 +- python/paddle/utils/code_gen/api.yaml | 28 ++++ python/paddle/utils/code_gen/api_base.py | 2 +- python/paddle/utils/code_gen/backward.yaml | 22 +++ 16 files changed, 298 insertions(+), 28 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 5d1851fb85aa2..637c3b9107a7d 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -1014,5 +1014,135 @@ std::vector meshgrid_grad_impl( return api_output; } +std::vector multi_dot_grad_impl(const std::vector& x, + const Tensor& out_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "multi_dot_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << 
", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "multi_dot_grad", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "multi_dot_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_x_vec = PrepareData(x, kernel.InputAt(0), {}); + std::vector input_x(input_x_vec->size()); + for (size_t i = 0; i < input_x.size(); ++i) { + input_x[i] = &input_x_vec->at(i); + } + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); + + size_t out_number = input_x.size(); + std::vector api_output; + auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); + + auto x_meta_vec = MakeMetaTensor(input_x); + std::vector x_metas(x_meta_vec.size()); + for (size_t i = 0; i < x_meta_vec.size(); ++i) { + x_metas[i] = &x_meta_vec[i]; + } + + std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); + for (size_t i = 0; i < out_number; ++i) { + meta_outs.push_back(kernel_out[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::MultiDotGradInferMeta( + x_metas, MakeMetaTensor(*input_out_grad), meta_out_ptrs); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const std::vector&, + const phi::DenseTensor&, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, input_x, *input_out_grad, kernel_out); + + return api_output; +} + +std::vector multiplex_grad_impl(const std::vector& inputs, + const Tensor& ids, + const Tensor& out_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "multiplex_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "multiplex_grad", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "multiplex_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_ids = PrepareData(ids, kernel.InputAt(0), {}); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); + + auto out_number = inputs.size(); + std::vector api_output; + auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); + + std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); + for (size_t i = 0; i < out_number; ++i) { + meta_outs.push_back(kernel_out[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::MultiplexGradInferMeta(MakeMetaTensor(*input_ids), + MakeMetaTensor(*input_out_grad), + meta_out_ptrs); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + 
(*kernel_fn)(*dev_ctx, *input_ids, *input_out_grad, kernel_out); + + return api_output; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 80ace229316a9..0e360ce4a993f 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -62,6 +62,8 @@ std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); +std::vector meshgrid_impl(const std::vector& inputs); + std::tuple momentum_impl( const Tensor& param, const Tensor& grad, @@ -109,9 +111,15 @@ Tensor real_grad_impl(const Tensor& x); std::vector stack_grad_impl(const std::vector& x, const Tensor& out_grad, int axis); -std::vector meshgrid_impl(const std::vector& inputs); std::vector meshgrid_grad_impl(const std::vector& inputs, const std::vector& outputs_grad); +std::vector multi_dot_grad_impl(const std::vector& x, + const Tensor& out_grad); + +std::vector multiplex_grad_impl(const std::vector& inputs, + const Tensor& ids, + const Tensor& out_grad); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 49e416fd0152d..81d3cb9ddf0f4 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -308,6 +308,38 @@ void MeshgridGradInferMeta(const std::vector& inputs, } } +void MultiDotGradInferMeta(const std::vector& x, + const MetaTensor& out_grad, + std::vector x_grad) { + PADDLE_ENFORCE_EQ( + x.size(), + x_grad.size(), + errors::InvalidArgument( + "Number of Inputs(X) should be equal with Outputs(X@Grad)." + "But received Inputs(X)' size = %d , Outputs(X@Grad)' size = %d.", + x.size(), + x_grad.size())); + for (size_t i = 0; i < x.size(); i++) { + if (x_grad[i] != nullptr) { + x_grad[i]->set_dims(x[i]->dims()); + x_grad[i]->share_lod(*x[i]); + } + } +} + +void MultiplexGradInferMeta(const MetaTensor& ids, + const MetaTensor& out_grad, + std::vector ins_grad) { + PADDLE_ENFORCE_NE( + ins_grad.empty(), + true, + errors::InvalidArgument("Output(X@Grad) should not be null.")); + auto dout_dim = out_grad.dims(); + for (auto in_grad : ins_grad) { + in_grad->set_dims(dout_dim); + } +} + void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index eff3731bf2253..058ff7541cd8b 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -139,6 +139,14 @@ void MeshgridGradInferMeta(const std::vector& inputs, const std::vector& outputs_grad, std::vector inputs_grad); +void MultiDotGradInferMeta(const std::vector& x, + const MetaTensor& out_grad, + std::vector x_grad); + +void MultiplexGradInferMeta(const MetaTensor& ids, + const MetaTensor& out_grad, + std::vector ins_grad); + void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h index 0833e94fe2c18..039b056200fdd 100644 --- a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h +++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h @@ -339,8 +339,8 @@ void MultiDotGradMatChainOrder(const Context& ctx, template void MultiDotGradKernel(const Context& ctx, - const DenseTensor& out_grad, const std::vector& x, + const DenseTensor& out_grad, std::vector x_grad) { auto ins = x; auto dout = out_grad; diff --git 
a/paddle/phi/kernels/multi_dot_grad_kernel.h b/paddle/phi/kernels/multi_dot_grad_kernel.h index e6d8ecd744e12..f495c70452079 100644 --- a/paddle/phi/kernels/multi_dot_grad_kernel.h +++ b/paddle/phi/kernels/multi_dot_grad_kernel.h @@ -20,8 +20,8 @@ namespace phi { template void MultiDotGradKernel(const Context& ctx, - const DenseTensor& out_grad, const std::vector& x, + const DenseTensor& out_grad, std::vector x_grad); } // namespace phi diff --git a/paddle/phi/ops/compat/multi_dot_sig.cc b/paddle/phi/ops/compat/multi_dot_sig.cc index 598cbd980f3cc..2e05bd6d1557a 100644 --- a/paddle/phi/ops/compat/multi_dot_sig.cc +++ b/paddle/phi/ops/compat/multi_dot_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature MultiDotGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "multi_dot_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); + "multi_dot_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); } } // namespace phi diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 311a6278a89f8..68a58e8be49b8 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5970,8 +5970,11 @@ def multiplex(inputs, index, name=None): print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)] """ - if _non_static_mode(): + + if _in_legacy_dygraph(): return _C_ops.multiplex(index, inputs) + if in_dygraph_mode(): + return _C_ops.final_state_multiplex(inputs, index) helper = LayerHelper('multiplex', **locals()) check_type(inputs, 'inputs', (list), 'multiplex') diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index fac400caacdab..4bc7b09c71e6e 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core import paddle.nn.functional as F from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() np.random.seed(1) @@ -38,6 +39,7 @@ def maxout_forward_naive(x, groups, channel_axis): class TestMaxOutOp(OpTest): def setUp(self): self.op_type = "maxout" + self.python_api = paddle.nn.functional.maxout self.dtype = 'float64' self.shape = [3, 6, 2, 4] self.groups = 2 @@ -55,10 +57,10 @@ def set_attrs(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestMaxOutOpAxis0(TestMaxOutOp): @@ -144,6 +146,10 @@ def test_errors(self): x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8]) self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_api() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 8856624b4efc7..11c0436317076 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -18,6 +18,7 @@ from numpy.linalg import multi_dot from op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -27,6 +28,7 @@ class TestMultiDotOp(OpTest): def setUp(self): self.op_type = "multi_dot" + self.python_api = paddle.linalg.multi_dot self.dtype = self.get_dtype() 
self.get_inputs_and_outputs() @@ -40,11 +42,11 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) #(A*B)*C @@ -57,9 +59,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) #A*(B*C) @@ -72,9 +74,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) class TestMultiDotOp4Mat(TestMultiDotOp): @@ -90,10 +92,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) + self.check_grad(['x3'], 'Out', check_eager=True) class TestMultiDotOpFirst1D(TestMultiDotOp): @@ -143,9 +145,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) class TestMultiDotOp4MatLast1D(TestMultiDotOp4Mat): @@ -260,6 +262,10 @@ def test_dygraph_without_out(self): expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_without_out() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index a26eed12246e4..093ee86aeea6e 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -19,6 +19,7 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from paddle.fluid.framework import _test_eager_guard class TestMultiplexOp(OpTest): @@ -102,6 +103,30 @@ def test_multiplex_dygraph(self): res = paddle.multiplex(inputs, index) paddle.enable_static() + def test_dygraph_final_state_api(self): + with fluid.dygraph.guard(): + img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) + img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) + inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32)) + inputs[0].stop_gradient = False + inputs[1].stop_gradient = False + res = 
paddle.multiplex(inputs, index) + res.backward() + with _test_eager_guard(): + inputs_eager = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index_eager = paddle.to_tensor( + np.array([[1], [0]]).astype(np.int32)) + inputs_eager[0].stop_gradient = False + inputs_eager[1].stop_gradient = False + res_eager = paddle.multiplex(inputs_eager, index_eager) + res_eager.backward() + self.assertEqual((res.numpy() == res_eager.numpy()).all(), True) + self.assertEqual((inputs[0].grad.numpy() == + inputs_eager[0].grad.numpy()).all(), True) + self.assertEqual((inputs[1].grad.numpy() == + inputs_eager[1].grad.numpy()).all(), True) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 90283b632ef2b..a0efdaac8ff7c 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -684,10 +684,10 @@ def maxout(x, groups, axis=1, name=None): # [0.95313174 0.6228939 0.7129065 0.7087491 ] # [0.7142536 0.88725346 0.61093384 0.38833922]]]] """ - - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.maxout(x, 'groups', groups, 'axis', axis) - + if in_dygraph_mode(): + return _C_ops.final_state_maxout(x, groups, axis) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') if axis not in [1, -1, 3]: raise ValueError( diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 38616026f128a..509ae903f59e4 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2273,8 +2273,10 @@ def multi_dot(x, name=None): # [10, 7] """ - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.multi_dot(x) + if in_dygraph_mode(): + return _C_ops.final_state_multi_dot(x) check_type(x, 'x', (list, tuple), 'multi_dot') for id, item in enumerate(x): diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 3d0a6ae7b0988..6ca61de063b55 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1261,6 +1261,15 @@ func : maximum backward : maximum_grad +- api : maxout + args : (Tensor x, int groups, int axis) + output : Tensor(out) + infer_meta : + func : MaxOutInferMeta + kernel : + func : maxout + backward : maxout_grad + - api : mean args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) @@ -1337,6 +1346,15 @@ invoke : momentum_impl(param, grad, velocity, learning_rate, master_param, mu, use_nesterov, regularization_method, regularization_coeff, multi_precision, rescale_grad) optional : master_param +- api : multi_dot + args : (Tensor[] x) + output : Tensor + infer_meta : + func : MultiDotInferMeta + kernel : + func : multi_dot + backward : multi_dot_grad + # multinomial - api : multinomial args : (Tensor x, int num_samples, bool replacement) @@ -1346,6 +1364,16 @@ kernel : func : multinomial +- api : multiplex + args : (Tensor[] ins, Tensor ids) + output : Tensor + infer_meta : + func : MultiplexInferMeta + kernel : + func : multiplex + data_type : ins + backward : multiplex_grad + - api : multiply args : (Tensor x, Tensor y) output : Tensor diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 38aa3e0cb0b73..275adac8b4972 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -600,7 +600,7 @@ def get_kernel_args(self, code_indent): if self.inputs['input_info'][param] == "const Tensor&": kernel_args = kernel_args 
+ "*" + PREFIX_TENSOR_NAME + param + ", " elif self.inputs['input_info'][ - input_name] == "const std::vector&": + param] == "const std::vector&": kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " else: # do nothing diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 6ef11ca2b3df9..df5291b8ea358 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -902,6 +902,16 @@ kernel : func : maximum_grad +- backward_api : maxout_grad + forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param: [x] + kernel : + func : maxout_grad + - backward_api : mean_all_grad forward : mean_all(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -979,6 +989,18 @@ func : modulo_grad no_need_buffer : x, y +- backward_api : multi_dot_grad + forward : multi_dot (Tensor[] x) -> Tensor(out) + args : (Tensor[] x, Tensor out_grad) + output : Tensor[](x_grad) + invoke : multi_dot_grad_impl(x, out_grad) + +- backward_api : multiplex_grad + forward : multiplex (Tensor[] ins, Tensor ids) -> Tensor(out) + args : (Tensor[] ins, Tensor ids, Tensor out_grad) + output : Tensor[](ins_grad) + invoke : multiplex_grad_impl(ins, ids, out_grad) + - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) From 9287d5a1e1cec7a628c755080d26bf09e1dfddff Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Mon, 11 Apr 2022 14:16:21 +0800 Subject: [PATCH 062/211] Add no need buffer config (#41605) * add no need buffer * add no need buffer * remove determinant no need buffer --- python/paddle/utils/code_gen/backward.yaml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index df5291b8ea358..555ec600bf7e7 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -217,6 +217,7 @@ args : (Tensor[] x, Tensor out_grad, Scalar axis = 0) output : Tensor[](x_grad) invoke : concat_grad_impl(x, out_grad, axis) + no_need_buffer : x - backward_api : conj_grad forward : conj (Tensor x) -> Tensor(out) @@ -328,7 +329,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : determinant_grad + func : determinant_grad - backward_api : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) @@ -452,6 +453,7 @@ param : [x] kernel : func : expand_as_grad + no_need_buffer : x - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -475,6 +477,7 @@ data_type: out_grad backend: out_grad layout: out_grad + no_need_buffer : x - backward_api : flip_grad forward : flip (Tensor x, int[] axis) -> Tensor(out) @@ -536,6 +539,7 @@ kernel : data_type: x func : gather_grad + no_need_buffer : x - backward_api : gather_nd_grad forward : gather_nd (Tensor x, Tensor index) -> Tensor(out) @@ -546,6 +550,7 @@ param : [x] kernel : func : gather_nd_grad + no_need_buffer : x - backward_api : gelu_grad forward : gelu(Tensor x, bool approximate) -> Tensor(out) @@ -646,6 +651,7 @@ kernel : func : index_select_grad data_type : x + no_need_buffer : x - backward_api : kldiv_loss_grad forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out) @@ 
-656,6 +662,7 @@ param: [x] kernel : func : kldiv_loss_grad + no_need_buffer : x - backward_api : kron_grad forward : kron (Tensor x, Tensor y) -> Tensor(out) @@ -819,6 +826,7 @@ kernel : func : masked_select_grad data_type : x + no_need_buffer : x - backward_api : matmul_double_grad forward : matmul_grad (Tensor x, Tensor y, Tensor grad_out, bool transpose_x=false, bool transpose_y=false) -> Tensor(grad_x), Tensor(grad_y) @@ -931,6 +939,7 @@ param: [x] kernel : func : mean_grad + no_need_buffer : x - backward_api : meshgrid_grad forward : meshgrid (Tensor[] inputs) -> Tensor[](outputs) @@ -1224,6 +1233,7 @@ kernel : func : roll_grad data_type : x + no_need_buffer : x - backward_api : round_grad forward : round(Tensor x) -> Tensor(out) @@ -1376,6 +1386,7 @@ param : [input] kernel : func : slice_grad + no_need_buffer : input - backward_api : soft_shrink_grad forward : soft_shrink (Tensor x, float lambda) -> Tensor(out) @@ -1450,6 +1461,7 @@ param : [x] kernel : func : strided_slice_grad + no_need_buffer : x - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) @@ -1471,6 +1483,7 @@ param : [x] kernel : func : sum_grad + no_need_buffer : x - backward_api : swish_grad forward : swish (Tensor x, float beta=1.0) -> Tensor(out) @@ -1630,3 +1643,4 @@ param : [x, y] kernel : func : where_grad + no_need_buffer : x, y From 368f1dda7e3c9ae5a50ba70344c4577215e6a6cf Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Mon, 11 Apr 2022 15:35:09 +0800 Subject: [PATCH 063/211] fix arg_max for int type, *test=kunlun (#41522) --- paddle/fluid/operators/arg_max_op_xpu.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc index ba2ef81b5cdf1..e2acd84bd4e9d 100644 --- a/paddle/fluid/operators/arg_max_op_xpu.cc +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -28,12 +28,15 @@ class ArgMaxXPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto dtype = ctx.Attr("dtype"); PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 3), true, + (dtype < 0 || dtype == 2 || dtype == 3), true, platform::errors::InvalidArgument( - "The attribute of dtype in xpu argmin/argmax must be [%s], but " + "The attribute of dtype in xpu argmin/argmax must be [%s] or [%s], " + "but " "received [%s]", paddle::framework::DataTypeToString( framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), paddle::framework::DataTypeToString( static_cast(dtype)))); From 8fc9c41277f152cad1efdcbb28299c702fb73fbd Mon Sep 17 00:00:00 2001 From: jakpiase Date: Mon, 11 Apr 2022 09:41:13 +0200 Subject: [PATCH 064/211] fix for gaussian random (#41572) --- paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 503d3ec33762f..de999035fa5d8 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { @@ -42,7 +43,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { } tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(dnnl::memory::format_tag::oihw); + tensor->set_format(platform::GetPlainMKLDNNFormat(tensor->dims().size())); } }; } // namespace operators From 535810bada218348583aa13a6bd9a39cc6355218 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Mon, 11 Apr 2022 15:48:11 +0800 Subject: [PATCH 065/211] update lite compile cmake (#41512) --- cmake/external/lite.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 47bbfee57451c..f1d206dd5e199 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -86,7 +86,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_PREFIX_DIR} - PATCH_COMMAND mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/cross_compiling/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/cross_compiling/armlinux.cmake + PATCH_COMMAND mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake UPDATE_COMMAND "" BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" From fc621dfea375095fb8a3d39801224859b8cf6aa7 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 11 Apr 2022 16:04:13 +0800 Subject: [PATCH 066/211] support more ops (#41421) --- cmake/external/poplar.cmake | 15 +++++++++ cmake/inference_lib.cmake | 7 +++- .../fluid/platform/device/ipu/ipu_compiler.cc | 32 +++++++++++++++---- .../fluid/platform/device/ipu/ipu_strategy.cc | 1 + .../fluid/platform/device/ipu/ipu_strategy.h | 28 ++++++++-------- .../ipu/popart_canonicalization/math_ops.cc | 2 +- .../ipu/popart_canonicalization/nn_ops.cc | 27 ++++++++++++++++ .../ipu/popart_canonicalization/tensor_ops.cc | 4 +-- 8 files changed, 93 insertions(+), 23 deletions(-) diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake index 7a8fa3ef5d710..8b2de14e96620 100644 --- a/cmake/external/poplar.cmake +++ b/cmake/external/poplar.cmake @@ -12,6 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
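For orientation, the find_popart_version macro added just below scrapes the PopART version out of popart/version.hpp so that the build summary written later in this patch (the inference_lib.cmake hunk) can report it. As a rough illustration of what the regular expression matches, here is a small Python sketch; the shape of the version.hpp line and the sample version value are assumptions for illustration, not taken from this patch.

    import re

    # Assumed shape of the relevant version.hpp line; the version and hash are made up.
    sample_header_line = 'POPART_VERSION_STRING "2.5.1+abc123"'
    match = re.search(r'POPART_VERSION_STRING\s+"(\d+\.\d+\.\d+\+[A-Za-z0-9_]*)"',
                      sample_header_line)
    popart_version = match.group(1) if match else "Unknown version"
    print(popart_version)  # 2.5.1+abc123

The CMake macro ends up with the same "major.minor.patch+hash" string in POPART_VERSION, falling back to "Unknown version" when nothing matches.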
+macro(find_popart_version popart_version_file) + file(READ ${popart_version_file} popart_version_file_content) + string(REGEX MATCH "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" POPART_VERSION ${popart_version_file_content}) + string(REPLACE "POPART_VERSION_STRING" "" POPART_VERSION "${POPART_VERSION}") + string(REPLACE "\"" "" POPART_VERSION "${POPART_VERSION}") + string(REPLACE " " "" POPART_VERSION "${POPART_VERSION}") + if(NOT POPART_VERSION) + set(POPART_VERSION "Unknown version") + else() + message(STATUS "Current PopART version is ${POPART_VERSION}") + endif() +endmacro() + if(WITH_IPU) set(POPLAR_DIR CACHE PATH "Path to a Poplar install") set(POPART_DIR CACHE PATH "Path to a Popart install") @@ -64,6 +77,8 @@ if(WITH_IPU) message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") endif() + find_popart_version("${POPART_DIR}/include/popart/version.hpp") + add_definitions(-DONNX_NAMESPACE=onnx) add_custom_target(extern_poplar DEPENDS poplar popart-only) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index e3e6e1cced2aa..1b38f208716b3 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -398,7 +398,8 @@ function(version version_file) "WITH_GPU: ${WITH_GPU}\n" "WITH_ROCM: ${WITH_ROCM}\n" "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" - "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n") + "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n" + "WITH_IPU: ${WITH_IPU}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" @@ -414,6 +415,10 @@ function(version version_file) "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") endif() + if(WITH_IPU) + file(APPEND ${version_file} + "PopART version: ${POPART_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 1a3e600058b3b..7ae3b2303decd 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -474,6 +474,7 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto adam_mode = AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer); auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + auto scaled_optimizer_state_ = ipu_strategy_->scaled_optimizer_state; if (weight_decay_mode_.empty()) { weight_decay_mode_ = BOOST_GET_CONST( std::string, op_desc->GetAttr("weight_decay_mode")); @@ -492,7 +493,7 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto optimizer_instance = std::make_unique( optimizer_value, adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings); + clip_norm_settings, scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { optimizer_instance->insertSpecific( weight_decay_vars[i], @@ -511,11 +512,10 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings); + clip_norm_settings, scaled_optimizer_state_); } }; - if (adam_mode == popart::AdamMode::Lamb || - adam_mode == popart::AdamMode::LambNoBias) { + if (adam_mode == popart::AdamMode::Lamb) { const std::map> optimizer_value = 
{{"defaultLearningRate", {0.0, false}}, {"defaultBeta1", {beta1, false}}, @@ -526,7 +526,26 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto eval_optimizer = std::make_unique( optimizer_value, adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings); + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); + for (int i = 0; i < weight_decay_vars.size(); i++) { + eval_optimizer->insertSpecific(weight_decay_vars[i], + {{"weightDecay", {0.0, false}}}); + } + resources_->eval_optimizer = std::move(eval_optimizer); + } else if (adam_mode == popart::AdamMode::LambNoBias) { + const std::map> optimizer_value = + {{"defaultLearningRate", {0.0, false}}, + {"defaultBeta1", {1.0, false}}, + {"defaultBeta2", {1.0, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto eval_optimizer = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, popart::DataType::FLOAT, + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -542,7 +561,8 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings); + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); } } else if (type == "adaptive") { auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 6172d4d7dc680..f52499a8d8fda 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -67,6 +67,7 @@ IpuStrategy::IpuStrategy() { ADD_BOOL_OPTION(transfer_cast_op); ADD_BOOL_OPTION(use_no_bias_optimizer); ADD_BOOL_OPTION(enable_distribution); + ADD_BOOL_OPTION(scaled_optimizer_state); ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 786e2419cc0be..1802eb16e5895 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -37,13 +37,13 @@ class IpuStrategy { // training flag, true for training bool is_training = true; - // average sharding, debugging used + // Average sharding, debugging used bool need_avg_shard = false; - // flag for fp16, true for pure fp16 + // Flag for fp16, true for pure fp16 bool enable_fp16 = false; - // enable transfer cast Op target from fp32 to fp16 in fp16 mode + // Enable transfer cast Op target from fp32 to fp16 in fp16 mode bool transfer_cast_op = true; // The mode of Adam/Lamb optimizer @@ -51,33 +51,35 @@ class IpuStrategy { // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART bool use_no_bias_optimizer = false; - // enable distributed computing for POD128 or POD256 + // Enable distributed computing for POD128 or POD256 bool enable_distribution = false; + // Enable Scaled optimizer state only for Adam and Lamb + bool scaled_optimizer_state = false; + // Number ipus total needed, local_replica * ipu_per_replica int num_ipus = 
1; - // batches per step + // Batches per step int batches_per_step = 1; - // micro batch-size + // Micro batch-size int micro_batch_size = 1; - // random seed + // Random seed std::uint64_t random_seed = std::numeric_limits::max(); - // TODO(alleng) remove this param - // available memory proportion, 0.0f for disable + // Available memory proportion, 0.0f for disable float available_memory_proportion = 0.0f; - // loss scaling, currently we can't get loss scaling from + // Loss scaling, currently we can't get loss scaling from // optimizer_extract_pass, so we have to set it here float loss_scaling = 1.0f; - // defaultMaxWeightNorm for adam optimizer + // DefaultMaxWeightNorm for adam optimizer float max_weight_norm = 65504.0f; - // file path for dumping compiled model in onnx format + // File path for dumping compiled model in onnx format std::string onnx_dump_path; // Data type to use for tensor that stores first-order momentum optimizer @@ -106,7 +108,7 @@ class IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; - // custom ops + // Custom ops std::vector custom_ops; public: diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 9a907cf5e880f..444b55959cf22 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -157,7 +157,6 @@ Node *softmax_handler(Graph *graph, Node *node) { Node *scale_handler(Graph *graph, Node *node) { auto *op = node->Op(); - auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale")); auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias")); auto bias_after_scale_ = BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale")); @@ -191,6 +190,7 @@ Node *scale_handler(Graph *graph, Node *node) { } } } else { + auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale")); if (is_float_equal(bias_, 0.0) && is_float_equal(scale_, 1.0)) { return CreateBaseOp(graph, node, "popart_identity", {GetInputVarNode("X", node)}, node->outputs, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index a529a34e6d71a..a08fbaa26d9ed 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -95,6 +95,21 @@ Node *pool2d_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (op->HasAttr("adaptive")) { + auto adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive")); + if (adaptive) { + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + if (ksize[0] != 1 || ksize[1] != 1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support pool_size=1 with adaptive mode.")); + } + // adaptive maxpool op is max_pool2d_with_index. Only process avgpool + // here. 
+ return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs, + node->outputs); + } + } + if (global_pooling) { if (pooling_type == "max") { return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, @@ -159,6 +174,17 @@ Node *pool2d_handler(Graph *graph, Node *node) { } } +Node *max_pool2d_with_index_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + if (ksize[0] != 1 || ksize[1] != 1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support pool_size=1 with adaptive mode.")); + } + return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, + {GetOutputVarNode("Out", node)}); +} + Node *group_norm_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); @@ -304,6 +330,7 @@ Node *dropout_handler(Graph *graph, Node *node) { } // namespace paddle REGISTER_HANDLER(pool2d, pool2d_handler); +REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler); REGISTER_HANDLER(batch_norm, batch_norm_handler); REGISTER_HANDLER(group_norm, group_norm_handler); REGISTER_HANDLER(instance_norm, instance_norm_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 4c086bffb240e..55c25bce15931 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -331,7 +331,7 @@ Node *shape_handler(Graph *graph, Node *node) { Node *slice_handler(Graph *graph, Node *node) { auto *op = node->Op(); Node *starts = nullptr; - if (!op->Input("StartsTensor").empty()) { + if (!op->HasAttr("starts")) { starts = GetInputVarNode("StartsTensor", node); } else { auto starts_ = BOOST_GET_CONST(std::vector, op->GetAttr("starts")); @@ -341,7 +341,7 @@ Node *slice_handler(Graph *graph, Node *node) { starts = starts->outputs[0]; } Node *ends = nullptr; - if (!op->Input("EndsTensor").empty()) { + if (!op->HasAttr("ends")) { ends = GetInputVarNode("EndsTensor", node); } else { auto ends_ = BOOST_GET_CONST(std::vector, op->GetAttr("ends")); From c64d9a44127cfd0ef7b08d31a94466024997c0f3 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Mon, 11 Apr 2022 16:25:38 +0800 Subject: [PATCH 067/211] add backend for heter training (#41526) --- python/paddle/distributed/collective.py | 27 ++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index a5ea528d13450..fbad470cb3f13 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -138,7 +138,7 @@ def _get_global_env(): # Name of the default group for init_parallel_env _default_group_name = "_default_pg" -_valid_backend_list = ['nccl', 'gloo', 'hccl'] +_valid_backend_list = ['nccl', 'gloo', 'hccl', 'heter'] _default_store = None # the default tcp store _default_backend = None @@ -234,6 +234,31 @@ def _new_process_group_impl(backend, pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) elif backend == "hccl": pg = core.ProcessGroupHCCL(store, rank, world_size, group_id) + elif backend == "heter": + cluster_id = int(os.getenv("CLUSTER_ID", "-1")) + assert cluster_id >= 0, "please set the CLUSTER_ID variable." + cluster_size = os.getenv("CLUSTER_SIZE", None) + assert cluster_size, "please set the CLUSTER_SIZE variable." 
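A note on the environment variables read in this block: the heter backend is configured entirely through them. A hypothetical launch for two clusters of 8 ranks each might export values along the following lines before the process group is created; the sizes and the switch endpoint are made-up examples, not values taken from this patch.

    import os

    # Made-up example values for the heter backend environment variables.
    os.environ["CLUSTER_ID"] = "0"                  # index of the cluster this process belongs to
    os.environ["CLUSTER_SIZE"] = "8,8"              # ranks per cluster, comma separated
    os.environ["CLUSTER_SWITCH"] = "10.0.0.1:6000"  # endpoint of the switch service

CLUSTER_SIZE is split on "," and cumulatively summed in the lines that follow, so a local rank plus its cluster offset becomes a global rank across all clusters.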
+ cluster_size = cluster_size.split(",") + cluster_size = [int(s) for s in cluster_size] + switch_ep = os.getenv("CLUSTER_SWITCH", None) + assert switch_ep, "please set the CLUSTER_SWITCH variable." + cluster_size_cumsum = np.cumsum(cluster_size) + cluster_offset = 0 if cluster_id == 0 else cluster_size_cumsum[ + cluster_id - 1] + global_rank = cluster_offset + rank + global_world_size = cluster_size_cumsum[-1] + pg = core.ProcessGroupHeter( + store, + rank=global_rank, + world_size=global_world_size, + gid=0, + local_rank=rank, + local_size=world_size, + gloo_rank=cluster_id, + gloo_size=len(cluster_size), + with_switch=True, + switch_endpoint=switch_ep) return pg From b026840a298bf2046afbae39aefecd2aac3c96ba Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 11 Apr 2022 18:54:41 +0800 Subject: [PATCH 068/211] fix dynamic flag bug on mac (#41571) --- paddle/fluid/platform/flags.cc | 5 +++++ paddle/fluid/pybind/reader_py.cc | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 8209c0a5d6f8e..18b53563cd64e 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -534,6 +534,11 @@ PADDLE_DEFINE_EXPORTED_double( "you should set FLAGS_local_exe_sub_scope_limit=-1. " "The default value is 256 MBytes."); +PADDLE_DEFINE_EXPORTED_bool( + reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + /** * MKLDNN related FLAG * Name: use_mkldnn diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 8c456a2d980d3..e0aab0dd06ecb 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -32,10 +32,7 @@ #include "paddle/phi/core/ddim.h" #include "pybind11/stl.h" -PADDLE_DEFINE_EXPORTED_bool( - reader_queue_speed_test_mode, false, - "If set true, the queue.pop will only get data from queue but not " - "remove the data from queue for speed testing"); +DECLARE_bool(reader_queue_speed_test_mode); // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); From 5cb614174f45dd9eae6a621f607f2af8637861b8 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Mon, 11 Apr 2022 20:11:33 +0800 Subject: [PATCH 069/211] tensor fluid code transfer part3 (#40034) --- .../fluid/tests/unittests/test_logical_op.py | 24 +- python/paddle/tensor/array.py | 127 ++++++++- python/paddle/tensor/linalg.py | 96 ++++++- python/paddle/tensor/logic.py | 260 +++++++++++++++++- python/paddle/tensor/search.py | 18 +- 5 files changed, 489 insertions(+), 36 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py index 91d339940d114..e2c7cf3a5bb2b 100755 --- a/python/paddle/fluid/tests/unittests/test_logical_op.py +++ b/python/paddle/fluid/tests/unittests/test_logical_op.py @@ -18,8 +18,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -from paddle.static import Program, program_guard +from paddle.static import Program, program_guard, Executor +from paddle.framework import _non_static_mode from paddle.fluid.framework import _test_eager_guard SUPPORTED_DTYPES = [ @@ -109,13 +109,13 @@ def run_static(x_np, y_np, op_str, use_gpu=False, binary_op=True): paddle.enable_static() - startup_program = fluid.Program() - main_program = fluid.Program() + startup_program = 
Program() + main_program = Program() place = paddle.CPUPlace() - if use_gpu and fluid.core.is_compiled_with_cuda(): + if use_gpu and paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) - exe = fluid.Executor(place) - with fluid.program_guard(main_program, startup_program): + exe = Executor(place) + with program_guard(main_program, startup_program): x = paddle.static.data(name='x', shape=x_np.shape, dtype=x_np.dtype) op = getattr(paddle, op_str) feed_list = {'x': x_np} @@ -132,7 +132,7 @@ def run_static(x_np, y_np, op_str, use_gpu=False, binary_op=True): def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True): place = paddle.CPUPlace() - if use_gpu and fluid.core.is_compiled_with_cuda(): + if use_gpu and paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.disable_static(place) op = getattr(paddle, op_str) @@ -147,7 +147,7 @@ def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True): def run_eager(x_np, y_np, op_str, use_gpu=False, binary_op=True): place = paddle.CPUPlace() - if use_gpu and fluid.core.is_compiled_with_cuda(): + if use_gpu and paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.disable_static(place) with _test_eager_guard(): @@ -213,16 +213,16 @@ def check_type(op_str, x, y, binary_op): if binary_op: if type_str_map['x'] != type_str_map['y']: unit_test.assertRaises(error_type, op, x=x, y=y) - if not fluid._non_static_mode(): + if not _non_static_mode(): error_type = TypeError unit_test.assertRaises(error_type, op, x=x, y=y, out=1) else: - if not fluid._non_static_mode(): + if not _non_static_mode(): error_type = TypeError unit_test.assertRaises(error_type, op, x=x, out=1) place = paddle.CPUPlace() - if use_gpu and fluid.core.is_compiled_with_cuda(): + if use_gpu and paddle.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) for op_data in TEST_META_OP_DATA: meta_data = dict(op_data) diff --git a/python/paddle/tensor/array.py b/python/paddle/tensor/array.py index 49678443f1f1c..856b79c2a6894 100644 --- a/python/paddle/tensor/array.py +++ b/python/paddle/tensor/array.py @@ -14,7 +14,11 @@ # Define functions about array. 
-from ..fluid import layers +import paddle +from ..static import Variable +from ..framework import LayerHelper, core, _non_static_mode +from ..fluid.data_feeder import check_type +from ..fluid.data_feeder import check_variable_and_dtype __all__ = [] @@ -43,7 +47,24 @@ def array_length(array): arr_len = paddle.tensor.array_length(arr) print(arr_len) # 1 """ - return layers.array_length(array) + if _non_static_mode(): + assert isinstance( + array, + list), "The 'array' in array_write must be a list in dygraph mode" + return len(array) + + if not isinstance( + array, + Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + raise TypeError( + "array should be tensor array vairable in array_length Op") + + helper = LayerHelper('array_length', **locals()) + tmp = helper.create_variable_for_type_inference(dtype='int64') + tmp.stop_gradient = True + helper.append_op( + type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]}) + return tmp def array_read(array, i): @@ -85,7 +106,32 @@ def array_read(array, i): item = paddle.tensor.array_read(arr, i) print(item) # [[5., 5., 5.]] """ - return layers.array_read(array, i) + if _non_static_mode(): + assert isinstance( + array, + list), "The 'array' in array_read must be list in dygraph mode" + assert isinstance( + i, Variable + ), "The index 'i' in array_read must be Variable in dygraph mode" + assert i.shape == [ + 1 + ], "The shape of index 'i' should be [1] in dygraph mode" + i = i.numpy().item(0) + return array[i] + + check_variable_and_dtype(i, 'i', ['int64'], 'array_read') + helper = LayerHelper('array_read', **locals()) + if not isinstance( + array, + Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + raise TypeError("array should be tensor array vairable") + out = helper.create_variable_for_type_inference(dtype=array.dtype) + helper.append_op( + type='read_from_array', + inputs={'X': [array], + 'I': [i]}, + outputs={'Out': [out]}) + return out def array_write(x, i, array=None): @@ -119,7 +165,51 @@ def array_write(x, i, array=None): item = paddle.tensor.array_read(arr, i) print(item) # [[5., 5., 5.]] """ - return layers.array_write(x, i, array) + if _non_static_mode(): + assert isinstance( + x, Variable + ), "The input data 'x' in array_write must be Variable in dygraph mode" + assert isinstance( + i, Variable + ), "The index 'i' in array_write must be Variable in dygraph mode" + assert i.shape == [ + 1 + ], "The shape of index 'i' should be [1] in dygraph mode" + i = i.numpy().item(0) + if array is None: + array = create_array(x.dtype) + assert isinstance( + array, + list), "The 'array' in array_write must be a list in dygraph mode" + assert i <= len( + array + ), "The index 'i' should not be greater than the length of 'array' in dygraph mode" + if i < len(array): + array[i] = x + else: + array.append(x) + return array + + check_variable_and_dtype(i, 'i', ['int64'], 'array_write') + check_type(x, 'x', (Variable), 'array_write') + helper = LayerHelper('array_write', **locals()) + if array is not None: + if not isinstance( + array, + Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + raise TypeError( + "array should be tensor array vairable in array_write Op") + if array is None: + array = helper.create_variable( + name="{0}.out".format(helper.name), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + helper.append_op( + type='write_to_array', + inputs={'X': [x], + 'I': [i]}, + outputs={'Out': [array]}) + return array def create_array(dtype, initialized_list=None): @@ 
-151,4 +241,31 @@ def create_array(dtype, initialized_list=None): print(item) # [[5., 5., 5.]] """ - return layers.create_array(dtype, initialized_list) + array = [] + if initialized_list is not None: + if not isinstance(initialized_list, (list, tuple)): + raise TypeError( + "Require type(initialized_list) should be list/tuple, but received {}". + format(type(initialized_list))) + array = list(initialized_list) + + # NOTE: Only support plain list like [x, y,...], not support nested list in static mode. + for val in array: + if not isinstance(val, Variable): + raise TypeError( + "All values in `initialized_list` should be Variable, but recevied {}.". + format(type(val))) + + if _non_static_mode(): + return array + + helper = LayerHelper("array", **locals()) + tensor_array = helper.create_variable( + name="{0}.out".format(helper.name), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=dtype) + + for val in array: + array_write(x=val, i=array_length(tensor_array), array=tensor_array) + + return tensor_array diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 509ae903f59e4..2fcf9ff4213d4 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -14,11 +14,12 @@ import numpy as np from ..fluid.layer_helper import LayerHelper -from ..framework import _varbase_creator, _dygraph_tracer +from ..framework import _varbase_creator, _dygraph_tracer, in_dygraph_mode, _non_static_mode from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode -from ..fluid.layers import transpose, cast # noqa: F401 +from ..fluid.framework import _in_legacy_dygraph +from .manipulation import cast + from ..fluid import layers import paddle from paddle.common_ops_import import core @@ -31,6 +32,95 @@ K_DEFAULT_DIM = 9 +def transpose(x, perm, name=None): + """ + Permute the data dimensions of `input` according to `perm`. + + The `i`-th dimension of the returned tensor will correspond to the + perm[i]-th dimension of `input`. + + Args: + x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. + perm (list|tuple): Permute the input according to the data of perm. + name (str): The name of this layer. It is optional. + + Returns: + Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. + + For Example: + + .. code-block:: text + + x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] + [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] + shape(x) = [2,3,4] + + # Example 1 + perm0 = [1,0,2] + y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] + [[ 5 6 7 8] [17 18 19 20]] + [[ 9 10 11 12] [21 22 23 24]]] + shape(y_perm0) = [3,2,4] + + # Example 2 + perm1 = [2,1,0] + y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] + [[ 2 14] [ 6 18] [10 22]] + [[ 3 15] [ 7 19] [11 23]] + [[ 4 16] [ 8 20] [12 24]]] + shape(y_perm1) = [4,3,2] + + Examples: + + .. 
code-block:: python + + import paddle + + x = paddle.randn([2, 3, 4]) + x_transposed = paddle.transpose(x, perm=[1, 0, 2]) + print(x_transposed.shape) + # [3L, 2L, 4L] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_transpose(x, perm) + else: + if _in_legacy_dygraph(): + out, _ = _C_ops.transpose2(x, 'axis', perm) + return out + + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'transpose') + check_type(perm, 'perm', (list, tuple), 'transpose') + if isinstance(perm, tuple): + perm = list(perm) + if len(perm) != len(x.shape): + raise ValueError( + "Input(perm) is the permutation of dimensions of Input(x), " + "its length should be equal to dimensions of Input(x), " + "but received dimension of Input(x) is %s, " + "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + for idx, dim in enumerate(perm): + if dim >= len(x.shape): + raise ValueError( + "Each element in Input(perm) should be less than Input(x)'s dimension, " + "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " + "dimension %d." % (idx, perm[idx], len(x.shape))) + + helper = LayerHelper('transpose', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + x_shape = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], + 'XShape': [x_shape]}, + attrs={'axis': perm}) + return out + + def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ Applies matrix multiplication to two tensors. `matmul` follows diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 636b2ef17c6a0..6a18e1201785a 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -12,29 +12,267 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ..fluid.layer_helper import LayerHelper +import paddle from ..fluid.data_feeder import check_type, check_variable_and_dtype from .layer_function_generator import templatedoc from ..static import Variable -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode # TODO: define logic functions of a tensor -import paddle.fluid as fluid -if fluid.framework._in_eager_mode_: - Tensor = fluid.framework.core.eager.Tensor +from ..fluid.framework import _in_eager_mode_ +if _in_eager_mode_: + Tensor = paddle.fluid.framework.core.eager.Tensor else: from ..framework import VarBase as Tensor -from ..fluid.layers import is_empty # noqa: F401 -from ..fluid.layers import logical_and # noqa: F401 -from ..fluid.layers import logical_not # noqa: F401 -from ..fluid.layers import logical_or # noqa: F401 -from ..fluid.layers import logical_xor # noqa: F401 -import paddle + +from ..framework import in_dygraph_mode, _non_static_mode +from ..framework import LayerHelper +from ..fluid.framework import _in_legacy_dygraph +# TODO: define logic functions of a tensor from paddle import _C_ops from paddle.tensor.creation import full __all__ = [] +def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): + if _non_static_mode(): + op = getattr(_C_ops, op_name) + if binary_op: + return op(x, y) + else: + return op(x) + check_variable_and_dtype(x, "x", [ + "bool", "int8", "int16", "int32", "int64", "float32", "float64" + ], op_name) + if y is not None: + check_variable_and_dtype(y, "y", [ + "bool", "int8", "int16", "int32", "int64", "float32", "float64" + ], op_name) + if out is not None: + check_type(out, "out", Variable, op_name) + + helper = LayerHelper(op_name, **locals()) + + if binary_op and x.dtype != y.dtype: + raise ValueError( + "(InvalidArgument) The DataType of %s Op's Variable must be consistent, but received %s and %s." + % (op_name, x.dtype, y.dtype)) + + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if binary_op: + helper.append_op( + type=op_name, inputs={"X": x, + "Y": y}, outputs={"Out": out}) + else: + helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) + + return out + + +def logical_and(x, y, out=None, name=None): + r""" + + ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``out`` is N-dim boolean ``Tensor``. + Each element of ``out`` is calculated by + + .. math:: + + out = x \&\& y + + .. note:: + ``paddle.logical_and`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([True]) + y = paddle.to_tensor([True, False, True, False]) + res = paddle.logical_and(x, y) + print(res) # [True False True False] + """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_and(x, y) + + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True) + + +def logical_or(x, y, out=None, name=None): + """ + + ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``out`` is N-dim boolean ``Tensor``. + Each element of ``out`` is calculated by + + .. math:: + + out = x || y + + .. note:: + ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + out(Tensor): The ``Variable`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.logical_or(x, y) + print(res) # [[ True True] [ True False]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_or(x, y) + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True) + + +def logical_xor(x, y, out=None, name=None): + r""" + + ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``out`` is N-dim boolean ``Tensor``. + Each element of ``out`` is calculated by + + .. math:: + + out = (x || y) \&\& !(x \&\& y) + + .. note:: + ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. + out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.logical_xor(x, y) + print(res) # [[False, True], [ True, False]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_xor(x, y) + + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_not(x, out=None, name=None): + """ + + ``logical_not`` operator computes element-wise logical NOT on ``x``, and returns ``out``. ``out`` is N-dim boolean ``Variable``. + Each element of ``out`` is calculated by + + .. math:: + + out = !x + + Args: + x(Tensor): Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, in32, in64, float32, or float64. + out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor` will be created to save the output. + name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: ${out_comment} + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([True, False, True, False]) + res = paddle.logical_not(x) + print(res) # [False True False True] + """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_not(x) + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False) + + +def is_empty(x, name=None): + """ + + Test whether a Tensor is empty. + + Args: + x (Tensor): The Tensor to be tested. + name (str, optional): The default value is ``None`` . Normally users + don't have to set this parameter. For more information, + please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: A bool scalar Tensor. True if 'x' is an empty Tensor. + + Examples: + .. code-block:: python + + import paddle + + input = paddle.rand(shape=[4, 32, 32], dtype='float32') + res = paddle.is_empty(x=input) + print("res:", res) + # ('res:', Tensor: eager_tmp_1 + # - place: CPUPlace + # - shape: [1] + # - layout: NCHW + # - dtype: bool + # - data: [0]) + + """ + if in_dygraph_mode(): + return _C_ops.final_state_is_empty(x) + if _in_legacy_dygraph(): + return _C_ops.is_empty(x) + + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'is_empty') + check_type(name, "name", (str, type(None)), "is_empty") + + helper = LayerHelper("is_empty", **locals()) + cond = helper.create_variable_for_type_inference(dtype='bool') + cond.stop_gradient = True + helper.append_op( + type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]}) + return cond + + def equal_all(x, y, name=None): """ This OP returns the truth value of :math:`x == y`. True if two inputs have the same elements, False otherwise. 
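A consolidated usage check for the logical APIs relocated in the hunk above; the per-op docstrings each show a single operator, this simply exercises all of them together with arbitrary example values, and is_empty, moved in the same hunk, is included for completeness. The four logical_* ops run through the shared _logical_op helper, so they share the same documented dtype support, and the static-graph path additionally checks that x and y have the same dtype.

    import paddle

    x = paddle.to_tensor([True, True, False, False])
    y = paddle.to_tensor([True, False, True, False])

    print(paddle.logical_and(x, y))  # [True , False, False, False]
    print(paddle.logical_or(x, y))   # [True , True , True , False]
    print(paddle.logical_xor(x, y))  # [False, True , True , False]
    print(paddle.logical_not(x))     # [False, False, True , True ]
    print(paddle.is_empty(paddle.rand([2, 3])))  # False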
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 5c290aa0eb760..b2fb9d6c37ff2 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -17,8 +17,8 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import layers -from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ..framework import core, in_dygraph_mode, _non_static_mode +from ..fluid.framework import _in_legacy_dygraph from paddle.common_ops_import import convert_np_dtype_to_dtype_ from paddle.common_ops_import import Variable from paddle.common_ops_import import VarDesc @@ -401,7 +401,15 @@ def nonzero(x, as_tuple=False): if paddle.in_dynamic_mode(): outs = _C_ops.where_index(x) else: - outs = layers.where(x) + helper = LayerHelper("where_index", **locals()) + + outs = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64) + + helper.append_op( + type='where_index', + inputs={'Condition': x}, + outputs={'Out': [outs]}) if not as_tuple: return outs @@ -592,10 +600,10 @@ def where(condition, x=None, y=None, name=None): # [3]]),) """ if np.isscalar(x): - x = layers.fill_constant([1], np.array([x]).dtype.name, x) + x = paddle.full([1], x, np.array([x]).dtype.name) if np.isscalar(y): - y = layers.fill_constant([1], np.array([y]).dtype.name, y) + y = paddle.full([1], y, np.array([y]).dtype.name) if x is None and y is None: return nonzero(condition, as_tuple=True) From b45f80dde104fa9453bacc8022ab94adebfce3c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Mon, 11 Apr 2022 21:20:16 +0800 Subject: [PATCH 070/211] fix, test=document_fix (#41655) --- paddle/scripts/infrt_build.sh | 62 +++++++++++------------------------ 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index ef753200971b3..6634f5396ac74 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -20,9 +20,6 @@ set -e -# TARGET: CPU/GPU/TensorRt -TARGET=GPU - if [ -z ${BRANCH} ]; then BRANCH="develop" fi @@ -35,14 +32,8 @@ function update_pd_ops() { # compile and install paddle rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build - - INFER_WITH_GPU=OFF - if [ "${TARGET}" == "GPU" ] || [ "${TARGET}" == "gpu" ] || [ "${TARGET}" == "TensorRt" ] || [ "${TARGET}" == "tensorrt" ]; then - INFER_WITH_GPU=ON - fi - - cmake .. -DWITH_PYTHON=ON -DWITH_MKL=OFF -DWITH_GPU=$INFER_WITH_GPU -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j24 paddle_python print_pten_kernels kernel_signature_generator + cmake .. -DWITH_PYTHON=ON -DWITH_MKL=OFF -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF + make -j8 paddle_python print_pten_kernels kernel_signature_generator cd ${PADDLE_ROOT}/build ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json ./paddle/fluid/pybind/kernel_signature_generator > ../tools/infrt/kernel_signature.json @@ -90,7 +81,6 @@ function init() { } function infrt_gen_and_build() { - parallel_number=24 if [ "$1" != "" ]; then parallel_number=$1 fi @@ -103,13 +93,7 @@ function infrt_gen_and_build() { # step2. 
compile infrt cd ${PADDLE_ROOT}/build rm -f infrt_summary.txt - - INFER_WITH_GPU=OFF - if [ "${TARGET}" == "GPU" ] || [ "${TARGET}" == "gpu" ] || [ "${TARGET}" == "TensorRt" ]; then - INFER_WITH_GPU=ON - fi - - cmake .. -DWITH_MKL=OFF -DWITH_GPU=${INFER_WITH_GPU} -DWITH_TENSORRT=ON -DWITH_CRYPTO=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DINFRT_WITH_GPU=ON -DINFRT_WITH_TRT=ON -DWITH_PYTHON=OFF -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? + cmake .. -DWITH_MKL=OFF -DWITH_GPU=OFF -DWITH_CRYPTO=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DWITH_PYTHON=OFF -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? if [ "$build_error" != 0 ];then exit 7; fi @@ -172,34 +156,28 @@ function main() { echo " (2)bash infrt_build.sh build_only" echo " (3)bash infrt_build.sh test_only" echo " optional command: --update_pd_ops : pd_ops.td will be updated according to paddle's code." - echo " --target= : GPU/gpu/CPU/cpu/TensorRt/tensorrt, default value is GPU." exit 0 fi init - for i in "$@"; do - case $i in - --target=*) - TARGET="${i#*=}" - shift - ;; - build_and_test) - infrt_gen_and_build ${parallel_number} - test_infrt - ;; - build_only) - infrt_gen_and_build ${parallel_number} - ;; - test_only) - test_infrt - ;; - *) - print_usage - exit 1 - ;; - esac - done + case $CMD in + build_and_test) + infrt_gen_and_build ${parallel_number} + test_infrt + ;; + build_only) + infrt_gen_and_build ${parallel_number} + ;; + test_only) + test_infrt + ;; + *) + print_usage + exit 1 + ;; + esac + set +x if [[ -f ${PADDLE_ROOT}/build/infrt_summary.txt ]];then echo "=====================build summary======================" From d6e159144565b8c5b9aac1c12216614956a1fe31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Tue, 12 Apr 2022 00:55:39 +0800 Subject: [PATCH 071/211] [Infrt] fix ci bug. 
test=document_fix (#41663) --- paddle/infrt/api/infrt_api.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 8b4b14a3ca08b..ec6293f9a2bdb 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -257,11 +257,13 @@ int InfRtPredictor::Init(const InfRtConfig& config) { ::mlir::OpPassManager& pass_manager = pm.nest<::mlir::FuncOp>(); if (config.tensorrt_enabled()) { pass_manager.addPass(::infrt::CreateInfrtWeightsUnfoldPass()); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) pass_manager.addPass(::infrt::trt::CreateTrtOpTellerPass()); pass_manager.addPass(::infrt::trt::CreateTrtGraphFusePass()); pass_manager.addPass(::infrt::trt::CreateTrtGraphSplitPass(1)); pass_manager.addPass(::infrt::trt::CreateTrtOpConverterPass()); pass_manager.addPass(::infrt::trt::CreateTrtTypeConvertPass()); +#endif pass_manager.addPass(::mlir::createCanonicalizerPass()); } else { std::vector<::infrt::Place> valid_places = { From c055b50c3da8733d41282071ed2aa6ffd98176e4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 12 Apr 2022 09:00:33 +0800 Subject: [PATCH 072/211] fix data transform problem for cudnn backend (#41622) --- paddle/phi/api/lib/data_transform.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 90d47977cdf60..82d2e741e9de8 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -40,7 +40,8 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, bool ret = input.GetType() == AllocationType::GPUPINNED || (transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - phi::TransToPhiBackend(input) != target); + phi::TransToPhiBackend(input) != + (target != Backend::GPUDNN ? 
target : Backend::GPU)); return ret; } From c3e1d2570d5ae9156d729ee84958d36b289aa4df Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Tue, 12 Apr 2022 10:02:31 +0800 Subject: [PATCH 073/211] Update Profiler (#41638) --- paddle/fluid/platform/profiler/CMakeLists.txt | 0 .../profiler/dump/test_serialization_logger.cc | 4 ++-- paddle/fluid/platform/profiler/event_node.h | 11 +++++++---- paddle/fluid/platform/profiler/event_python.cc | 2 +- paddle/fluid/platform/profiler/event_python.h | 4 ++-- paddle/fluid/platform/profiler/profiler_test.cc | 4 ++-- 6 files changed, 14 insertions(+), 11 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/CMakeLists.txt diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index dee1019da2b52..d294bfee58c2b 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -152,7 +152,7 @@ TEST(SerializationLoggerTest, dump_case1) { TEST(DeserializationReaderTest, restore_case0) { DeserializationReader reader("test_serialization_logger_case0.pb"); auto profiler_result = reader.Parse(); - auto& tree = profiler_result->GetNodeTrees(); + auto tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -179,7 +179,7 @@ TEST(DeserializationReaderTest, restore_case0) { TEST(DeserializationReaderTest, restore_case1) { DeserializationReader reader("test_serialization_logger_case1.pb"); auto profiler_result = reader.Parse(); - auto& tree = profiler_result->GetNodeTrees(); + auto tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 1u); diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index dd8dfd32df4f7..3e589b0be2e04 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -103,7 +103,7 @@ class CudaRuntimeTraceEventNode { device_node_ptrs_.push_back(node); } void LogMe(BaseLogger* logger) { logger->LogRuntimeTraceEventNode(*this); } - std::vector& GetDeviceTraceEventNodes() { + const std::vector& GetDeviceTraceEventNodes() const { return device_node_ptrs_; } @@ -139,8 +139,11 @@ class HostTraceEventNode { void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { runtime_node_ptrs_.push_back(node); } - std::vector& GetChildren() { return children_; } - std::vector& GetRuntimeTraceEventNodes() { + const std::vector& GetChildren() const { + return children_; + } + const std::vector& GetRuntimeTraceEventNodes() + const { return runtime_node_ptrs_; } void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } @@ -188,7 +191,7 @@ class NodeTrees { void HandleTrees(std::function, std::function, std::function); - std::map GetNodeTrees() { + const std::map& GetNodeTrees() const { return thread_event_trees_map_; } std::map> Traverse(bool bfs) const; diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 1a6f19d2f93af..5c42c8e8bf61e 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -81,7 +81,7 @@ HostPythonNode* 
ProfilerResult::CopyTree(HostTraceEventNode* root) { ProfilerResult::ProfilerResult(std::unique_ptr tree, const ExtraInfo& extra_info) - : tree_(std::move(tree)), extra_info_(extra_info) { + : tree_(tree.release()), extra_info_(extra_info) { if (tree_ != nullptr) { std::map nodetrees = tree_->GetNodeTrees(); for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) { diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 12ecb9fde32aa..172116dbb0edd 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -82,11 +82,11 @@ class ProfilerResult { void Save(const std::string& file_name, const std::string format = std::string("json")); - std::unique_ptr& GetNodeTrees() { return tree_; } + std::shared_ptr GetNodeTrees() { return tree_; } private: std::map thread_event_trees_map_; - std::unique_ptr tree_; + std::shared_ptr tree_; ExtraInfo extra_info_; HostPythonNode* CopyTree(HostTraceEventNode* root); }; diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 32310b9e86228..f2c867ffff217 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -46,7 +46,7 @@ TEST(ProfilerTest, TestHostTracer) { 3); } auto profiler_result = profiler->Stop(); - auto& nodetree = profiler_result->GetNodeTrees(); + auto nodetree = profiler_result->GetNodeTrees(); std::set host_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto evt : pair.second) { @@ -79,7 +79,7 @@ TEST(ProfilerTest, TestCudaTracer) { hipStreamSynchronize(stream); #endif auto profiler_result = profiler->Stop(); - auto& nodetree = profiler_result->GetNodeTrees(); + auto nodetree = profiler_result->GetNodeTrees(); std::vector runtime_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto host_node : pair.second) { From dead24dd4cb86e13b34c941bd8c87b968f134eee Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 12 Apr 2022 10:03:43 +0800 Subject: [PATCH 074/211] [Phi] Support setting size of vector for out in yaml (#41576) * support setting vector out size in yaml * support setting size of vector for out in yaml --- .../final_state_generator/codegen_utils.py | 2 +- paddle/phi/api/lib/api_custom_impl.cc | 440 +----------------- paddle/phi/api/lib/api_custom_impl.h | 51 +- paddle/phi/api/lib/api_gen_utils.cc | 10 + paddle/phi/api/lib/api_gen_utils.h | 3 + paddle/phi/common/int_array.h | 2 + python/paddle/utils/code_gen/api.yaml | 14 +- python/paddle/utils/code_gen/api_base.py | 72 +-- python/paddle/utils/code_gen/api_gen.py | 20 +- python/paddle/utils/code_gen/backward.yaml | 42 +- .../paddle/utils/code_gen/backward_api_gen.py | 23 +- 11 files changed, 150 insertions(+), 529 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 21b6b882a6f15..b2cdd947aaff9 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -226,7 +226,7 @@ def ParseYamlReturns(string): returns = [x.strip() for x in string.strip().split(",")] for i in range(len(returns)): - ret = returns[i] + ret = returns[i].split("{")[0].strip() ret_name = "" if "(" in ret and ")" in ret: diff --git a/paddle/phi/api/lib/api_custom_impl.cc 
b/paddle/phi/api/lib/api_custom_impl.cc index 637c3b9107a7d..0f1cbc3f1910e 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -297,10 +297,10 @@ std::vector split_impl(const Tensor& x, // Calculate the number of out tensors size_t out_number; - if (num_or_sections.GetData().size() == 1) { + if (num_or_sections.size() == 1) { out_number = num_or_sections.GetData()[0]; } else { - out_number = num_or_sections.GetData().size(); + out_number = num_or_sections.size(); } std::vector out; @@ -475,54 +475,6 @@ std::tuple momentum_impl( return api_output; } -std::vector unbind_impl(const Tensor& input, int axis) { - auto kernel_key_set = ParseKernelKeyByInputArgs(input); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - Backend kernel_backend = kernel_key.backend(); - DataLayout kernel_layout = kernel_key.layout(); - DataType kernel_data_type = kernel_key.dtype(); - - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "unbind", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "unbind API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "unbind API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto dense_input = PrepareData(input, kernel.InputAt(0), {}); - - // Calculate the number of out tensors - auto input_shape = input.dims(); - if (axis < 0) { - axis = input_shape.size() + axis; - } - auto out_num = input_shape[axis]; - - std::vector out; - auto dense_outs = SetKernelOutput(out_num, kernel_backend, &out); - std::vector meta_outs; - meta_outs.reserve(out_num); - std::vector meta_out_ptrs; - meta_out_ptrs.reserve(out_num); - for (int64_t i = 0; i < out_num; ++i) { - meta_outs.push_back(dense_outs[i]); - meta_out_ptrs.push_back(&meta_outs.back()); - } - - phi::UnbindInferMeta(MakeMetaTensor(*dense_input), axis, meta_out_ptrs); - - using kernel_signature = void (*)(const phi::DeviceContext&, - const phi::DenseTensor&, - int, - std::vector&); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, *dense_input, axis, dense_outs); - - return out; -} - ////////////////// Backward(grad) api impls ////////////////////// // TODO(chenweihang): the original sum grad op can support higher-level @@ -700,71 +652,6 @@ std::tuple batch_norm_impl( return api_output; } -std::vector concat_grad_impl(const std::vector& x, - const Tensor& out_grad, - const Scalar& axis) { - auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - Backend kernel_backend = kernel_key.backend(); - DataLayout kernel_layout = kernel_key.layout(); - DataType kernel_data_type = kernel_key.dtype(); - - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "concat_grad", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "concat_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "concat_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - // std::unique_ptr> - auto dense_x = PrepareData(x, kernel.InputAt(0), {}); - auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); - - // Calculate the number of out tensors - size_t out_number = x.size(); - std::vector x_grad; - auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad); - - std::vector meta_x; - meta_x.reserve(x.size()); - 
std::vector meta_x_ptrs; - meta_x_ptrs.reserve(x.size()); - for (const auto& t : *dense_x) { - meta_x.push_back(t); - meta_x_ptrs.push_back(&meta_x.back()); - } - - std::vector meta_x_grad; - meta_x_grad.reserve(x.size()); - std::vector meta_x_grad_ptrs; - meta_x_grad_ptrs.reserve(x.size()); - for (size_t i = 0; i < out_number; ++i) { - meta_x_grad.push_back(*dense_x_grad[i]); - meta_x_grad_ptrs.push_back(&meta_x_grad.back()); - } - - phi::UnchangedMultiInferMeta(meta_x_ptrs, meta_x_grad_ptrs); - - std::vector dense_x_ptr; - dense_x_ptr.reserve(x.size()); - for (const auto& t : *dense_x) { - dense_x_ptr.push_back(&t); - } - - using kernel_signature = void (*)(const platform::DeviceContext&, - const std::vector&, - const phi::DenseTensor&, - const phi::Scalar&, - std::vector); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)( - *dev_ctx, dense_x_ptr, *dense_out_grad, phi::Scalar(axis), dense_x_grad); - - return x_grad; -} - Tensor imag_grad_impl(const Tensor& out_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), @@ -821,328 +708,5 @@ Tensor real_grad_impl(const Tensor& out_grad) { return out; } -std::vector stack_grad_impl(const std::vector& x, - const Tensor& out_grad, - int axis) { - auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - Backend kernel_backend = kernel_key.backend(); - DataLayout kernel_layout = kernel_key.layout(); - DataType kernel_data_type = kernel_key.dtype(); - - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "stack_grad", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "stack_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "stack_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - - size_t out_number = x.size(); - std::vector x_grad; - auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad); - std::vector meta_x_grad; - meta_x_grad.reserve(out_number); - std::vector meta_x_grad_ptrs; - meta_x_grad_ptrs.reserve(out_number); - for (size_t i = 0; i < out_number; ++i) { - meta_x_grad.push_back(dense_x_grad[i]); - meta_x_grad_ptrs.push_back(&meta_x_grad.back()); - } - - phi::StackGradInferMeta( - MakeMetaTensor(*dense_out_grad), axis, meta_x_grad_ptrs); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - int axis, - std::vector); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, *dense_out_grad, axis, dense_x_grad); - - return x_grad; -} - -std::vector meshgrid_impl(const std::vector& inputs) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(inputs); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } - - const auto& kernel = 
phi::KernelFactory::Instance().SelectKernelOrThrowError( - "meshgrid", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "meshgrid API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "meshgrid API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto input_inputs_vec = PrepareData(inputs, kernel.InputAt(0), {}); - std::vector input_inputs(input_inputs_vec->size()); - for (size_t i = 0; i < input_inputs.size(); ++i) { - input_inputs[i] = &input_inputs_vec->at(i); - } - - auto x_meta_vec = MakeMetaTensor(input_inputs); - std::vector inputs_metas(x_meta_vec.size()); - for (size_t i = 0; i < x_meta_vec.size(); ++i) { - inputs_metas[i] = &x_meta_vec[i]; - } - - // Calculate the number of out tensors - size_t out_number = inputs.size(); - - std::vector out; - auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); - - std::vector meta_outs; - meta_outs.reserve(out_number); - std::vector meta_out_ptrs; - meta_out_ptrs.reserve(out_number); - for (size_t i = 0; i < out_number; ++i) { - meta_outs.push_back(dense_outs[i]); - meta_out_ptrs.push_back(&meta_outs.back()); - } - phi::MeshgridInferMeta(inputs_metas, meta_out_ptrs); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const std::vector&, - std::vector&); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, input_inputs, dense_outs); - - return out; -} - -std::vector meshgrid_grad_impl( - const std::vector& inputs, - const std::vector& outputs_grad) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(inputs, outputs_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } - - const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "meshgrid_grad", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "meshgrid_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "meshgrid_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto input_inputs_vec = PrepareData(inputs, kernel.InputAt(0), {}); - std::vector input_inputs(input_inputs_vec->size()); - for (size_t i = 0; i < input_inputs.size(); ++i) { - input_inputs[i] = &input_inputs_vec->at(i); - } - auto input_outputs_grad_vec = - PrepareData(outputs_grad, kernel.InputAt(1), {}); - std::vector input_outputs_grad( - input_outputs_grad_vec->size()); - for (size_t i = 0; i < input_outputs_grad.size(); ++i) { - input_outputs_grad[i] = &input_outputs_grad_vec->at(i); - } - - size_t out_number = inputs.size(); - std::vector api_output; - auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); - - auto inputs_meta_vec = MakeMetaTensor(input_inputs); - std::vector inputs_metas(inputs_meta_vec.size()); - for (size_t i = 0; i < inputs_meta_vec.size(); ++i) { - inputs_metas[i] = &inputs_meta_vec[i]; - } - - 
auto outputs_grad_meta_vec = MakeMetaTensor(input_outputs_grad); - std::vector outputs_grad_metas( - outputs_grad_meta_vec.size()); - for (size_t i = 0; i < outputs_grad_meta_vec.size(); ++i) { - outputs_grad_metas[i] = &outputs_grad_meta_vec[i]; - } - - std::vector meta_outs; - meta_outs.reserve(out_number); - std::vector meta_out_ptrs; - meta_out_ptrs.reserve(out_number); - for (size_t i = 0; i < out_number; ++i) { - meta_outs.push_back(kernel_out[i]); - meta_out_ptrs.push_back(&meta_outs.back()); - } - - phi::MeshgridGradInferMeta(inputs_metas, outputs_grad_metas, meta_out_ptrs); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const std::vector&, - const std::vector&, - std::vector&); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, input_inputs, input_outputs_grad, kernel_out); - - return api_output; -} - -std::vector multi_dot_grad_impl(const std::vector& x, - const Tensor& out_grad) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } - - VLOG(6) << "multi_dot_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "multi_dot_grad", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "multi_dot_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto input_x_vec = PrepareData(x, kernel.InputAt(0), {}); - std::vector input_x(input_x_vec->size()); - for (size_t i = 0; i < input_x.size(); ++i) { - input_x[i] = &input_x_vec->at(i); - } - auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); - - size_t out_number = input_x.size(); - std::vector api_output; - auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); - - auto x_meta_vec = MakeMetaTensor(input_x); - std::vector x_metas(x_meta_vec.size()); - for (size_t i = 0; i < x_meta_vec.size(); ++i) { - x_metas[i] = &x_meta_vec[i]; - } - - std::vector meta_outs; - meta_outs.reserve(out_number); - std::vector meta_out_ptrs; - meta_out_ptrs.reserve(out_number); - for (size_t i = 0; i < out_number; ++i) { - meta_outs.push_back(kernel_out[i]); - meta_out_ptrs.push_back(&meta_outs.back()); - } - - phi::MultiDotGradInferMeta( - x_metas, MakeMetaTensor(*input_out_grad), meta_out_ptrs); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const std::vector&, - const phi::DenseTensor&, - std::vector&); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, input_x, *input_out_grad, kernel_out); - - return api_output; -} - -std::vector multiplex_grad_impl(const std::vector& inputs, - const Tensor& ids, - const Tensor& out_grad) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == 
Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } - - VLOG(6) << "multiplex_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "multiplex_grad", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "multiplex_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto input_ids = PrepareData(ids, kernel.InputAt(0), {}); - auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); - - auto out_number = inputs.size(); - std::vector api_output; - auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); - - std::vector meta_outs; - meta_outs.reserve(out_number); - std::vector meta_out_ptrs; - meta_out_ptrs.reserve(out_number); - for (size_t i = 0; i < out_number; ++i) { - meta_outs.push_back(kernel_out[i]); - meta_out_ptrs.push_back(&meta_outs.back()); - } - - phi::MultiplexGradInferMeta(MakeMetaTensor(*input_ids), - MakeMetaTensor(*input_out_grad), - meta_out_ptrs); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - const phi::DenseTensor&, - std::vector&); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, *input_ids, *input_out_grad, kernel_out); - - return api_output; -} - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 0e360ce4a993f..0d1ba3e98e53e 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -30,6 +30,20 @@ namespace experimental { ////////////////// Forward api impls ////////////////////// +std::tuple batch_norm_impl( + const Tensor& x, + const Tensor& scale, + const Tensor& bias, + const Tensor& mean, + const Tensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu); + Tensor conv2d_impl(const Tensor& input, const Tensor& filter, const std::vector& strides, @@ -62,8 +76,6 @@ std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); -std::vector meshgrid_impl(const std::vector& inputs); - std::tuple momentum_impl( const Tensor& param, const Tensor& grad, @@ -77,49 +89,14 @@ std::tuple momentum_impl( bool multi_precision, float rescale_grad); -std::vector unbind_impl(const Tensor& input, int axis); - ////////////////// Backward(grad) api impls ////////////////////// std::vector add_n_grad_impl(const std::vector& x, const Tensor& out_grad); -std::tuple batch_norm_impl( - const Tensor& x, - const Tensor& scale, - const Tensor& bias, - const Tensor& mean, - const Tensor& variance, - float momentum, - float epsilon, - const std::string& data_layout, - bool is_test, - bool use_global_stats, - bool trainable_statistics, - bool fuse_with_relu); - -/************************ backward api impl ***************************/ - -std::vector 
concat_grad_impl(const std::vector& x, - const Tensor& out_grad, - const Scalar& axis); - Tensor imag_grad_impl(const Tensor& x); Tensor real_grad_impl(const Tensor& x); -std::vector stack_grad_impl(const std::vector& x, - const Tensor& out_grad, - int axis); -std::vector meshgrid_grad_impl(const std::vector& inputs, - const std::vector& outputs_grad); - -std::vector multi_dot_grad_impl(const std::vector& x, - const Tensor& out_grad); - -std::vector multiplex_grad_impl(const std::vector& inputs, - const Tensor& ids, - const Tensor& out_grad); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 732ecacde94d7..f9db152956923 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -76,6 +76,16 @@ std::vector MakeMetaTensor( return meta_tensors; } +std::vector MakeMetaTensor( + const std::vector& tensors) { + std::vector meta_tensors; + meta_tensors.reserve(tensors.size()); + for (auto* t : tensors) { + meta_tensors.emplace_back(*t); + } + return meta_tensors; +} + phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index d7ecef61c5be3..035dfc5204720 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -53,6 +53,9 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); std::vector MakeMetaTensor( const std::vector& tensors); +std::vector MakeMetaTensor( + const std::vector& tensors); + phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor); diff --git a/paddle/phi/common/int_array.h b/paddle/phi/common/int_array.h index 490d7dabd4007..f9d07249e0fc9 100644 --- a/paddle/phi/common/int_array.h +++ b/paddle/phi/common/int_array.h @@ -96,6 +96,8 @@ class IntArrayBase { template IntArrayBase(const IntArrayBase& other) : array_(other.GetData()) {} + size_t size() const { return array_.size(); } + const std::vector& GetData() const { return array_; } private: diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6ca61de063b55..96bb3aafa5085 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1290,8 +1290,11 @@ - api : meshgrid args : (Tensor[] inputs) - output : Tensor[] - invoke : meshgrid_impl(inputs) + output : Tensor[]{inputs.size()} + infer_meta : + func : MeshgridInferMeta + kernel : + func : meshgrid backward : meshgrid_grad - api : min @@ -2059,8 +2062,11 @@ - api : unbind args : (Tensor input, int axis) - output : Tensor[] - invoke : unbind_impl(input, axis) + output : Tensor[] {axis<0 ? input.dims()[input.dims().size()+axis]:input.dims()[axis]} + infer_meta : + func : UnbindInferMeta + kernel : + func : unbind backward : unbind_grad # unfold diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 275adac8b4972..4325807746e7c 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -31,6 +31,7 @@ def __init__(self, api_item_yaml): # outputs: # names : [], list of output names # types : [], list of output types + # out_size_expr : [], expression for getting size of vector # return_type : Tensor, vector, ..., the return type of api # args_str: # args_declare : "str" // str of function params with default value. 
Example: (..., bool flag=false) @@ -67,11 +68,12 @@ def parse_args(self, api_name, api_item_yaml): ] inputs, attrs, args_str = self.parse_input_and_attr( api_name, api_item_yaml['args'], optional_vars) - output_type_list, output_names, return_type = self.parse_output( + output_type_list, output_names, out_size_expr, return_type = self.parse_output( api_name, api_item_yaml['output']) return inputs, attrs, { 'names': output_names, 'types': output_type_list, + 'out_size_expr': out_size_expr, 'return_type': return_type }, args_str, optional_vars @@ -184,39 +186,36 @@ def parse_output_item(output_item): 'Tensor': 'Tensor', 'Tensor[]': 'std::vector' } - if re.search(r'\([a-zA-Z0-9_@]*\)', output_item): - result = re.search( - r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P[a-zA-Z0-9_@]+)\)", - output_item) - out_type = result.group('out_type') - assert out_type in output_type_map, \ - f"{api_name} : Output type error: the output type only support Tensor and Tensor[], \ - but now is {out_type}." - - return output_type_map[out_type], result.group('name') - - else: - if output_item.strip() in output_type_map: - return output_type_map[output_item.strip()], 'out' - else: - raise ValueError( - "{} : Output type error: the output type only support Tensor and Tensor[], \ - but now is {}.".format(api_name, output_item.strip())) + result = re.search( + r"(?P[a-zA-Z0-9_[\]]+)\s*(?P\([a-zA-Z0-9_@]+\))?\s*(?P\{[^\}]+\})?", + output_item) + assert result is not None, f"{api_name} : the output config parse error." + out_type = result.group('out_type') + assert out_type in output_type_map, \ + f"{api_name} : Output type error: the output type only support Tensor and Tensor[], \ + but now is {out_type}." + + out_name = 'out' if result.group('name') is None else result.group( + 'name')[1:-1] + out_size_expr = None if result.group( + 'expr') is None else result.group('expr')[1:-1] + return output_type_map[out_type], out_name, out_size_expr temp_list = output_config.split(',') if len(temp_list) == 1: - out_type, out_name = parse_output_item(temp_list[0]) - return [out_type], [out_name], self.get_return_type([out_type]) + out_type, out_name, size_expr = parse_output_item(temp_list[0]) + return [out_type], [out_name], size_expr, self.get_return_type( + [out_type]) else: out_type_list = [] out_name_list = [] for output_item in temp_list: - out_type, out_name = parse_output_item(output_item) + out_type, out_name, size_expr = parse_output_item(output_item) out_type_list.append(out_type) out_name_list.append(out_name) - return out_type_list, out_name_list, self.get_return_type( + return out_type_list, out_name_list, size_expr, self.get_return_type( out_type_list) def parse_infer_meta(self, infer_meta_config): @@ -462,9 +461,8 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: attr_names = self.attrs['names'] infer_meta = self.infer_meta - infer_meta_params = infer_meta[ - 'param'] + kernel_output_names if infer_meta[ - 'param'] is not None else input_names + attr_names + kernel_output_names + infer_meta_params = infer_meta['param'] if infer_meta[ + 'param'] is not None else input_names + attr_names # generate meta tensors meta_tensor_code = "" param_code = "" @@ -500,11 +498,6 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: raise ValueError( f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." 
) - elif param in kernel_output_names: - meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( - 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" - param_code = param_code + "&" + param.replace( - 'kernel_', PREFIX_META_TENSOR_NAME) + ", " elif param in attr_names: param_code = param_code + param + ", " elif isinstance(param, str): @@ -514,6 +507,23 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: else: param_code = param_code + str(param) + ", " + for i, out_name in enumerate(kernel_output_names): + if self.outputs['types'][i] == 'std::vector': + meta_tensor_code = meta_tensor_code + f""" +{code_indent} auto {out_name}_{PREFIX_META_TENSOR_NAME}vec = MakeMetaTensor({out_name}); +{code_indent} std::vector {out_name}_metas({out_name}_{PREFIX_META_TENSOR_NAME}vec.size()); +{code_indent} for (size_t i = 0; i < {out_name}_{PREFIX_META_TENSOR_NAME}vec.size(); ++i) {{ +{code_indent} {out_name}_metas[i] = &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i]; +{code_indent} }}""" + + param_code = param_code + out_name + '_metas, ' + else: + meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + out_name.replace( + 'kernel_', + PREFIX_META_TENSOR_NAME) + "(" + out_name + ");\n" + param_code = param_code + "&" + out_name.replace( + 'kernel_', PREFIX_META_TENSOR_NAME) + ", " + param_code = param_code[:-2] return f"""{meta_tensor_code} {code_indent} phi::{infer_meta['func']}({param_code}); diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 4087b55b51324..538958c2361bc 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -91,7 +91,16 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign}; +{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" + + if self.outputs['return_type'] == 'std::vector': + assert self.outputs['out_size_expr'] is not None, \ + f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + output_create = output_create + f""" +{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" + + else: + output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" if not inplace_flag and self.view_map is not None and self.outputs[ @@ -113,7 +122,14 @@ def gene_output(self, output_create = output_create + f""" {code_indent} std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" - output_create = output_create + f""" + if output_type_list[i] == 'std::vector': + assert self.outputs['out_size_expr'][i] is not None, \ + f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
+ output_create = output_create + f""" +{code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, &std::get<{i}>(api_output));""" + + else: + output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &std::get<{i}>(api_output));""" if not inplace_flag and self.view_map is not None and self.outputs[ diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 555ec600bf7e7..90815cfe9af93 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -44,7 +44,7 @@ - backward_api : add_n_grad forward : add_n (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) - output : Tensor[](x_grad) + output : Tensor[](x_grad){x.size()} invoke : add_n_grad_impl(x, out_grad) no_need_buffer : x @@ -215,8 +215,12 @@ - backward_api : concat_grad forward : concat (Tensor[] x, Scalar axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, Scalar axis = 0) - output : Tensor[](x_grad) - invoke : concat_grad_impl(x, out_grad, axis) + output : Tensor[](x_grad){x.size()} + infer_meta : + func : UnchangedMultiInferMeta + param : [x] + kernel : + func : concat_grad no_need_buffer : x - backward_api : conj_grad @@ -944,8 +948,11 @@ - backward_api : meshgrid_grad forward : meshgrid (Tensor[] inputs) -> Tensor[](outputs) args : (Tensor[] inputs, Tensor[] outputs_grad) - output : Tensor[](inputs_grad) - invoke : meshgrid_grad_impl(inputs, outputs_grad) + output : Tensor[](inputs_grad){inputs.size()} + infer_meta : + func : MeshgridGradInferMeta + kernel : + func : meshgrid_grad - backward_api : min_grad forward: min (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) @@ -1001,14 +1008,22 @@ - backward_api : multi_dot_grad forward : multi_dot (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) - output : Tensor[](x_grad) - invoke : multi_dot_grad_impl(x, out_grad) + output : Tensor[](x_grad) {x.size()} + infer_meta : + func : MultiDotGradInferMeta + kernel : + func : multi_dot_grad - backward_api : multiplex_grad forward : multiplex (Tensor[] ins, Tensor ids) -> Tensor(out) args : (Tensor[] ins, Tensor ids, Tensor out_grad) - output : Tensor[](ins_grad) - invoke : multiplex_grad_impl(ins, ids, out_grad) + output : Tensor[](ins_grad){ins.size()} + infer_meta : + func : MultiplexGradInferMeta + param : [ids, out_grad] + kernel : + func : multiplex_grad + param : [ids, out_grad] - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1448,8 +1463,13 @@ - backward_api : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int axis) - output : Tensor[](x_grad) - invoke : stack_grad_impl(x, out_grad, axis) + output : Tensor[](x_grad){x.size()} + infer_meta : + func : StackGradInferMeta + param: [out_grad, axis] + kernel : + func : stack_grad + param : [out_grad, axis] no_need_buffer : x - backward_api : strided_slice_grad diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 46aa3e7e23d51..a88339c607c55 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -35,7 +35,7 @@ def parse_forward_config(self, forward_config): r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) api = result.group('api') - _, outputs, _ = self.parse_output(self.api, result.group('outputs')) + _, outputs, 
_, _ = self.parse_output(self.api, result.group('outputs')) outputs = [item.split('@')[0] for item in outputs] fw_inputs, fw_attrs, _, = self.parse_input_and_attr( api, result.group('args')) @@ -110,7 +110,16 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign}; +{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" + + if output_type_list[0] == 'std::vector': + assert self.outputs['out_size_expr'] is not None, \ + f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + output_create = output_create + f""" +{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" + + else: + output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" elif len(output_type_list) > 1: @@ -121,7 +130,6 @@ def gene_output(self, kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') if out_type_item == 'Tensor': - get_out_code = f'&api_output[{i}][0]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" @@ -131,6 +139,9 @@ def gene_output(self, output_create = output_create + f""" {code_indent} api_output[{i}].emplace_back();""" + output_create = output_create + f""" +{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &api_output[{i}][0]);""" + else: get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ @@ -138,8 +149,10 @@ def gene_output(self, output_create = output_create + f""" {code_indent} api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" - output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, {get_out_code});""" + assert self.outputs['out_size_expr'][i] is not None, \ + f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
+ output_create = output_create + f""" +{code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, &api_output[{i}]);""" kernel_output = kernel_output[:-2] else: From 362c7c803f32c077993a738de04def0becea83a8 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 12 Apr 2022 10:09:50 +0800 Subject: [PATCH 075/211] [CustomOp]Add new method for custom double grad (#41538) * add new method for custom double grad * add tanh double grad unittest * change year * revert tensor init method --- paddle/fluid/framework/custom_operator.cc | 30 ++-- paddle/phi/api/ext/op_meta_info.h | 10 ++ .../fluid/tests/custom_op/CMakeLists.txt | 1 + .../fluid/tests/custom_op/custom_tanh_op.cc | 149 ++++++++++++++++++ .../custom_op/test_custom_tanh_double_grad.py | 89 +++++++++++ 5 files changed, 269 insertions(+), 10 deletions(-) create mode 100644 python/paddle/fluid/tests/custom_op/custom_tanh_op.cc create mode 100644 python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 6f74fbe5f8f8b..df4879735bb82 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -67,9 +67,17 @@ inline static bool IsDuplicableVar(const std::string& var_name) { return var_name.rfind(suffix) != std::string::npos; } -inline static std::string NoGrad(const std::string& var_name) { +inline static std::string NoGrad(const std::string& var_name, + bool is_double_grad = false) { std::string suffix = kGradVarSuffix; - return var_name.substr(0, var_name.size() - kGradVarSuffixSize); + std::string new_out_suffix = kDoubleGradNewOutSuffix; + std::string tmp_var_name(var_name); + if (is_double_grad && + (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) { + tmp_var_name = tmp_var_name.substr( + 0, tmp_var_name.size() - /*kDoubleGradNewOutSuffix length*/ 4); + } + return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize); } inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) { @@ -533,11 +541,12 @@ class CustomGradOpMaker : public SingleGradOpMaker { for (auto& out_name : outputs_) { VLOG(3) << "Custom Operator: GradOpDescMaker - output: " << out_name; if (detail::IsDuplicableVar(out_name)) { - grad_op->SetOutput(out_name, - this->InputGrad(detail::NoGrad(out_name), - /*drop_empty_grad=*/false)); + grad_op->SetOutput( + out_name, this->InputGrad(detail::NoGrad(out_name, is_double_grad_), + /*drop_empty_grad=*/false)); } else { - grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad( + out_name, is_double_grad_))); } } grad_op->SetAttrMap(this->Attrs()); @@ -600,7 +609,8 @@ class CustomGradOpMaker } for (auto& out_name : outputs_) { VLOG(3) << "Custom Operator: GradOpBaseMaker - output: " << out_name; - grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + grad_op->SetOutput( + out_name, this->InputGrad(detail::NoGrad(out_name, is_double_grad_))); } grad_op->SetAttrMap(this->Attrs()); } @@ -885,8 +895,8 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, // Grad InferShape if (grad_infer_shape_fn == nullptr) { - grad_info.infer_shape_ = [grad_op_inputs, - grad_op_outputs](InferShapeContext* ctx) { + grad_info.infer_shape_ = [grad_op_inputs, grad_op_outputs, + is_double_grad](InferShapeContext* ctx) { // 1. 
if forward input exists, gradient's shape is same with forward // input // default @@ -897,7 +907,7 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, // [Suitable for the situation that forward input is not used as // backward input] for (auto& out_name : grad_op_outputs) { - auto fwd_name = detail::NoGrad(out_name); + auto fwd_name = detail::NoGrad(out_name, is_double_grad); if (detail::IsDuplicableVar(fwd_name)) { // Duplicable forward var must as backward input ctx->ShareDim(fwd_name, out_name); diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 88660449b6821..f820d225eff8c 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -58,6 +58,7 @@ using Tensor = paddle::experimental::Tensor; constexpr char kGradTensorSuffix[] = "@GRAD"; constexpr char kTensorVectorSuffix[] = "@VECTOR"; +constexpr char kDoubleGradNewOutSuffix[] = "@NEW"; // Used for Construct Grad Tensor name inline std::string Grad(const std::string& t_name) { @@ -77,6 +78,15 @@ inline std::string Vec(const std::string& t_name) { return result; } +// Used for Construct double grad output name +inline std::string New(const std::string& t_name) { + std::string result; + result.reserve(t_name.size() + 4U); + result += t_name; + result += kDoubleGradNewOutSuffix; + return result; +} + PADDLE_API void AssignTensorImpl(const Tensor& src, Tensor* dst); ////////////////////// Kernel Context //////////////////////// diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index a97afde385092..c20630f1a093e 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -21,6 +21,7 @@ py_test(test_custom_concat SRCS test_custom_concat.py) py_test(test_custom_conj SRCS test_custom_conj.py) py_test(test_custom_linear SRCS test_custom_linear.py) py_test(test_custom_simple_slice SRCS test_custom_simple_slice.py) +py_test(test_custom_tanh_double_grad SRCS test_custom_tanh_double_grad.py) # other tests py_test(test_sysconfig SRCS test_sysconfig.py) diff --git a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc new file mode 100644 index 0000000000000..f96297d69bd5b --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
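// Note on the double-grad naming convention used in the op below: assuming the
// suffix values shown in op_meta_info.h above ("@GRAD" appended by
// paddle::Grad() and "@NEW" appended by the new paddle::New() helper), the
// composed tensor names work out roughly as:
//
//   paddle::Grad("X")                 -> "X@GRAD"
//   paddle::Grad(paddle::Grad("X"))   -> "X@GRAD@GRAD"
//   paddle::New(paddle::Grad("Out"))  -> "Out@GRAD@NEW"
//
// and detail::NoGrad(name, /*is_double_grad=*/true) in custom_operator.cc
// strips the "@NEW" and then the "@GRAD" suffix, so a double-grad output such
// as "Out@GRAD@NEW" is matched back to the forward tensor "Out" when its shape
// and dtype are inferred.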
+ +#include +#include +#include + +#include "paddle/extension.h" + +#define CHECK_CPU_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +template +void tanh_cpu_forward_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel) { + PD_CHECK(x_data != nullptr, "x_data is nullptr."); + PD_CHECK(out_data != nullptr, "out_data is nullptr."); + for (int64_t i = 0; i < x_numel; ++i) { + out_data[i] = std::tanh(x_data[i]); + } +} + +template +void tanh_cpu_backward_kernel(const data_t* grad_out_data, + const data_t* out_data, + data_t* grad_x_data, + int64_t out_numel) { + PD_CHECK(grad_out_data != nullptr, "grad_out_data is nullptr."); + PD_CHECK(out_data != nullptr, "out_data is nullptr."); + PD_CHECK(grad_x_data != nullptr, "grad_x_data is nullptr."); + for (int64_t i = 0; i < out_numel; ++i) { + grad_x_data[i] = + grad_out_data[i] * (static_cast(1) - out_data[i] * out_data[i]); + } +} + +template +void tanh_cpu_double_backward_kernel(const data_t* out_data, + const data_t* ddx_data, + const data_t* dout_data, + data_t* dout_new_data, + data_t* ddout_data, + int64_t ddout_numel) { + PD_CHECK(out_data != nullptr, "out_data is nullptr."); + PD_CHECK(ddx_data != nullptr, "ddx_data is nullptr."); + PD_CHECK(dout_data != nullptr, "dout_data is nullptr."); + PD_CHECK(dout_new_data != nullptr, "dout_new_data is nullptr."); + PD_CHECK(ddout_data != nullptr, "ddout_data is nullptr."); + for (int64_t i = 0; i < ddout_numel; ++i) { + dout_new_data[i] = static_cast(-1) * dout_data[i] * + static_cast(2) * out_data[i] * ddx_data[i]; + ddout_data[i] = + ddx_data[i] * (static_cast(1) - out_data[i] * out_data[i]); + } +} + +std::vector TanhForward(const paddle::Tensor& x) { + CHECK_CPU_INPUT(x); + auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + + PD_DISPATCH_FLOATING_TYPES( + x.dtype(), "tanh_cpu_forward", ([&] { + tanh_cpu_forward_kernel( + x.data(), out.mutable_data(x.place()), x.size()); + })); + + return {out}; +} + +std::vector TanhBackward(const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + CHECK_CPU_INPUT(out); + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + + PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_backward", ([&] { + tanh_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.mutable_data(out.place()), + out.size()); + })); + + return {grad_x}; +} + +std::vector TanhDoubleBackward(const paddle::Tensor& out, + const paddle::Tensor& ddx, + const paddle::Tensor& dout) { + CHECK_CPU_INPUT(out); + CHECK_CPU_INPUT(ddx); + CHECK_CPU_INPUT(dout); + auto dout_new = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + + PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_double_backward", ([&] { + tanh_cpu_double_backward_kernel( + out.data(), + ddx.data(), + dout.data(), + dout_new.mutable_data(out.place()), + ddout.mutable_data(out.place()), + ddout.size()); + })); + + return {dout_new, ddout}; +} + +std::vector> TanhBackwardInferShape( + const std::vector& out_shape, + const std::vector& dout_shape) { + return {out_shape}; +} + +std::vector> TanhDoubleBackwardInferShape( + const std::vector& out_shape, + const std::vector& ddx_shape, + const std::vector& dout_shape) { + return {dout_shape, dout_shape}; +} + +PD_BUILD_OP(custom_tanh) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(TanhForward)); + +PD_BUILD_GRAD_OP(custom_tanh) + .Inputs({"Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + 
.SetKernelFn(PD_KERNEL(TanhBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(TanhBackwardInferShape)); + +PD_BUILD_DOUBLE_GRAD_OP(custom_tanh) + .Inputs({"Out", paddle::Grad(paddle::Grad("X")), paddle::Grad("Out")}) + .Outputs({paddle::New(paddle::Grad("Out")), + paddle::Grad(paddle::Grad("Out"))}) + .SetKernelFn(PD_KERNEL(TanhDoubleBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(TanhDoubleBackwardInferShape)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py new file mode 100644 index 0000000000000..1127108c361ad --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +import paddle.static as static +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd +from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. 
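# Note on the formulas exercised by this test: for y = tanh(x) the first-order
# gradient is dx = dout * (1 - y**2). Differentiating that expression again,
# given ddx (the gradient flowing into dx), yields the two double-grad outputs
# computed by tanh_cpu_double_backward_kernel above:
#
#   ddout    = ddx * (1 - y**2)        # d(dx)/d(dout) * ddx
#   dout_new = -2.0 * y * dout * ddx   # d(dx)/d(y)    * ddx
#
# The test compares the custom op against paddle.tanh numerically, which
# exercises exactly these values.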
+file = '{}\\custom_tanh\\custom_tanh.pyd'.format(get_build_directory()) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +custom_ops = load( + name='custom_tanh_jit', + sources=['custom_tanh_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags + verbose=True) + + +def custom_tanh_double_grad_dynamic(func, device, dtype, np_x): + paddle.set_device(device) + + t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + + out = func(t) + out.stop_gradient = False + + dx = paddle.grad( + outputs=[out], inputs=[t], create_graph=True, retain_graph=True) + + dx[0].backward() + + assert out.grad is not None + assert dx[0].grad is not None + return dx[0].numpy(), dx[0].grad.numpy(), out.grad.numpy() + + +class TestCustomTanhDoubleGradJit(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu'] + + def test_func_double_grad_dynamic(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad, dout = custom_tanh_double_grad_dynamic( + custom_ops.custom_tanh, device, dtype, x) + pd_out, pd_dx_grad, pd_dout = custom_tanh_double_grad_dynamic( + paddle.tanh, device, dtype, x) + self.assertTrue( + np.allclose(out, pd_out), + "custom op out: {},\n paddle api out: {}".format(out, + pd_out)) + self.assertTrue( + np.allclose(dx_grad, pd_dx_grad), + "custom op dx grad: {},\n paddle api dx grad: {}".format( + dx_grad, pd_dx_grad)) + self.assertTrue( + np.allclose(dout, pd_dout), + "custom op out grad: {},\n paddle api out grad: {}".format( + dout, pd_dout)) + + +if __name__ == "__main__": + unittest.main() From bc01242b87897046ac540166b9182b1d0b2b1fcf Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Tue, 12 Apr 2022 10:12:45 +0800 Subject: [PATCH 076/211] add a inner loop for index_select_grad_init() in index_select op when dealing with large-shape data (#41563) * replace for with CUDA_KERNEL_LOOP for index_select_grad_init() in index_select op * use CUDA_KERNEL_LOOP_TYPE * fix code style * replace index_select_grad_init with SetConstant --- paddle/phi/kernels/funcs/gather.cu.h | 15 +++++---------- paddle/phi/kernels/funcs/scatter.cu.h | 16 +++++++--------- .../phi/kernels/gpu/index_select_grad_kernel.cu | 16 ++++------------ paddle/phi/kernels/gpu/index_select_kernel.cu | 2 +- 4 files changed, 17 insertions(+), 32 deletions(-) diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index 59c8c9f3b8f0e..617d249308cda 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memcpy.h" // TODO(paddle-dev): move gpu_primitives.h to phi +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" @@ -110,11 +111,8 @@ void GPUGather(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherCUDAKernel<<>>( p_src, p_index, p_output, index_size, slice_size); @@ -155,11 +153,8 @@ void GPUGatherNd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherNdCUDAKernel<<>>(p_input, g_input_dims, diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 254dd45edb596..87083af3bc6a2 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -155,9 +156,8 @@ void GPUScatterAssign(const phi::GPUContext& ctx, // set block and grid num int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? maxGridDimX : grid; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); // if not overwrite mode, init data if (!overwrite) { @@ -188,9 +188,8 @@ void GPUScatterGradForX(const phi::GPUContext& ctx, int64_t block = 512; int64_t n = slice_size * index_size; int64_t height = (n + block - 1) / block; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); ScatterInitCUDAKernel<<>>( p_index, p_output, index_size, slice_size); @@ -230,9 +229,8 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? 
maxGridDimX : grid; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); ScatterNdCUDAKernel<<>>( p_update, diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 75ae1bbcd0a08..84094f4c1ee5a 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_deterministic); @@ -35,7 +36,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; @@ -45,15 +46,6 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, } } -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - template void IndexSelectGradKernel(const Context& ctx, const DenseTensor& x, @@ -97,8 +89,8 @@ void IndexSelectGradKernel(const Context& ctx, dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); paddle::platform::LimitGridDim(ctx, &grid_dim); - index_select_grad_init<<>>(in_grad_data, - numel); + phi::funcs::SetConstant index_select_grad_init; + index_select_grad_init(ctx, x_grad, static_cast(0)); if (FLAGS_cudnn_deterministic) { VLOG(2) << "Run grad kernel of index_select with single thread."; diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 38a6582d790f8..0a6ac69cef098 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -32,7 +32,7 @@ __global__ void index_select_cuda_kernel(const T* input, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; From 51cae7f78a6ed5af750ea49f84852a064396a0f9 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Tue, 12 Apr 2022 10:13:56 +0800 Subject: [PATCH 077/211] fix_paddle_numel_check (#41607) * fix_paddle_numel_check * fix_paddle_numel_check --- paddle/fluid/platform/device/gpu/gpu_launch_config.h | 9 +++++---- paddle/phi/backends/gpu/gpu_launch_config.h | 12 ++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 4a550e61d42da..80d60ca95bf6f 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -99,10 +99,11 @@ struct GpuLaunchConfig { inline GpuLaunchConfig GetGpuLaunchConfig1D( const platform::CUDADeviceContext& context, int64_t numel, int vec_size = 1) { - PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument( - "element quantity should be greater than 0," - " but received value is: %d.", - numel)); + PADDLE_ENFORCE_GE(numel, 0, + platform::errors::InvalidArgument( + "element quantity should be greater than or equal 
0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); /* If thread number per block is 64/128/256/512, cuda performs better.*/ diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index ea54083e8179b..888b44632ea28 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -101,12 +101,12 @@ struct GpuLaunchConfig { inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { - PADDLE_ENFORCE_GT( - numel, - 0, - phi::errors::InvalidArgument("element quantity should be greater than 0," - " but received value is: %d.", - numel)); + PADDLE_ENFORCE_GE(numel, + 0, + phi::errors::InvalidArgument( + "element quantity should be greater than or equal 0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); /* If thread number per block is 64/128/256/512, cuda performs better.*/ From b68bb4282358728fedfe7657ee2e700963bcf383 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Tue, 12 Apr 2022 04:16:17 +0200 Subject: [PATCH 078/211] Add possibility to test native config in mkldnn tests (#41562) --- ...er_bfloat16_image_classification_tester.cc | 6 ++++-- ...alyzer_int8_image_classification_tester.cc | 6 ++++-- .../analyzer_int8_object_detection_tester.cc | 21 ++++++++++++------- ...lyzer_quant_image_classification_tester.cc | 4 +++- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc index f267f0f28d685..267fb17ee6baa 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/platform/cpu_info.h" +DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); + namespace paddle { namespace inference { namespace analysis { @@ -31,7 +33,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_num_threads); - cfg->EnableMKLDNN(); + if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } TEST(Analyzer_bfloat16_image_classification, bfloat16) { @@ -44,7 +46,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) { // read data from file and prepare batches with test data std::vector> input_slots_all; SetInputs(&input_slots_all); - if (FLAGS_enable_bf16 && + if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 && platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) { b_cfg.EnableMkldnnBfloat16(); } else { diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index b07163b518b52..d11b5f0c218f2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -17,6 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); + namespace paddle { namespace inference { namespace analysis { @@ -32,7 +34,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); - cfg->EnableMKLDNN(); + if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } TEST(Analyzer_int8_image_classification, quantization) { @@ -46,7 +48,7 @@ TEST(Analyzer_int8_image_classification, quantization) { std::vector> input_slots_all; SetInputs(&input_slots_all); - if (FLAGS_enable_int8) { + if (FLAGS_enable_mkldnn && FLAGS_enable_int8) { // prepare warmup batch from input data read earlier // warmup batch size can be different than batch size std::shared_ptr> warmup_data = diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc index 91a3233b9851f..57ab1b00908b1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); + // setting iterations to 0 means processing the whole dataset namespace paddle { namespace inference { @@ -28,7 +30,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->SwitchIrOptim(true); cfg->SwitchSpecifyInputNames(false); cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); - cfg->EnableMKLDNN(); + if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } std::vector ReadObjectsNum(std::ifstream &file, size_t offset, @@ -268,13 +270,16 @@ TEST(Analyzer_int8_mobilenet_ssd, quantization) { GetWarmupData(input_slots_all); // configure quantizer - q_cfg.EnableMkldnnQuantizer(); - q_cfg.mkldnn_quantizer_config(); - std::unordered_set quantize_operators( - {"conv2d", "depthwise_conv2d", "prior_box", "transpose2", "reshape2"}); - q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators); - q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); - q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); + if (FLAGS_enable_mkldnn) { + q_cfg.EnableMkldnnQuantizer(); + q_cfg.mkldnn_quantizer_config(); + std::unordered_set quantize_operators( + {"conv2d", "depthwise_conv2d", "prior_box", "transpose2", "reshape2"}); + q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators); + q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); + q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize( + FLAGS_warmup_batch_size); + } // 0 is avg_cost, 1 is top1_acc, 2 is top5_acc or mAP CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all, 2); diff --git a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc index a5a3e60d04b90..5e867fc87fea3 100644 --- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc @@ -17,6 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); + namespace paddle { namespace inference { namespace analysis { @@ -27,7 +29,7 @@ void SetConfig(AnalysisConfig *cfg, std::string model_path) { cfg->SwitchIrOptim(false); cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); - cfg->EnableMKLDNN(); + if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } template From fdeec8c37e6a4d53557eb9715e39b6ff04ced5bc Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 12 Apr 2022 10:30:19 +0800 Subject: [PATCH 079/211] [Phi]Fix beta1_pow/beta2_pow/skip_update data transform problem in adam/adamw (#41641) * [Phi]Fix beta1_pow/beta2_pow/skip_update data transform problem in adam/adamw * fix xpu unittest failed --- paddle/phi/kernels/gpu/adam_kernel.cu | 7 ++++++- paddle/phi/kernels/gpu/adamw_kernel.cu | 7 ++++++- paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu | 7 ++++++- paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu | 7 ++++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index d3317e258e538..33b6f3a5a1bee 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -272,4 +272,9 @@ PD_REGISTER_KERNEL(adam, phi::AdamDenseKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 8fef101383bb0..3555df11b5e1f 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -299,4 +299,9 @@ PD_REGISTER_KERNEL(adamw, phi::AdamwDenseKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 32c05765a9ab0..2cb086503283b 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -284,4 +284,9 @@ PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, phi::sr::AdamDenseParamSparseGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index 2e48b8235ed72..0fc223e081506 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -310,4 +310,9 @@ PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, phi::sr::AdamwDenseParamSparseGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow, skip_update data 
transform + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); +} From 59ec95999722b12cc5e2c33213e4ea9fac0c5469 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 12 Apr 2022 10:50:12 +0800 Subject: [PATCH 080/211] [CustomOp] Add context pool unittests (#41085) * add context pool unittests * fix timeout * polish details * change option pos * add dll decl for wndows * fix pre-commit error * move dll_decl and export DeviceContext * replace lost dll_decl.h --- paddle/phi/api/all.h | 2 +- paddle/phi/api/ext/op_meta_info.h | 2 +- paddle/phi/api/include/context_pool.h | 7 ++- paddle/phi/api/{ext => include}/dll_decl.h | 0 paddle/phi/api/include/tensor.h | 2 +- paddle/phi/api/lib/api_registry.h | 2 +- paddle/phi/backends/cpu/cpu_context.h | 2 +- paddle/phi/backends/gpu/gpu_context.h | 2 +- paddle/phi/common/place.h | 10 ++- paddle/phi/core/device_context.h | 3 +- .../fluid/tests/custom_op/CMakeLists.txt | 2 + .../tests/custom_op/context_pool_test_op.cc | 54 ++++++++++++++++ .../fluid/tests/custom_op/ps_usr_print_log | 0 .../tests/custom_op/test_context_pool.py | 62 +++++++++++++++++++ .../utils/cpp_extension/extension_utils.py | 3 + 15 files changed, 142 insertions(+), 11 deletions(-) rename paddle/phi/api/{ext => include}/dll_decl.h (100%) create mode 100644 python/paddle/fluid/tests/custom_op/context_pool_test_op.cc create mode 100644 python/paddle/fluid/tests/custom_op/ps_usr_print_log create mode 100644 python/paddle/fluid/tests/custom_op/test_context_pool.py diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 4e0a4729916b3..ac8607597a436 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -26,6 +26,7 @@ limitations under the License. */ // new phi apis #include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" @@ -38,7 +39,6 @@ limitations under the License. */ // original custom op headers #include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/place.h" diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index f820d225eff8c..a9475db800816 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -20,8 +20,8 @@ limitations under the License. */ #include #include -#include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/utils/any.h" diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index a2983d9c2aa65..b429252beb7fd 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/macros.h" #include "paddle/utils/flat_hash_map.h" @@ -55,8 +56,12 @@ struct DefaultDeviceContextType { * In order not to depend on the fluid's DeviceContextPool, * the DeviceContextPool here needs to be initialized in the fluid, and cannot * be initialized by itself. + * + * Note: DeviceContextPool is an experimental API and may be removed in the + * future. 
From 2.3, we recommend directly using the C++ API to combine new + * perators. */ -class DeviceContextPool { +class PADDLE_API DeviceContextPool { public: static DeviceContextPool& Instance(); diff --git a/paddle/phi/api/ext/dll_decl.h b/paddle/phi/api/include/dll_decl.h similarity index 100% rename from paddle/phi/api/ext/dll_decl.h rename to paddle/phi/api/include/dll_decl.h diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 3c5c1531c4a2d..ad3933e2b2b53 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,8 +29,8 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -#include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/place.h" +#include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 212a2f96452f6..ed1aaccb4e115 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/dll_decl.h" +#include "paddle/phi/api/include/dll_decl.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index aa14c2a8e3862..e482fdc9e042f 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -24,7 +24,7 @@ limitations under the License. */ namespace phi { -class CPUContext : public DeviceContext { +class PADDLE_API CPUContext : public DeviceContext { public: CPUContext(); CPUContext(CPUContext&&); diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index ffae1f1f1353e..8d44acaa4a083 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -74,7 +74,7 @@ class DnnWorkspaceHandle { std::unique_ptr mtx_; }; -class GPUContext : public DeviceContext { +class PADDLE_API GPUContext : public DeviceContext { public: GPUContext(); GPUContext(GPUContext&&); diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 4c6d47597bd2c..390684366db71 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -16,6 +16,8 @@ limitations under the License. */ #include +#include "paddle/phi/api/include/dll_decl.h" + namespace phi { enum class AllocationType : int8_t { @@ -33,11 +35,13 @@ enum class AllocationType : int8_t { const char* AllocationTypeStr(AllocationType type); -size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); -std::string GetGlobalDeviceType(size_t device_type_id_); +PADDLE_API size_t +GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); + +PADDLE_API std::string GetGlobalDeviceType(size_t device_type_id_); /// \brief The place is used to specify where the data is stored. -class Place { +class PADDLE_API Place { public: Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 106d5ff7ddf98..d7c2c777ca632 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include +#include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" @@ -30,7 +31,7 @@ class TensorBase; * All kernels must access the interfaces provided by the backend through * DeviceContext. */ -class DeviceContext { +class PADDLE_API DeviceContext { using DataType = paddle::experimental::DataType; public: diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index c20630f1a093e..c76b3da7428e3 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -3,11 +3,13 @@ if(WITH_GPU OR APPLE) py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py) py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py) py_test(test_custom_relu_model SRCS test_custom_relu_model.py) + py_test(test_context_pool SRCS test_context_pool.py) # Compiling shared library will cost some time, but running process is very fast. set_tests_properties(test_custom_relu_op_setup PROPERTIES TIMEOUT 250) set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180) set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180) + set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180) endif() py_test(test_custom_raw_op_kernel_op SRCS test_custom_raw_op_kernel_op.py) diff --git a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc new file mode 100644 index 0000000000000..6b0edcc7ab148 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/extension.h" +#include "paddle/phi/backends/all_context.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +std::vector ContextPoolTest(const paddle::Tensor& x) { + // 1. test cpu context + paddle::experimental::Place cpu_place( + paddle::experimental::AllocationType::CPU); + auto* cpu_ctx = + paddle::experimental::DeviceContextPool::Instance() + .Get(cpu_place); + PD_CHECK(cpu_ctx->GetPlace() == cpu_place); + // if want to use the eigen_device here, need to include eigen headers + auto* cpu_eigen_device = cpu_ctx->eigen_device(); + PD_CHECK(cpu_eigen_device != nullptr); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // 2. 
test gpu context + paddle::experimental::Place gpu_place( + paddle::experimental::AllocationType::GPU); + auto* gpu_ctx = + paddle::experimental::DeviceContextPool::Instance() + .Get(gpu_place); + PD_CHECK(gpu_ctx->GetPlace() == gpu_place); + // if want to use the eigen_device here, need to include eigen headers + auto* gpu_eigen_device = gpu_ctx->eigen_device(); + PD_CHECK(gpu_eigen_device != nullptr); +#endif + + return {x}; +} + +PD_BUILD_OP(context_pool_test) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ContextPoolTest)); diff --git a/python/paddle/fluid/tests/custom_op/ps_usr_print_log b/python/paddle/fluid/tests/custom_op/ps_usr_print_log new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python/paddle/fluid/tests/custom_op/test_context_pool.py b/python/paddle/fluid/tests/custom_op/test_context_pool.py new file mode 100644 index 0000000000000..d532b29688b39 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_context_pool.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +from paddle.utils.cpp_extension import load, get_build_directory +from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. +file = '{}\\context_pool_jit\\context_pool_jit.pyd'.format(get_build_directory( +)) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +# Compile and load custom op Just-In-Time. 
+custom_ops = load( + name='context_pool_jit', + sources=['context_pool_test_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cflags + extra_cuda_cflags=extra_nvcc_args, # test for cflags + verbose=True) + + +class TestContextPool(unittest.TestCase): + def setUp(self): + self.devices = ['cpu'] + if paddle.is_compiled_with_cuda(): + self.devices.append('gpu') + + def use_context_pool(self): + x = paddle.ones([2, 2], dtype='float32') + out = custom_ops.context_pool_test(x) + + self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + + def test_using_context_pool(self): + with _test_eager_guard(): + self.use_context_pool() + self.use_context_pool() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index f5291bf77b56b..41add6e764a8c 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -543,6 +543,9 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): runtime_library_dirs.extend(find_paddle_libraries(use_cuda)) kwargs['runtime_library_dirs'] = runtime_library_dirs + if compile_dir is None: + # Add this compile option to isolate fluid headers + add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_CUSTOM_KERNEL']) kwargs['extra_compile_args'] = extra_compile_args kwargs['language'] = 'c++' From c448032bc894ffae90a2c962e1f8a1c640a08c81 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 12 Apr 2022 10:51:10 +0800 Subject: [PATCH 081/211] [Eager]Fix clear_gradient bug in optimizer (#41658) --- python/paddle/optimizer/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 36b773ac285df..0af8b8bb894b9 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1112,8 +1112,7 @@ def clear_grad(self, set_to_zero=True): if _in_eager_without_dygraph_check(): for p in param_list: - clear_func = p._zero_grads if set_to_zero else p.clear_gradient - clear_func() + p.clear_gradient(set_to_zero) else: core.clear_gradients(param_list, set_to_zero) From 0b4c3c20710f8276dec18711001b3b600b91b456 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 12 Apr 2022 10:56:04 +0800 Subject: [PATCH 082/211] [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad (#41451) * [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad * Fixed elementwise issue * Addressed CI failures --- .../final_state_generator/codegen_utils.py | 2 +- .../final_state_generator/eager_gen.py | 41 ++++++------ paddle/fluid/eager/backward.cc | 21 ++++--- .../kernels/cpu/elementwise_grad_kernel.cc | 2 +- paddle/phi/kernels/elementwise_grad_kernel.h | 2 +- .../kernels/gpu/elementwise_grad_kernel.cu | 2 +- paddle/phi/ops/compat/elementwise_sig.cc | 2 +- python/paddle/fluid/dygraph/math_op_patch.py | 33 +++++++--- python/paddle/fluid/layers/nn.py | 5 +- .../unittests/test_imperative_double_grad.py | 60 ++++++++---------- python/paddle/utils/code_gen/backward.yaml | 62 ++++++++++++++++++- 11 files changed, 152 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index b2cdd947aaff9..0081dbb595df3 100644 --- 
a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -23,7 +23,7 @@ ######################## ops_to_fill_zero_for_empty_grads = set([ "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", - "sigmoid_triple_grad" + "sigmoid_triple_grad, add_double_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 3b4c8f962179e..be6dda270093b 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -205,6 +205,7 @@ class {} : public egr::GradNodeBase {{ #endif }} // Forward API Call + VLOG(3) << \"Final State Running: \" << \"{}\"; {} // Get Outputs {} @@ -505,15 +506,11 @@ def ForwardsValidationCheck(self): for i in range(len(forward_attrs_list)): orig_attr_type = orig_forward_attrs_list[i][1] - orig_attr_default = orig_forward_attrs_list[i][2] orig_attr_pos = orig_forward_attrs_list[i][3] forward_attr_type = forward_attrs_list[i][1] - forward_attr_default = forward_attrs_list[i][2] forward_attr_pos = forward_attrs_list[i][3] assert orig_attr_type == forward_attr_type, AssertMessage( orig_attr_type, forward_attr_type) - assert orig_attr_default == forward_attr_default, AssertMessage( - orig_attr_default, forward_attr_default) assert orig_attr_pos == forward_attr_pos, AssertMessage( orig_attr_pos, forward_attr_pos) @@ -753,6 +750,15 @@ def GenerateNodeCreationCodes(self): set_grad_out_meta_list = [] set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): + # Has corresponding grad output + has_corresponding_grad_output = False + for _, (_, corresponding_pos, + _) in backward_grad_outputs_map.items(): + if pos == corresponding_pos: + has_corresponding_grad_output = True + if not has_corresponding_grad_output: + continue + input_autograd_meta_name = GetAutoGradMetaName(name) is_optional = (name in self.optional_inputs) if is_optional: @@ -1063,9 +1069,10 @@ def GenerateForwardDefinition(self, is_inplaced): self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, dygraph_event_str, amp_logic_str, inputs_autograd_meta_str, - forward_call_str, get_outputs_str, outputs_autograd_meta_str, - compute_require_grad_args_str, check_inplace_str, - bump_inplace_version_str, node_creation_str, returns_str) + forward_function_name, forward_call_str, get_outputs_str, + outputs_autograd_meta_str, compute_require_grad_args_str, + check_inplace_str, bump_inplace_version_str, node_creation_str, + returns_str) self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" logging.info( @@ -1439,28 +1446,18 @@ def GenerateNodeDefinition(self, grad_node_creation_str): compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" # Construct grad_api returns - num_bwd_outputs = len(backward_grad_outputs_map.keys()) slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) returns_str = f"{indent}std::vector> returns({slot_num_bwd_outputs});\n" for name, (ttype, fwd_position, grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - # Infer 
Grad API Return Type - if num_bwd_outputs == 1: - # Single tensor output, return as is - if IsPlainTensorType(ttype): - returns_str += f"{indent}returns[0] = {{ {transformed_tensor_name} }};\n" - else: - assert IsVectorTensorType(ttype) - returns_str += f"{indent}returns[0] = {transformed_tensor_name};\n" + # Rearrange output order accordingly + if IsPlainTensorType(ttype): + returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n" else: - # Rearrange output order accordingly - if IsPlainTensorType(ttype): - returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n" - else: - assert IsVectorTensorType(ttype) - returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n" + assert IsVectorTensorType(ttype) + returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n" returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index be425cf91bdef..60c5e52767a00 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -485,6 +485,7 @@ std::unordered_map getInDegreeMap( } } } + return node_in_degree_map; } @@ -526,6 +527,7 @@ std::vector RunBackward( bool allow_unused = false, const std::vector& no_grad_vars = {}) { VLOG(6) << "Start Backward"; + // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass @@ -729,6 +731,16 @@ std::vector RunBackward( continue; } + auto* next_node = next_node_shared.get(); + if (!node_input_buffers_dict.count(next_node)) { + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); + } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -748,15 +760,6 @@ std::vector RunBackward( << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); - auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - const auto& input_meta = next_node->InputMeta(); - auto grad_tensor_holder = - std::make_unique(input_meta); - VLOG(6) << "Construct GradTensorHolder for grad node: " - << next_node->name(); - node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); - } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; node_input_buffers_dict[next_node]->add( diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 1548272f8622c..f452d9ffb7e89 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -63,9 +63,9 @@ void AddGradKernel(const Context& dev_ctx, template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 979bb61c2e3ca..0e730fbfbfa4d 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ 
b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -31,9 +31,9 @@ void AddGradKernel(const Context& dev_ctx, template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3750e4b2bd61e..fae7978d3d2ea 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -56,9 +56,9 @@ void AddGradKernel(const Context& dev_ctx, template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 5ab71c0cd0fde..0a58d86b05b06 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -115,7 +115,7 @@ KernelSignature ElementwiseAddGradOpArgumentMapping( KernelSignature ElementwiseAddDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); + "add_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"}); } KernelSignature ElementwiseAddTripleGradOpArgumentMapping( diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 8b80444fe9011..5b305325f3d2d 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -15,7 +15,7 @@ from __future__ import print_function from .. import core -from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator +from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode from ..layers.layer_function_generator import OpProtoHolder from . import no_grad from .. 
import framework @@ -62,6 +62,15 @@ _already_patch_varbase = False _already_patch_eager_tensor = False +# Dispatch to final state Python-C functions +_final_state_op_type_mapping = { + "elementwise_add": "final_state_add", + "elementwise_sub": "final_state_subtract", + "elementwise_div": "final_state_divide", + "elementwise_mul": "final_state_multiply", + "matmul_v2": "final_state_matmul", +} + def monkey_patch_math_varbase(): """ @@ -105,10 +114,15 @@ def astype(self, dtype): """ if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype) + + if _in_legacy_dygraph(): + return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype) + return _C_ops.final_state_cast(self, dtype) def _scalar_elementwise_op_(var, scale, bias): - return _C_ops.scale(var, 'scale', scale, 'bias', bias) + if _in_legacy_dygraph(): + return _C_ops.scale(var, 'scale', scale, 'bias', bias) + return _C_ops.final_state_scale(var, float(scale), bias, True) def _neg_(var): return _scalar_elementwise_op_(var, -1.0, 0.0) @@ -164,7 +178,10 @@ def _T_(var): perm = [] for i in range(len(var.shape)): perm.insert(0, i) - out, _ = _C_ops.transpose2(var, 'axis', perm) + if _in_legacy_dygraph(): + out, _ = _C_ops.transpose2(var, 'axis', perm) + else: + out = _C_ops.final_state_transpose(var, perm) return out def _scalar_add_(var, value): @@ -270,11 +287,13 @@ def __impl__(self, other_var): # 4. calculation axis = -1 - if framework._in_eager_mode_ and op_type == 'elementwise_add': - math_op = getattr(_C_ops, 'final_state_add') + if in_dygraph_mode( + ) and op_type in _final_state_op_type_mapping.keys(): + math_op = getattr(_C_ops, _final_state_op_type_mapping[op_type]) + return math_op(self, other_var) else: math_op = getattr(_C_ops, op_type) - return math_op(self, other_var, 'axis', axis) + return math_op(self, other_var, 'axis', axis) comment = OpProtoHolder.instance().get_op_proto(op_type).comment diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 68a58e8be49b8..a405bf829fd48 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9036,7 +9036,10 @@ def relu(x, name=None): # [[0. 0. ] # [1. 
2.6]] """ - if _non_static_mode(): + + if in_dygraph_mode(): + return _C_ops.final_state_relu(x) + if _in_legacy_dygraph(): return _C_ops.relu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index c9e41fe93ebe1..00b192b2a057b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -385,26 +385,23 @@ def func_example_with_gradient_accumulation_and_create_graph(self): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - if not _in_legacy_dygraph(): - pass - else: - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * + (x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + for i in range(5): + loss.backward(retain_graph=True) x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * ( + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - for i in range(5): - loss.backward(retain_graph=True) - x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( - x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - def test_example_with_gradient_accumulation_and_create_graph(self): with _test_eager_guard(): self.func_example_with_gradient_accumulation_and_create_graph() @@ -426,7 +423,10 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self): del y1, z, w dx_actual, = self.grad( - [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + [w_mean], [x], + retain_graph=True, + create_graph=True, + no_grad_vars=[y2]) self.assertFalse(y2.stop_gradient) self.assertFalse(dx_actual.stop_gradient) @@ -435,17 +435,14 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - if not _in_legacy_dygraph(): - pass - else: - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * ( - x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * + (x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) def test_example_with_gradient_accumulation_and_no_grad_vars(self): with _test_eager_guard(): @@ -476,15 +473,12 @@ def func_example_with_gradient_accumulation_and_not_create_graph(self): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - if not _in_legacy_dygraph(): - pass - else: - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + 
x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) def test_example_with_gradient_accumulation_and_not_create_graph(self): with _test_eager_guard(): diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 90815cfe9af93..e2c5a970af17f 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -30,6 +30,18 @@ kernel : func : acosh_grad +- backward_api : add_double_grad + forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : add_double_grad + optional : grad_x_grad, grad_y_grad + backward : add_triple_grad + - backward_api : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -40,6 +52,7 @@ kernel : func : add_grad no_need_buffer : x, y + backward : add_double_grad - backward_api : add_n_grad forward : add_n (Tensor[] x) -> Tensor(out) @@ -48,6 +61,16 @@ invoke : add_n_grad_impl(x, out_grad) no_need_buffer : x +- backward_api : add_triple_grad + forward : add_double_grad (Tensor y, Tensor grad_out, Tensor grad_grad_x, Tensor grad_grad_y, int axis = -1) -> Tensor(grad_grad_out) + args : (Tensor grad_grad_x, Tensor grad_grad_y, Tensor grad_grad_out_grad, int axis = -1) + output : Tensor(grad_grad_x_grad), Tensor(grad_grad_y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [grad_grad_x, grad_grad_y] + kernel : + func : add_triple_grad + - backward_api : addmm_grad forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) @@ -934,6 +957,12 @@ kernel : func : mean_all_grad +- backward_api : mean_double_grad + forward: mean_grad (Tensor x, Tensor grad_out, int64_t[] dims={}, bool keep_dim=false, bool reduce_all = false) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) + output : Tensor(grad_out_grad) + invoke : mean(grad_x_grad, dims, keep_dim) + - backward_api : mean_grad forward: mean (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) @@ -943,6 +972,7 @@ param: [x] kernel : func : mean_grad + backward : mean_double_grad no_need_buffer : x - backward_api : meshgrid_grad @@ -1025,6 +1055,17 @@ func : multiplex_grad param : [ids, out_grad] +- backward_api : multiply_double_grad + forward : multiply_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor x, Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, y, grad_out] + kernel : + func : multiply_double_grad + optional : grad_x_grad, grad_y_grad + - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> 
Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -1034,6 +1075,7 @@ param : [x, y] kernel : func : multiply_grad + backward : multiply_double_grad - backward_api : mv_grad forward : mv (Tensor x, Tensor vec) -> Tensor(out) @@ -1184,10 +1226,10 @@ - backward_api : relu_double_grad forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x_grad) - output : Tensor(out_grad), Tensor(grad_out_grad) + output : Tensor(grad_out_grad) infer_meta : - func : GeneralBinaryGradInferMeta - param : [out, out] + func : UnchangedInferMeta + param : [out] kernel : func : relu_double_grad @@ -1270,11 +1312,25 @@ kernel : func : rsqrt_grad +- backward_api : scale_double_grad + forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) + args : (Tensor grad_x_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true) + output : Tensor(grad_out_grad) + invoke : scale(grad_x_grad, scale, 0.0, bias_after_scale) + backward : scale_triple_grad + - backward_api : scale_grad forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true) output : Tensor(x_grad) invoke : scale(out_grad, scale, 0.0, bias_after_scale) + backward : scale_double_grad + +- backward_api : scale_triple_grad + forward : scale_double_grad (Tensor grad_grad_x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_grad_out) + args : (Tensor grad_grad_out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true) + output : Tensor(grad_grad_x_grad) + invoke : scale(grad_grad_out_grad, scale, 0.0, bias_after_scale) - backward_api : scatter_grad forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out) From cade0018ea985768baa307d738e7cc249a634dfc Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 12 Apr 2022 11:00:26 +0800 Subject: [PATCH 083/211] Adjusted CUDA Arches (#41628) --- cmake/cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e09429bc42957..75f4f19244494 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -11,7 +11,7 @@ elseif(NEW_RELEASE_ALL) add_definitions(-DNEW_RELEASE_ALL) set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") - set(paddle_known_gpu_archs11 "35 50 60 61 70 75 80") + set(paddle_known_gpu_archs11 "50 60 61 70 75 80") elseif(NEW_RELEASE_PYPI) message("Using New Release Strategy - Cubin Packge") add_definitions(-DNEW_RELEASE_PYPI) From 137dc3e3d5a60dbf1e77bb20aa0cfc51388e1cf0 Mon Sep 17 00:00:00 2001 From: wuyefeilin <30919197+wuyefeilin@users.noreply.github.com> Date: Tue, 12 Apr 2022 11:20:22 +0800 Subject: [PATCH 084/211] add fp16 kernel to clip_grad (#41661) --- paddle/phi/kernels/gpu/clip_grad_kernel.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/clip_grad_kernel.cu b/paddle/phi/kernels/gpu/clip_grad_kernel.cu index b76086be64887..4566e8468ec16 100644 --- a/paddle/phi/kernels/gpu/clip_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/clip_grad_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/clip_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/clip_grad_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(clip_grad, 
float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} From f403fb69cef0e1f6400404e8f79d9770ab776d94 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 12 Apr 2022 11:41:22 +0800 Subject: [PATCH 085/211] add trt supoort for slice op (#41467) * add trt supoort for slice op * fix:output dims bug * fix: test * fix:for c++ coverage * fix:c++ coverage * fix: fix test bug * fix: CI test --- .../inference/tensorrt/convert/slice_op.cc | 8 +++-- paddle/fluid/inference/tensorrt/op_teller.cc | 29 +++++++++++-------- .../tensorrt/plugin/slice_op_plugin.cu | 25 ++++++++++++++-- .../tensorrt/plugin/slice_op_plugin.h | 7 +++-- .../ir/inference/test_trt_convert_slice.py | 20 ++++++++----- 5 files changed, 62 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index fde80ab42c61b..dea9a1ec3d76d 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -44,6 +44,8 @@ class SliceOpConverter : public OpConverter { BOOST_GET_CONST(std::vector, op_desc.GetAttr("starts")); std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); + std::vector decrease_axises = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("decrease_axis")); auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { @@ -107,8 +109,10 @@ class SliceOpConverter : public OpConverter { } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SlicePluginDynamic* plugin = - new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); + int decrease_axis = + decrease_axises.size() == 0 ? -1 : decrease_axises[0]; + plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic( + starts, ends, axes, decrease_axis, with_fp16); layer = engine_->AddDynamicPlugin(&input, 1, plugin); } } else { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 85c5dc7107fec..6ccaf80c9f0dd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -930,10 +930,16 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = BOOST_GET_CONST(std::vector, desc.GetAttr("decrease_axis")); - if (decrease_axis.size() > 0) { - VLOG(3) << "Invalid slice decrease_axis. decrease_axis.size() > 0" - "is not supported in TensorRT"; - return false; + if (with_dynamic_shape) { + if (decrease_axis.size() > 1) { + return false; + } + } else { + if (decrease_axis.size() > 0) { + VLOG(3) << "Invalid slice decrease_axis. 
decrease_axis.size() > 0" + "is not supported in TensorRT"; + return false; + } } } @@ -1054,17 +1060,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } if (desc.Input("Ids").size() != desc.Input("Embs").size()) { - VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same "; return false; } } if (op_type == "fused_preln_embedding_eltwise_layernorm") { if (!with_dynamic_shape) { - VLOG(3) - << "fused_preln_embedding_eltwise_layernorm should run on dynamic " - "shape mode."; + VLOG(3) << "fused_preln_embedding_eltwise_layernorm should run on " + "dynamic " + "shape mode."; return false; } if (desc.Input("Ids").size() != desc.Input("Embs").size()) { @@ -1454,7 +1458,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, const auto y_shape = y_var_desc->GetShape(); if (y_shape.size() != 2) { VLOG(3) - << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = " + << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = + " << y_shape.size(); return false; } @@ -1598,8 +1603,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } #else if (dtype != framework::proto::VarType::FP32) { - VLOG(3) - << "reduce op input data type must be float32 using TensorRT < 7.0"; + VLOG(3) << "reduce op input data type must be float32 using TensorRT " + "< 7.0"; return false; } #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 2b6541c5515ce..4e6b82d2dc146 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -205,8 +205,9 @@ void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { #if IS_TRT_VERSION_GE(6000) SlicePluginDynamic::SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool with_fp16) - : starts_(starts), ends_(ends), axes_(axes) { + std::vector axes, int decrease_axis, + bool with_fp16) + : starts_(starts), ends_(ends), axes_(axes), decrease_axis_(decrease_axis) { with_fp16_ = with_fp16; cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); @@ -217,6 +218,7 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &starts_); DeserializeValue(&serialData, &serialLength, &ends_); DeserializeValue(&serialData, &serialLength, &axes_); + DeserializeValue(&serialData, &serialLength, &decrease_axis_); DeserializeValue(&serialData, &serialLength, &with_fp16_); cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); @@ -233,7 +235,8 @@ int SlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } size_t SlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + - SerializedSize(axes_) + SerializedSize(with_fp16_); + SerializedSize(axes_) + SerializedSize(decrease_axis_) + + SerializedSize(with_fp16_); return size; } @@ -242,6 +245,7 @@ void SlicePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); + SerializeValue(&buffer, decrease_axis_); SerializeValue(&buffer, with_fp16_); } @@ -265,6 +269,17 @@ nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( ret.d[axes_[i]] = expr_builder.constant(end - start); #endif } + if (decrease_axis_ != -1) { + nvinfer1::DimsExprs res; + res.nbDims = ret.nbDims - 1; + int j = 0; + for (size_t i = 
0; i < in_dims.nbDims; i++) { + if (decrease_axis_ == i) continue; + res.d[j++] = expr_builder.operation(nvinfer1::DimensionOperation::kMAX, + *expr_builder.constant(0), *ret.d[i]); + } + return res; + } return ret; } @@ -318,6 +333,10 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; auto out_dims = output_desc[0].dims; + if (decrease_axis_ != -1) { + out_dims = input_dims; + out_dims.d[decrease_axis_] = 1; + } auto num_dims = input_dims.nbDims; size_t out_num = ProductDim(out_dims); diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 29f8f7c0999c4..4c07f0be36864 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -88,10 +88,12 @@ REGISTER_TRT_PLUGIN_V2(SlicePluginCreator); class SlicePluginDynamic : public DynamicPluginTensorRT { public: explicit SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool with_fp16); + std::vector axes, int decrease_axis, + bool with_fp16); nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { - return new SlicePluginDynamic(starts_, ends_, axes_, with_fp16_); + return new SlicePluginDynamic(starts_, ends_, axes_, decrease_axis_, + with_fp16_); } SlicePluginDynamic(void const* serialData, size_t serialLength); @@ -140,6 +142,7 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { std::vector starts_; std::vector ends_; std::vector axes_; + int decrease_axis_; int* offset_temp_data_{nullptr}; cudaEvent_t copy_event_; cudaStream_t copy_stream_; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index 17a2c9cd74c07..86c52dad23af0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -55,11 +55,11 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([6, 6, 64, 64]).astype(np.float32) for axes in [[0, 1], [1, 3], [2, 3]]: - for starts in [[0, 1], [-4, -3]]: - for ends in [[2, 2], [-1, -2], [5, 5]]: + for starts in [[0, 1]]: + for ends in [[2, 2], [5, 5]]: for decrease_axis in [[], [1], [2], [-1], [-100]]: for infer_flags in [[-1]]: dics = [{ @@ -97,8 +97,8 @@ def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [8, 8, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [6, 6, 64, 64]} def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} @@ -107,7 +107,11 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): inputs = program_config.inputs - if len(attrs[0]["decrease_axis"]) != 0: + if dynamic_shape == True and len(attrs[0]["decrease_axis"]) == 0: + return 1, 2 + if dynamic_shape == True and len(attrs[0]["decrease_axis"]) != 1: + return 0, 3 + if dynamic_shape == False and 
len(attrs[0]["decrease_axis"]) != 0: return 0, 3 if dynamic_shape: for i in range(len(attrs[0]["starts"])): @@ -123,7 +127,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - + self.trt_param.max_batch_size = 9 # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 @@ -146,7 +150,7 @@ def test(self): # TODO(inference): fix. # trt6 and trt7.1 has bug. # trt7.2 deserialize has bug. - # self.run_test() + self.run_test() pass From 93b37f6521bfb767ee682202e9675a921cedc637 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 12 Apr 2022 11:47:46 +0800 Subject: [PATCH 086/211] not use standalone_executor on fleet (#41630) --- python/paddle/fluid/executor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2232c34e63bd0..86b0d6560c927 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -395,7 +395,13 @@ def _is_enable_standalone_executor(): """ flag = False - env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', '1') + from ..distributed.fleet import fleet + if fleet._role_maker is not None: + warnings.warn("do not use standalone executor in fleet by default") + env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', None) + else: + env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', '1') + if env_val in [1, '1', True, 'True', 'true']: flag = True From a688ae2edba1f2f1bbc62ab2564dc7e5d6871885 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Tue, 12 Apr 2022 12:11:52 +0800 Subject: [PATCH 087/211] update kunlun xdnn (#41657) --- cmake/external/xpu.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 2b84def46520f..90cb686700ef2 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,14 +36,14 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220408") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) -SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") From a058b4744314826f25d76fcecaf885cfa2d904f0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 12 Apr 2022 12:54:41 +0800 Subject: [PATCH 088/211] add dependency for send/recv to support pp parallel (#41652) --- .../new_executor/interpretercore_util.cc | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 59703332efe95..63fcf0cffaa84 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -709,8 +709,13 @@ std::map> build_op_downstream_map( // add 
dependences for random op, make sure that the random op is scheduled // sequentially const std::set random_op_set = { - "bernoulli", "poisson", "multinomial", "gaussian_random", - "uniform_random", "randint", "randperm", "exponential"}; + "bernoulli", "poisson", "multinomial", "gaussian_random", + "truncated_gaussian_random", "uniform_random", "randint", "randperm", + "exponential", + "sampling_id" + "dropout", + "class_center_sample", + }; int dependence_op_idx = -1; for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { @@ -723,13 +728,26 @@ std::map> build_op_downstream_map( } // add dependency for communication op - const std::string communication_op_prefix = "c_"; + auto is_comm_op = [](std::string op) -> bool { + const std::set special_comm_op_set = { + "send", "recv", "send_v2", "recv_v2", + }; + const std::string communication_op_prefix = "c_"; + if (op.find(communication_op_prefix) != std::string::npos || + special_comm_op_set.count(op)) { + return true; + } + return false; + }; + dependence_op_idx = -1; for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { - if (vec_instruction[op_idx].OpBase()->Type().find( - communication_op_prefix) != std::string::npos) { + if (is_comm_op(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { op2dependences[op_idx].insert(dependence_op_idx); + VLOG(4) << "Add depend from " + << vec_instruction[dependence_op_idx].OpBase()->Type() << " to " + << vec_instruction[op_idx].OpBase()->Type(); } dependence_op_idx = op_idx; } @@ -833,10 +851,8 @@ std::map> build_op_downstream_map( for (size_t j = first_read_fused_out_op + 1; j < vec_instruction.size(); ++j) { if (j == target + 1 && - vec_instruction[target].OpBase()->Type().find( - communication_op_prefix) != std::string::npos && - vec_instruction[j].OpBase()->Type().find(communication_op_prefix) != - std::string::npos) { + is_comm_op(vec_instruction[target].OpBase()->Type()) && + is_comm_op(vec_instruction[j].OpBase()->Type())) { VLOG(4) << "Found consecutive communication ops, " << vec_instruction[target].OpBase()->Type() << " -> " << vec_instruction[j].OpBase()->Type(); From 3749198e55867cfa1a40fd2e9226d1dfa5d61aac Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Tue, 12 Apr 2022 12:57:21 +0800 Subject: [PATCH 089/211] [KP] Add Logical/compare/bitwise registry & UT (#40802) * init commit no push * collect comile errors * bitwise UT * fix compile problem * cancel comments * restore miss deletion * fix compilation * fix UT * NO stash in multiple branch at the same times * fix error * combine .cu from gpu and kps * replace gpu by kps * fix by Chen-weihang * Revert "Fix kps compile error in Junhui logic compare bitwise" * fix backend test * rm comments Co-authored-by: Chen Weihang --- .../platform/device/xpu/xpu_op_kpfirst_list.h | 26 ++ paddle/phi/common/backend.h | 7 + paddle/phi/kernels/funcs/reduce_function.h | 24 +- paddle/phi/kernels/gpu/reduce.h | 3 +- .../kernels/{gpu => kps}/bitwise_kernel.cu | 26 +- .../kernels/{gpu => kps}/compare_kernel.cu | 37 +- .../kernels/{gpu => kps}/logical_kernel.cu | 17 +- .../phi/kernels/primitive/helper_primitives.h | 2 +- paddle/phi/tests/common/test_backend.cc | 4 + .../unittests/xpu/test_bitwise_op_xpu.py | 300 ++++++++++++ .../unittests/xpu/test_compare_op_xpu.py | 2 +- .../unittests/xpu/test_logical_op_xpu.py | 426 +++++++++--------- 12 files changed, 626 insertions(+), 248 deletions(-) rename paddle/phi/kernels/{gpu => kps}/bitwise_kernel.cu (84%) rename paddle/phi/kernels/{gpu => 
kps}/compare_kernel.cu (85%) rename paddle/phi/kernels/{gpu => kps}/logical_kernel.cu (86%) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index ce9b09f60ca35..f1742f8eb5a1f 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -59,6 +59,32 @@ XPUOpMap& get_kp_ops() { {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"thresholded_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + // bitwise logical & compare + {"bitwise_and", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"bitwise_or", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"bitwise_not", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"bitwise_xor", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, + + {"logical_and", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"logical_or", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"logical_not", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"logical_xor", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + + {"less_than", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"greater_than", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"greater_equal", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5543bee144b3b..bfa45869f5ff6 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -159,7 +159,14 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs + // to be converted + // to GPU in the GPU environment + return Backend::GPU; +#else return Backend::KPS; +#endif } else if (s == std::string("IPU")) { return Backend::IPU; } else { diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index a2d7fd9544dee..4eb6ba0310886 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -15,7 +15,8 @@ #pragma once // CUDA, XPU and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU_KP) #include #include @@ -34,7 +35,6 @@ namespace cub = hipcub; #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -52,7 +52,9 @@ namespace cub 
= hipcub; #define REDUCE_VEC_SIZE 4 namespace kps = phi::kps; - +#ifdef PADDLE_WITH_XPU_KP +using dim3 = phi::kps::dim3; +#endif namespace phi { namespace funcs { @@ -82,12 +84,14 @@ static inline std::vector GetDimStrides(const std::vector& dims, return strides; } +#ifndef PADDLE_WITH_XPU_KP // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { return block_dim >= kps::details::kReduceMaxThread ? kps::details::kReduceMaxThread : GetLastPow2(block_dim); } +#endif // check reduce rand is valid static inline void CheckReduceRank(int reduce_rank, int rank) { @@ -180,12 +184,12 @@ struct IndexCalculator { strides = details::VectorToArray(full_strides); reduce_strides = details::VectorToArray(cal_strides); #ifndef PADDLE_WITH_XPU_KP - std::vector cal_divmoders; + std::vector cal_divmoders; // namespace // fast divmod for (auto i : cal_strides) { - cal_divmoders.push_back(paddle::platform::FastDivMod(i)); + cal_divmoders.push_back(kps::details::FastDivMod(i)); } - divmoders = details::VectorToArray( + divmoders = details::VectorToArray( cal_divmoders); #endif } @@ -222,7 +226,7 @@ struct IndexCalculator { phi::Array strides; phi::Array reduce_strides; #ifndef PADDLE_WITH_XPU_KP - phi::Array divmoders; + phi::Array divmoders; #endif }; @@ -579,11 +583,11 @@ struct ReduceConfig { void SetBlockDim() { // init - int block_num = details::GetBlockDim(reduce_num); should_reduce_again = false; - dim3 block_dim(block_num, 1, 1); + dim3 block_dim; dim3 grid_dim(left_num, 1, 1); blocking_size = reduce_num; + #ifdef PADDLE_WITH_XPU_KP if (reduce_last_dim) { block_dim.x = 64; @@ -990,6 +994,7 @@ static void LaunchReduceKernel(const Tx* x_data, } } +#if !defined(PADDLE_WITH_XPU_KP) template class ReduceOp, @@ -1044,6 +1049,7 @@ CubTensorReduceImpl(const Tx* x_data, PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } +#endif // PADDLE_WITH_XPU_KP template #include #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/compare_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" +#endif namespace phi { @@ -53,6 +57,7 @@ inline void CompareKernelImpl(const Context& ctx, ctx, ins, &outs, axis, Functor()); } +#ifndef PADDLE_WITH_XPU_KP template inline void CompareAllKernelImpl(const Context& ctx, const DenseTensor& x, @@ -83,11 +88,22 @@ inline void CompareAllKernelImpl(const Context& ctx, funcs::ReduceKernel>( ctx, tmp, out, kps::IdentityFunctor(), reduce_dims); } +#endif } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(less_than, KPS, ALL_LAYOUT, phi::LessThanKernel, int) {} +PD_REGISTER_KERNEL(less_equal, KPS, ALL_LAYOUT, phi::LessEqualKernel, int) {} +PD_REGISTER_KERNEL(greater_than, KPS, ALL_LAYOUT, phi::GreaterThanKernel, int) { +} +PD_REGISTER_KERNEL( + greater_equal, KPS, ALL_LAYOUT, phi::GreaterEqualKernel, int) {} +PD_REGISTER_KERNEL(equal, KPS, ALL_LAYOUT, phi::EqualKernel, int) {} +PD_REGISTER_KERNEL(not_equal, KPS, ALL_LAYOUT, phi::NotEqualKernel, int) {} +#else PD_REGISTER_KERNEL(less_than, - GPU, + KPS, ALL_LAYOUT, phi::LessThanKernel, bool, @@ -97,7 +113,7 @@ PD_REGISTER_KERNEL(less_than, float, double) {} PD_REGISTER_KERNEL(less_equal, - GPU, + KPS, ALL_LAYOUT, phi::LessEqualKernel, bool, @@ -107,7 +123,7 @@ PD_REGISTER_KERNEL(less_equal, float, 
double) {} PD_REGISTER_KERNEL(greater_than, - GPU, + KPS, ALL_LAYOUT, phi::GreaterThanKernel, bool, @@ -117,7 +133,7 @@ PD_REGISTER_KERNEL(greater_than, float, double) {} PD_REGISTER_KERNEL(greater_equal, - GPU, + KPS, ALL_LAYOUT, phi::GreaterEqualKernel, bool, @@ -127,7 +143,7 @@ PD_REGISTER_KERNEL(greater_equal, float, double) {} PD_REGISTER_KERNEL(equal, - GPU, + KPS, ALL_LAYOUT, phi::EqualKernel, bool, @@ -137,7 +153,7 @@ PD_REGISTER_KERNEL(equal, float, double) {} PD_REGISTER_KERNEL(not_equal, - GPU, + KPS, ALL_LAYOUT, phi::NotEqualKernel, bool, @@ -148,7 +164,7 @@ PD_REGISTER_KERNEL(not_equal, double) {} PD_REGISTER_KERNEL(equal_all, - GPU, + KPS, ALL_LAYOUT, phi::EqualAllKernel, bool, @@ -156,3 +172,4 @@ PD_REGISTER_KERNEL(equal_all, int64_t, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu similarity index 86% rename from paddle/phi/kernels/gpu/logical_kernel.cu rename to paddle/phi/kernels/kps/logical_kernel.cu index 1c0bafc932ee8..b732d371ad1ef 100644 --- a/paddle/phi/kernels/gpu/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -10,11 +10,15 @@ // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and -// limitations under the License. +// limitation #include "paddle/phi/kernels/logical_kernel.h" - +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/phi/backends/xpu/xpu_context.h" +#else #include "paddle/phi/backends/gpu/gpu_context.h" +#endif + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/logical_functor.h" @@ -59,9 +63,15 @@ void LogicalNotKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {} +PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} +PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} +PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} +#else #define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ PD_REGISTER_KERNEL(logical_and, \ - GPU, \ + KPS, \ ALL_LAYOUT, \ phi::Logical##func_type##Kernel, \ float, \ @@ -76,3 +86,4 @@ REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not) REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor) +#endif diff --git a/paddle/phi/kernels/primitive/helper_primitives.h b/paddle/phi/kernels/primitive/helper_primitives.h index b0dd8c774f83a..85a9e0c420c38 100644 --- a/paddle/phi/kernels/primitive/helper_primitives.h +++ b/paddle/phi/kernels/primitive/helper_primitives.h @@ -23,7 +23,7 @@ struct dim3 { int y; int z; - explicit inline dim3(int split_x, int split_y = 1, int split_z = 1) { + explicit inline dim3(int split_x = 1, int split_y = 1, int split_z = 1) { x = split_x; y = split_y; z = split_z; diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index 5d6862c368c57..f93394f31df90 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -64,7 +64,11 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::GPUDNN, 
pexp::StringToBackend("GPUDNN")); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("KPS")); +#else EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); +#endif EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py new file mode 100644 index 0000000000000..9a1c9a61fff78 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py @@ -0,0 +1,300 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +################## TEST OP: BitwiseAnd ################## +class XPUTestBitwiseAnd(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bitwise_and' + + class XPUTestBitwiseAndBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'bitwise_and' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.bitwise_and(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestBitwiseAndCase1(XPUTestBitwiseAndBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseAndCase2(XPUTestBitwiseAndBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [4, 1] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseAndCase3(XPUTestBitwiseAndBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = 0 + self.high = 100 + + +support_types = get_xpu_op_support_types('bitwise_and') +for stype in support_types: + create_test_class(globals(), XPUTestBitwiseAnd, stype) + + +################## TEST OP: BitwiseOr ################## +class XPUTestBitwiseOr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bitwise_or' + + class XPUTestBitwiseOrBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + 
self.set_case() + + def set_case(self): + self.op_type = 'bitwise_or' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.bitwise_or(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestBitwiseOrCase1(XPUTestBitwiseOrBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseOrCase2(XPUTestBitwiseOrBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [4, 1] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseOrCase3(XPUTestBitwiseOrBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = 0 + self.high = 100 + + +support_types = get_xpu_op_support_types('bitwise_or') +for stype in support_types: + create_test_class(globals(), XPUTestBitwiseOr, stype) + + +################## TEST OP: BitwiseXor ################## +class XPUTestBitwiseXor(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bitwise_xor' + + class XPUTestBitwiseXorBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'bitwise_xor' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.bitwise_xor(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestBitwiseXorCase1(XPUTestBitwiseXorBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseXorCase2(XPUTestBitwiseXorBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [4, 1] + self.low = -100 + self.high = 100 + + class XPUTestBitwiseXorCase3(XPUTestBitwiseXorBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = 0 + self.high = 100 + + +support_types = get_xpu_op_support_types('bitwise_xor') +for stype in support_types: + create_test_class(globals(), XPUTestBitwiseXor, stype) + + +################## TEST OP: BitwiseNot ################## +class XPUTestBitwiseNot(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bitwise_not' + + class XPUTestBitwiseNotBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'bitwise_not' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + out = 
np.bitwise_not(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestBitwiseNotBool(XPUTestBitwiseNotBase): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'bitwise_not' + + x = np.random.choice([True, False], self.x_shape) + out = np.bitwise_not(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.bool + self.x_shape = [2, 3, 4, 5] + + +support_types = get_xpu_op_support_types('bitwise_not') +for stype in support_types: + create_test_class(globals(), XPUTestBitwiseNot, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py index 5496c53a420b9..32b27652f7692 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -65,7 +65,7 @@ def test_errors(self): globals()[cls_name] = Cls -for _type_name in {'float32', 'int32', 'int64'}: +for _type_name in {'int32'}: if _type_name == 'float64' and core.is_compiled_with_rocm(): _type_name = 'float32' diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py index 8338e99db006b..6b720b9717be4 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py @@ -13,232 +13,220 @@ # limitations under the License. 
from __future__ import print_function -from __future__ import print_function + import unittest import numpy as np import sys sys.path.append("..") -from paddle.fluid.op import Operator -import paddle.fluid.core as core -import paddle.fluid as fluid + import paddle +from op_test import OpTest from op_test_xpu import XPUOpTest -from paddle.static import Program, program_guard - -SUPPORTED_DTYPES = [ - bool, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 -] - -TEST_META_OP_DATA = [{ - 'op_str': 'logical_and', - 'binary_op': True -}, { - 'op_str': 'logical_or', - 'binary_op': True -}, { - 'op_str': 'logical_not', - 'binary_op': False -}] - -TEST_META_SHAPE_DATA = { - 'XDimLargerThanYDim1': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [4, 5] - }, - 'XDimLargerThanYDim2': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [4, 1] - }, - 'XDimLargerThanYDim3': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [1, 4, 1] - }, - 'XDimLargerThanYDim4': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [3, 4, 1] - }, - 'XDimLargerThanYDim5': { - 'x_shape': [2, 3, 1, 5], - 'y_shape': [3, 1, 1] - }, - 'XDimLessThanYDim1': { - 'x_shape': [4, 1], - 'y_shape': [2, 3, 4, 5] - }, - 'XDimLessThanYDim2': { - 'x_shape': [1, 4, 1], - 'y_shape': [2, 3, 4, 5] - }, - 'XDimLessThanYDim3': { - 'x_shape': [3, 4, 1], - 'y_shape': [2, 3, 4, 5] - }, - 'XDimLessThanYDim4': { - 'x_shape': [3, 1, 1], - 'y_shape': [2, 3, 1, 5] - }, - 'XDimLessThanYDim5': { - 'x_shape': [4, 5], - 'y_shape': [2, 3, 4, 5] - }, - 'Axis1InLargerDim': { - 'x_shape': [1, 4, 5], - 'y_shape': [2, 3, 1, 5] - }, - 'EqualDim1': { - 'x_shape': [10, 7], - 'y_shape': [10, 7] - }, - 'EqualDim2': { - 'x_shape': [1, 1, 4, 5], - 'y_shape': [2, 3, 1, 5] - } -} - -TEST_META_WRONG_SHAPE_DATA = { - 'ErrorDim1': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [3, 4] - }, - 'ErrorDim2': { - 'x_shape': [2, 3, 4, 5], - 'y_shape': [4, 3] - } -} - - -def run_static_xpu(x_np, y_np, op_str, binary_op=True): - paddle.enable_static() - startup_program = fluid.Program() - main_program = fluid.Program() - place = paddle.XPUPlace(0) - exe = fluid.Executor(place) - with fluid.program_guard(main_program, startup_program): - x = paddle.static.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - op = getattr(paddle, op_str) - feed_list = {'x': x_np} - if not binary_op: - res = op(x) - else: - y = paddle.static.data(name='y', shape=y_np.shape, dtype=y_np.dtype) - feed_list['y'] = y_np - res = op(x, y) - exe.run(startup_program) - static_result = exe.run(main_program, feed=feed_list, fetch_list=[res]) - return static_result - - -def run_dygraph_xpu(x_np, y_np, op_str, binary_op=True): - place = paddle.XPUPlace(0) - paddle.disable_static(place) - op = getattr(paddle, op_str) - x = paddle.to_tensor(x_np, dtype=x_np.dtype) - if not binary_op: - dygraph_result = op(x) - else: - y = paddle.to_tensor(y_np, dtype=y_np.dtype) - dygraph_result = op(x, y) - return dygraph_result - - -def np_data_generator(np_shape, dtype, *args, **kwargs): - if dtype == bool: - return np.random.choice(a=[True, False], size=np_shape).astype(bool) - else: - return np.random.randn(*np_shape).astype(dtype) - - -def test_xpu(unit_test, test_error=False): - for op_data in TEST_META_OP_DATA: - meta_data = dict(op_data) - np_op = getattr(np, meta_data['op_str']) - META_DATA = dict(TEST_META_SHAPE_DATA) - if test_error: - META_DATA = dict(TEST_META_WRONG_SHAPE_DATA) - for shape_data in META_DATA.values(): - for data_type in SUPPORTED_DTYPES: - meta_data['x_np'] = np_data_generator( - shape_data['x_shape'], dtype=data_type) - 
meta_data['y_np'] = np_data_generator( - shape_data['y_shape'], dtype=data_type) - if meta_data['binary_op'] and test_error: - # catch C++ Exception - unit_test.assertRaises(BaseException, run_static_xpu, - **meta_data) - continue - static_result = run_static_xpu(**meta_data) - dygraph_result = run_dygraph_xpu(**meta_data) - if meta_data['binary_op']: - np_result = np_op(meta_data['x_np'], meta_data['y_np']) - else: - np_result = np_op(meta_data['x_np']) - unit_test.assertTrue((static_result == np_result).all()) - unit_test.assertTrue((dygraph_result.numpy() == np_result).all( - )) - - -def test_type_error(unit_test, type_str_map): - def check_type(op_str, x, y, binary_op): - op = getattr(paddle, op_str) - error_type = ValueError - if isinstance(x, np.ndarray): - x = paddle.to_tensor(x) - y = paddle.to_tensor(y) - error_type = BaseException - if binary_op: - if type_str_map['x'] != type_str_map['y']: - unit_test.assertRaises(error_type, op, x=x, y=y) - if not fluid._non_static_mode(): - error_type = TypeError - unit_test.assertRaises(error_type, op, x=x, y=y, out=1) - else: - if not fluid._non_static_mode(): - error_type = TypeError - unit_test.assertRaises(error_type, op, x=x, out=1) - - place = paddle.XPUPlace(0) - - for op_data in TEST_META_OP_DATA: - meta_data = dict(op_data) - binary_op = meta_data['binary_op'] - - paddle.disable_static(place) - x = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['x']) - y = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['y']) - check_type(meta_data['op_str'], x, y, binary_op) - - paddle.enable_static() - startup_program = paddle.static.Program() - main_program = paddle.static.Program() - with paddle.static.program_guard(main_program, startup_program): - x = paddle.static.data( - name='x', shape=[10], dtype=type_str_map['x']) - y = paddle.static.data( - name='y', shape=[10], dtype=type_str_map['y']) - check_type(meta_data['op_str'], x, y, binary_op) - - -def type_map_factory(): - return [{ - 'x': x_type, - 'y': y_type - } for x_type in SUPPORTED_DTYPES for y_type in SUPPORTED_DTYPES] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPU(unittest.TestCase): - def test(self): - test_xpu(self, True) - - def test_error(self): - test_xpu(self, True) - - def test_type_error(self): - type_map_list = type_map_factory() - for type_map in type_map_list: - test_type_error(self, type_map) +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +################## TEST OP: logical_and ################## +class XPUTestLogicalAnd(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logical_and' + + class XPUTestLogicalAndBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'logical_and' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.logical_and(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class 
XPUTestLogicalAndCase1(XPUTestLogicalAndBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + +support_types = get_xpu_op_support_types('logical_and') +for stype in support_types: + create_test_class(globals(), XPUTestLogicalAnd, stype) + + +################## TEST OP: logical_or ################## +class XPUTestLogicalOr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logical_or' + + class XPUTestLogicalOrBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'logical_or' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.logical_or(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestLogicalOrCase1(XPUTestLogicalOrBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + +support_types = get_xpu_op_support_types('logical_or') +for stype in support_types: + create_test_class(globals(), XPUTestLogicalOr, stype) + + +################## TEST OP: logical_xor ################## +class XPUTestLogicalXor(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logical_xor' + + class XPUTestLogicalXorBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'logical_xor' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + y = np.random.randint( + self.low, self.high, self.y_shape, dtype=self.dtype) + out = np.logical_xor(x, y) + + self.attrs = {'use_xpu': True} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def init_case(self): + self.dtype = np.int64 + self.x_shape = [2, 3, 4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestLogicalXorCase1(XPUTestLogicalXorBase): + def init_case(self): + self.dtype = np.int32 + self.x_shape = [4, 5] + self.y_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + + +support_types = get_xpu_op_support_types('logical_xor') +for stype in support_types: + create_test_class(globals(), XPUTestLogicalXor, stype) + + +################## TEST OP: LogicalNot ################## +class XPUTestLogicalNot(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logical_not' + + class XPUTestLogicalNotBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'logical_not' + + x = np.random.randint( + self.low, self.high, self.x_shape, dtype=self.dtype) + out = np.logical_not(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def init_case(self): + 
self.dtype = np.int32 + self.x_shape = [2, 3, 4, 5] + self.low = -100 + self.high = 100 + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +support_types = get_xpu_op_support_types('logical_not') +for stype in support_types: + create_test_class(globals(), XPUTestLogicalNot, stype) if __name__ == '__main__': unittest.main() From 7b627dd8487c2d06d0b40a1764f781f49078d980 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 12 Apr 2022 12:59:01 +0800 Subject: [PATCH 090/211] fix depthwise dnn bug (#41666) --- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 1 + paddle/phi/kernels/gpudnn/conv_kernel.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 3696ab08ea83e..9d4acb95ea48a 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -641,6 +641,7 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, + bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvCudnnGradKernel(dev_ctx, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index d40cbecaee6d5..3d3ab7b7a4e94 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -429,6 +429,7 @@ void DepthwiseConvCudnnKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, + bool fuse_relu, DenseTensor* out) { ConvCudnnKernel(dev_ctx, input, From c202a613a991161d35cfb8c218c04905b5c5ede8 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 12 Apr 2022 13:04:18 +0800 Subject: [PATCH 091/211] =?UTF-8?q?=E3=80=90heterps=E3=80=91datafeed=20put?= =?UTF-8?q?tofeedvec=20performance=20(#40168)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perform SlotRecordInMemoryDataFeed feedvec;test=develop --- paddle/fluid/framework/CMakeLists.txt | 10 +- paddle/fluid/framework/data_feed.cc | 331 +++++++++++++++++- paddle/fluid/framework/data_feed.cu | 149 ++++++++ paddle/fluid/framework/data_feed.h | 293 +++++++++++++++- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 18 +- paddle/fluid/framework/ps_gpu_worker.cc | 14 +- 6 files changed, 793 insertions(+), 22 deletions(-) create mode 100644 paddle/fluid/framework/data_feed.cu diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index fb4c9937611e7..1b9943df1b087 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -295,7 +295,7 @@ if(WITH_DISTRIBUTE) dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer @@ -316,7 +316,7 @@ if(WITH_DISTRIBUTE) dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc heter_pipeline_trainer.cc data_feed.cc 
device_worker.cc hogwild_worker.cc hetercpu_worker.cc - downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc + downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc data_feed.cu pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog index_sampler index_wrapper sampler index_dataset_proto @@ -339,7 +339,7 @@ if(WITH_DISTRIBUTE) dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method @@ -359,7 +359,7 @@ elseif(WITH_PSLIB) dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method @@ -369,7 +369,7 @@ else() dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 330f5ea52958f..3b6370e11851f 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2394,9 +2394,6 @@ bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, for (int j = 0; j < num; ++j) { uint64_t feasign = static_cast(strtoull(endptr, &endptr, 10)); - if (feasign == 0 && !used_slots_info_[info.used_idx].dense) { - continue; - } slot_fea.push_back(feasign); ++uint64_total_slot_num; } @@ -2419,8 +2416,21 @@ bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, return (uint64_total_slot_num > 0); } +void SlotRecordInMemoryDataFeed::AssignFeedVar(const Scope& scope) { + CheckInit(); + for (int i = 0; i < use_slot_size_; ++i) { + feed_vec_[i] = + scope.FindVar(used_slots_info_[i].slot)->GetMutable(); + } +} + void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, int num) { +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + paddle::platform::SetDeviceId(place_.GetDeviceId()); + 
pack_->pack_instance(ins_vec, num); + BuildSlotBatchGPU(pack_->ins_num()); +#else for (int j = 0; j < use_slot_size_; ++j) { auto& feed = feed_vec_[j]; if (feed == nullptr) { @@ -2497,6 +2507,7 @@ void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, feed_vec_[j]->set_lod(data_lod); } } +#endif } void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { @@ -2573,6 +2584,10 @@ bool SlotRecordInMemoryDataFeed::Start() { this->offset_index_ = 0; } this->finish_start_ = true; +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + CHECK(paddle::platform::is_gpu_place(this->place_)); + pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); +#endif return true; } @@ -2607,5 +2622,315 @@ int SlotRecordInMemoryDataFeed::Next() { #endif } +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) +void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { + int offset_cols_size = (ins_num + 1); + size_t slot_total_num = (use_slot_size_ * offset_cols_size); + pack_->resize_gpu_slot_offsets(slot_total_num * sizeof(size_t)); + + auto& value = pack_->value(); + const UsedSlotGpuType* used_slot_gpu_types = + static_cast(pack_->get_gpu_slots()); + FillSlotValueOffset(ins_num, use_slot_size_, + reinterpret_cast(pack_->gpu_slot_offsets()), + value.d_uint64_offset.data(), uint64_use_slot_size_, + value.d_float_offset.data(), float_use_slot_size_, + used_slot_gpu_types); + size_t* d_slot_offsets = reinterpret_cast(pack_->gpu_slot_offsets()); + + HostBuffer& offsets = pack_->offsets(); + offsets.resize(slot_total_num); + HostBuffer& h_tensor_ptrs = pack_->h_tensor_ptrs(); + h_tensor_ptrs.resize(use_slot_size_); + // alloc gpu memory + pack_->resize_tensor(); + + LoDTensor& float_tensor = pack_->float_tensor(); + LoDTensor& uint64_tensor = pack_->uint64_tensor(); + + int64_t float_offset = 0; + int64_t uint64_offset = 0; + + // copy index + CUDA_CHECK(cudaMemcpy(offsets.data(), d_slot_offsets, + slot_total_num * sizeof(size_t), + cudaMemcpyDeviceToHost)); + for (int j = 0; j < use_slot_size_; ++j) { + auto& feed = feed_vec_[j]; + if (feed == nullptr) { + h_tensor_ptrs[j] = nullptr; + continue; + } + + size_t* off_start_ptr = &offsets[j * offset_cols_size]; + + int total_instance = static_cast(off_start_ptr[offset_cols_size - 1]); + CHECK(total_instance >= 0) << "slot idx:" << j + << ", total instance:" << total_instance; + auto& info = used_slots_info_[j]; + + // fill slot value with default value 0 + if (info.type[0] == 'f') { // float + if (total_instance > 0) { + feed->ShareDataWith(float_tensor.Slice( + static_cast(float_offset), + static_cast(float_offset + total_instance))); + feed->Resize({total_instance, 1}); + float_offset += total_instance; + h_tensor_ptrs[j] = feed->mutable_data(this->place_); + } else { + h_tensor_ptrs[j] = + feed->mutable_data({total_instance, 1}, this->place_); + } + } else if (info.type[0] == 'u') { // uint64 + if (total_instance > 0) { + feed->ShareDataWith(uint64_tensor.Slice( + static_cast(uint64_offset), + static_cast(uint64_offset + total_instance))); + feed->Resize({total_instance, 1}); + uint64_offset += total_instance; + h_tensor_ptrs[j] = feed->mutable_data(this->place_); + } else { + h_tensor_ptrs[j] = + feed->mutable_data({total_instance, 1}, this->place_); + } + } + + if (info.dense) { + if (info.inductive_shape_index != -1) { + info.local_shape[info.inductive_shape_index] = + total_instance / info.total_dims_without_inductive; + } + feed->Resize(phi::make_ddim(info.local_shape)); + } else { + LoD& 
lod = (*feed->mutable_lod()); + lod.resize(1); + lod[0].resize(offset_cols_size); + paddle::framework::MixVector mixv_lod(&lod[0]); + memcpy(mixv_lod.MutableData(platform::CPUPlace()), off_start_ptr, + offset_cols_size * sizeof(size_t)); + } + } + void** dest_gpu_p = reinterpret_cast(pack_->slot_buf_ptr()); + CUDA_CHECK(cudaMemcpy(dest_gpu_p, h_tensor_ptrs.data(), + use_slot_size_ * sizeof(void*), + cudaMemcpyHostToDevice)); + + CopyForTensor(ins_num, use_slot_size_, dest_gpu_p, + (const size_t*)pack_->gpu_slot_offsets(), + (const uint64_t*)value.d_uint64_keys.data(), + (const int*)value.d_uint64_offset.data(), + (const int*)value.d_uint64_lens.data(), uint64_use_slot_size_, + (const float*)value.d_float_keys.data(), + (const int*)value.d_float_offset.data(), + (const int*)value.d_float_lens.data(), float_use_slot_size_, + used_slot_gpu_types); +} + +MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place, + const std::vector& infos) { + place_ = place; + stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + ins_num_ = 0; + pv_num_ = 0; + used_float_num_ = 0; + used_uint64_num_ = 0; + + used_slot_size_ = static_cast(infos.size()); + for (int i = 0; i < used_slot_size_; ++i) { + auto& info = infos[i]; + if (info.type[0] == 'u') { + gpu_used_slots_.push_back({1, info.slot_value_idx}); + ++used_uint64_num_; + } else { + gpu_used_slots_.push_back({0, info.slot_value_idx}); + ++used_float_num_; + } + } + copy_host2device(&gpu_slots_, gpu_used_slots_.data(), gpu_used_slots_.size()); + + slot_buf_ptr_ = memory::AllocShared(place_, used_slot_size_ * sizeof(void*)); + + int device_id = place_.GetDeviceId(); + VLOG(3) << "begin get batch pack device id: " << device_id; + // sync + CUDA_CHECK(cudaStreamSynchronize(stream_)); +} + +MiniBatchGpuPack::~MiniBatchGpuPack() {} + +void MiniBatchGpuPack::reset(const paddle::platform::Place& place) { + place_ = place; + stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + ins_num_ = 0; + pv_num_ = 0; +} + +void MiniBatchGpuPack::pack_all_data(const SlotRecord* ins_vec, int num) { + int uint64_total_num = 0; + int float_total_num = 0; + + buf_.h_uint64_lens.resize(num + 1); + buf_.h_uint64_lens[0] = 0; + buf_.h_float_lens.resize(num + 1); + buf_.h_float_lens[0] = 0; + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + uint64_total_num += r->slot_uint64_feasigns_.slot_values.size(); + buf_.h_uint64_lens[i + 1] = uint64_total_num; + float_total_num += r->slot_float_feasigns_.slot_values.size(); + buf_.h_float_lens[i + 1] = float_total_num; + } + + int uint64_cols = (used_uint64_num_ + 1); + buf_.h_uint64_offset.resize(uint64_cols * num); + buf_.h_uint64_keys.resize(uint64_total_num); + + int float_cols = (used_float_num_ + 1); + buf_.h_float_offset.resize(float_cols * num); + buf_.h_float_keys.resize(float_total_num); + + size_t fea_num = 0; + uint64_total_num = 0; + float_total_num = 0; + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + auto& uint64_feasigns = r->slot_uint64_feasigns_; + fea_num = uint64_feasigns.slot_values.size(); + if (fea_num > 0) { + memcpy(&buf_.h_uint64_keys[uint64_total_num], + uint64_feasigns.slot_values.data(), fea_num * sizeof(uint64_t)); + } + uint64_total_num += fea_num; + // copy uint64 offset + memcpy(&buf_.h_uint64_offset[i * uint64_cols], + uint64_feasigns.slot_offsets.data(), sizeof(int) * uint64_cols); + + auto& float_feasigns = r->slot_float_feasigns_; + fea_num = float_feasigns.slot_values.size(); + 
memcpy(&buf_.h_float_keys[float_total_num], + float_feasigns.slot_values.data(), fea_num * sizeof(float)); + float_total_num += fea_num; + + // copy float offset + memcpy(&buf_.h_float_offset[i * float_cols], + float_feasigns.slot_offsets.data(), sizeof(int) * float_cols); + } + + CHECK(uint64_total_num == static_cast(buf_.h_uint64_lens.back())) + << "uint64 value length error"; + CHECK(float_total_num == static_cast(buf_.h_float_lens.back())) + << "float value length error"; +} +void MiniBatchGpuPack::pack_uint64_data(const SlotRecord* ins_vec, int num) { + int uint64_total_num = 0; + + buf_.h_float_lens.clear(); + buf_.h_float_keys.clear(); + buf_.h_float_offset.clear(); + + buf_.h_uint64_lens.resize(num + 1); + buf_.h_uint64_lens[0] = 0; + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + uint64_total_num += r->slot_uint64_feasigns_.slot_values.size(); + buf_.h_uint64_lens[i + 1] = uint64_total_num; + } + + int uint64_cols = (used_uint64_num_ + 1); + buf_.h_uint64_offset.resize(uint64_cols * num); + buf_.h_uint64_keys.resize(uint64_total_num); + + size_t fea_num = 0; + uint64_total_num = 0; + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + auto& uint64_feasigns = r->slot_uint64_feasigns_; + fea_num = uint64_feasigns.slot_values.size(); + if (fea_num > 0) { + memcpy(&buf_.h_uint64_keys[uint64_total_num], + uint64_feasigns.slot_values.data(), fea_num * sizeof(uint64_t)); + } + uint64_total_num += fea_num; + // copy uint64 offset + memcpy(&buf_.h_uint64_offset[i * uint64_cols], + uint64_feasigns.slot_offsets.data(), sizeof(int) * uint64_cols); + } + CHECK(uint64_total_num == static_cast(buf_.h_uint64_lens.back())) + << "uint64 value length error"; +} +void MiniBatchGpuPack::pack_float_data(const SlotRecord* ins_vec, int num) { + int float_total_num = 0; + + buf_.h_uint64_lens.clear(); + buf_.h_uint64_offset.clear(); + buf_.h_uint64_keys.clear(); + + buf_.h_float_lens.resize(num + 1); + buf_.h_float_lens[0] = 0; + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + float_total_num += r->slot_float_feasigns_.slot_values.size(); + buf_.h_float_lens[i + 1] = float_total_num; + } + + int float_cols = (used_float_num_ + 1); + buf_.h_float_offset.resize(float_cols * num); + buf_.h_float_keys.resize(float_total_num); + + size_t fea_num = 0; + float_total_num = 0; + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + auto& float_feasigns = r->slot_float_feasigns_; + fea_num = float_feasigns.slot_values.size(); + memcpy(&buf_.h_float_keys[float_total_num], + float_feasigns.slot_values.data(), fea_num * sizeof(float)); + float_total_num += fea_num; + + // copy float offset + memcpy(&buf_.h_float_offset[i * float_cols], + float_feasigns.slot_offsets.data(), sizeof(int) * float_cols); + } + CHECK(float_total_num == static_cast(buf_.h_float_lens.back())) + << "float value length error"; +} + +void MiniBatchGpuPack::pack_instance(const SlotRecord* ins_vec, int num) { + ins_num_ = num; + batch_ins_ = ins_vec; + CHECK(used_uint64_num_ > 0 || used_float_num_ > 0); + // uint64 and float + if (used_uint64_num_ > 0 && used_float_num_ > 0) { + pack_all_data(ins_vec, num); + } else if (used_uint64_num_ > 0) { // uint64 + pack_uint64_data(ins_vec, num); + } else { // only float + pack_float_data(ins_vec, num); + } + // to gpu + transfer_to_gpu(); +} + +void MiniBatchGpuPack::transfer_to_gpu(void) { + copy_host2device(&value_.d_uint64_lens, buf_.h_uint64_lens); + copy_host2device(&value_.d_uint64_keys, buf_.h_uint64_keys); + copy_host2device(&value_.d_uint64_offset, 
buf_.h_uint64_offset); + + copy_host2device(&value_.d_float_lens, buf_.h_float_lens); + copy_host2device(&value_.d_float_keys, buf_.h_float_keys); + copy_host2device(&value_.d_float_offset, buf_.h_float_offset); + CUDA_CHECK(cudaStreamSynchronize(stream_)); +} +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu new file mode 100644 index 0000000000000..f9435ec2a32d8 --- /dev/null +++ b/paddle/fluid/framework/data_feed.cu @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +// CUDA: use 512 threads per block +const int CUDA_NUM_THREADS = 512; +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +// fill slot values +__global__ void FillSlotValueOffsetKernel( + const int ins_num, const int used_slot_num, size_t *slot_value_offsets, + const int *uint64_offsets, const int uint64_slot_size, + const int *float_offsets, const int float_slot_size, + const UsedSlotGpuType *used_slots) { + int col_num = ins_num + 1; + int uint64_cols = uint64_slot_size + 1; + int float_cols = float_slot_size + 1; + + CUDA_KERNEL_LOOP(slot_idx, used_slot_num) { + int value_off = slot_idx * col_num; + slot_value_offsets[value_off] = 0; + + auto &info = used_slots[slot_idx]; + if (info.is_uint64_value) { + for (int k = 0; k < ins_num; ++k) { + int pos = k * uint64_cols + info.slot_value_idx; + int num = uint64_offsets[pos + 1] - uint64_offsets[pos]; + PADDLE_ENFORCE(num >= 0, "The number of slot size must be ge 0."); + slot_value_offsets[value_off + k + 1] = + slot_value_offsets[value_off + k] + num; + } + } else { + for (int k = 0; k < ins_num; ++k) { + int pos = k * float_cols + info.slot_value_idx; + int num = float_offsets[pos + 1] - float_offsets[pos]; + PADDLE_ENFORCE(num >= 0, "The number of slot size must be ge 0."); + slot_value_offsets[value_off + k + 1] = + slot_value_offsets[value_off + k] + num; + } + } + } +} + +void SlotRecordInMemoryDataFeed::FillSlotValueOffset( + const int ins_num, const int used_slot_num, size_t *slot_value_offsets, + const int *uint64_offsets, const int uint64_slot_size, + const int *float_offsets, const int float_slot_size, + const UsedSlotGpuType *used_slots) { + auto stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(this->place_)) + ->stream(); + FillSlotValueOffsetKernel<<>>( + ins_num, used_slot_num, slot_value_offsets, uint64_offsets, + uint64_slot_size, float_offsets, float_slot_size, used_slots); + cudaStreamSynchronize(stream); +} + +__global__ void 
CopyForTensorKernel( + const int used_slot_num, const int ins_num, void **dest, + const size_t *slot_value_offsets, const uint64_t *uint64_feas, + const int *uint64_offsets, const int *uint64_ins_lens, + const int uint64_slot_size, const float *float_feas, + const int *float_offsets, const int *float_ins_lens, + const int float_slot_size, const UsedSlotGpuType *used_slots) { + int col_num = ins_num + 1; + int uint64_cols = uint64_slot_size + 1; + int float_cols = float_slot_size + 1; + + CUDA_KERNEL_LOOP(i, ins_num * used_slot_num) { + int slot_idx = i / ins_num; + int ins_idx = i % ins_num; + + uint32_t value_offset = slot_value_offsets[slot_idx * col_num + ins_idx]; + auto &info = used_slots[slot_idx]; + if (info.is_uint64_value) { + uint64_t *up = reinterpret_cast(dest[slot_idx]); + int index = info.slot_value_idx + uint64_cols * ins_idx; + int old_off = uint64_offsets[index]; + int num = uint64_offsets[index + 1] - old_off; + PADDLE_ENFORCE(num >= 0, "The number of slot size must be ge 0."); + int uint64_value_offset = uint64_ins_lens[ins_idx]; + for (int k = 0; k < num; ++k) { + up[k + value_offset] = uint64_feas[k + old_off + uint64_value_offset]; + } + } else { + float *fp = reinterpret_cast(dest[slot_idx]); + int index = info.slot_value_idx + float_cols * ins_idx; + int old_off = float_offsets[index]; + int num = float_offsets[index + 1] - old_off; + PADDLE_ENFORCE(num >= 0, "The number of slot size must be ge 0."); + int float_value_offset = float_ins_lens[ins_idx]; + for (int k = 0; k < num; ++k) { + fp[k + value_offset] = float_feas[k + old_off + float_value_offset]; + } + } + } +} + +void SlotRecordInMemoryDataFeed::CopyForTensor( + const int ins_num, const int used_slot_num, void **dest, + const size_t *slot_value_offsets, const uint64_t *uint64_feas, + const int *uint64_offsets, const int *uint64_ins_lens, + const int uint64_slot_size, const float *float_feas, + const int *float_offsets, const int *float_ins_lens, + const int float_slot_size, const UsedSlotGpuType *used_slots) { + auto stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(this->place_)) + ->stream(); + + CopyForTensorKernel<<>>( + used_slot_num, ins_num, dest, slot_value_offsets, uint64_feas, + uint64_offsets, uint64_ins_lens, uint64_slot_size, float_feas, + float_offsets, float_ins_lens, float_slot_size, used_slots); + cudaStreamSynchronize(stream); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index eb6ed2688095a..6f7f1dac52804 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -41,6 +41,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif DECLARE_int32(record_pool_max_size); DECLARE_int32(slotpool_thread_num); @@ -409,6 +413,266 @@ class CustomParser { } }; +struct UsedSlotGpuType { + int is_uint64_value; + int slot_value_idx; +}; + +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) +#define CUDA_CHECK(val) CHECK(val == gpuSuccess) +template +struct CudaBuffer { + T* cu_buffer; + uint64_t buf_size; + + CudaBuffer() { + cu_buffer = NULL; + buf_size = 0; + } + ~CudaBuffer() { free(); } + T* data() { return cu_buffer; } + uint64_t size() { return buf_size; } + void malloc(uint64_t size) { + buf_size = size; + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&cu_buffer), size * sizeof(T))); + } + void free() { + if (cu_buffer != NULL) { + CUDA_CHECK(cudaFree(cu_buffer)); + cu_buffer = NULL; + } + buf_size = 0; + } + void resize(uint64_t size) { + if (size <= buf_size) { + return; + } + free(); + malloc(size); + } +}; +template +struct HostBuffer { + T* host_buffer; + size_t buf_size; + size_t data_len; + + HostBuffer() { + host_buffer = NULL; + buf_size = 0; + data_len = 0; + } + ~HostBuffer() { free(); } + + T* data() { return host_buffer; } + const T* data() const { return host_buffer; } + size_t size() const { return data_len; } + void clear() { free(); } + T& back() { return host_buffer[data_len - 1]; } + + T& operator[](size_t i) { return host_buffer[i]; } + const T& operator[](size_t i) const { return host_buffer[i]; } + void malloc(size_t len) { + buf_size = len; + CUDA_CHECK(cudaHostAlloc(reinterpret_cast(&host_buffer), + buf_size * sizeof(T), cudaHostAllocDefault)); + CHECK(host_buffer != NULL); + } + void free() { + if (host_buffer != NULL) { + CUDA_CHECK(cudaFreeHost(host_buffer)); + host_buffer = NULL; + } + buf_size = 0; + } + void resize(size_t size) { + if (size <= buf_size) { + data_len = size; + return; + } + data_len = size; + free(); + malloc(size); + } +}; + +struct BatchCPUValue { + HostBuffer h_uint64_lens; + HostBuffer h_uint64_keys; + HostBuffer h_uint64_offset; + + HostBuffer h_float_lens; + HostBuffer h_float_keys; + HostBuffer h_float_offset; + + HostBuffer h_rank; + HostBuffer h_cmatch; + HostBuffer h_ad_offset; +}; + +struct BatchGPUValue { + CudaBuffer d_uint64_lens; + CudaBuffer d_uint64_keys; + CudaBuffer d_uint64_offset; + + CudaBuffer d_float_lens; + CudaBuffer d_float_keys; + CudaBuffer d_float_offset; + + CudaBuffer d_rank; + CudaBuffer d_cmatch; + CudaBuffer d_ad_offset; +}; + +class MiniBatchGpuPack { + public: + MiniBatchGpuPack(const paddle::platform::Place& place, + const std::vector& infos); + ~MiniBatchGpuPack(); + void reset(const paddle::platform::Place& place); + void pack_instance(const SlotRecord* ins_vec, int num); + int ins_num() { return ins_num_; } + int pv_num() { return pv_num_; } + BatchGPUValue& value() { return value_; } + BatchCPUValue& cpu_value() { return buf_; } + UsedSlotGpuType* get_gpu_slots(void) { + return reinterpret_cast(gpu_slots_.data()); + } + SlotRecord* get_records(void) { return &ins_vec_[0]; } + + // tensor gpu memory reused + void resize_tensor(void) { + if (used_float_num_ > 0) { + int float_total_len = buf_.h_float_lens.back(); + if (float_total_len > 0) { + float_tensor_.mutable_data({float_total_len, 1}, this->place_); + } + } + if (used_uint64_num_ > 0) { + int 
uint64_total_len = buf_.h_uint64_lens.back(); + if (uint64_total_len > 0) { + uint64_tensor_.mutable_data({uint64_total_len, 1}, + this->place_); + } + } + } + LoDTensor& float_tensor(void) { return float_tensor_; } + LoDTensor& uint64_tensor(void) { return uint64_tensor_; } + + HostBuffer& offsets(void) { return offsets_; } + HostBuffer& h_tensor_ptrs(void) { return h_tensor_ptrs_; } + + void* gpu_slot_offsets(void) { return gpu_slot_offsets_->ptr(); } + + void* slot_buf_ptr(void) { return slot_buf_ptr_->ptr(); } + + void resize_gpu_slot_offsets(const size_t slot_total_bytes) { + if (gpu_slot_offsets_ == nullptr) { + gpu_slot_offsets_ = memory::AllocShared(place_, slot_total_bytes); + } else if (gpu_slot_offsets_->size() < slot_total_bytes) { + auto buf = memory::AllocShared(place_, slot_total_bytes); + gpu_slot_offsets_.swap(buf); + buf = nullptr; + } + } + const std::string& get_lineid(int idx) { + if (enable_pv_) { + return ins_vec_[idx]->ins_id_; + } + return batch_ins_[idx]->ins_id_; + } + + private: + void transfer_to_gpu(void); + void pack_all_data(const SlotRecord* ins_vec, int num); + void pack_uint64_data(const SlotRecord* ins_vec, int num); + void pack_float_data(const SlotRecord* ins_vec, int num); + + public: + template + void copy_host2device(CudaBuffer* buf, const T* val, size_t size) { + if (size == 0) { + return; + } + buf->resize(size); + CUDA_CHECK(cudaMemcpyAsync(buf->data(), val, size * sizeof(T), + cudaMemcpyHostToDevice, stream_)); + } + template + void copy_host2device(CudaBuffer* buf, const HostBuffer& val) { + copy_host2device(buf, val.data(), val.size()); + } + + private: + paddle::platform::Place place_; + cudaStream_t stream_; + BatchGPUValue value_; + BatchCPUValue buf_; + int ins_num_ = 0; + int pv_num_ = 0; + + bool enable_pv_ = false; + int used_float_num_ = 0; + int used_uint64_num_ = 0; + int used_slot_size_ = 0; + + CudaBuffer gpu_slots_; + std::vector gpu_used_slots_; + std::vector ins_vec_; + const SlotRecord* batch_ins_ = nullptr; + + // uint64 tensor + LoDTensor uint64_tensor_; + // float tensor + LoDTensor float_tensor_; + // batch + HostBuffer offsets_; + HostBuffer h_tensor_ptrs_; + + std::shared_ptr gpu_slot_offsets_ = nullptr; + std::shared_ptr slot_buf_ptr_ = nullptr; +}; +class MiniBatchGpuPackMgr { + static const int MAX_DEIVCE_NUM = 16; + + public: + MiniBatchGpuPackMgr() { + for (int i = 0; i < MAX_DEIVCE_NUM; ++i) { + pack_list_[i] = nullptr; + } + } + ~MiniBatchGpuPackMgr() { + for (int i = 0; i < MAX_DEIVCE_NUM; ++i) { + if (pack_list_[i] == nullptr) { + continue; + } + delete pack_list_[i]; + pack_list_[i] = nullptr; + } + } + // one device one thread + MiniBatchGpuPack* get(const paddle::platform::Place& place, + const std::vector& infos) { + int device_id = place.GetDeviceId(); + if (pack_list_[device_id] == nullptr) { + pack_list_[device_id] = new MiniBatchGpuPack(place, infos); + } else { + pack_list_[device_id]->reset(place); + } + return pack_list_[device_id]; + } + + private: + MiniBatchGpuPack* pack_list_[MAX_DEIVCE_NUM]; +}; +// global mgr +inline MiniBatchGpuPackMgr& BatchGpuPackMgr() { + static MiniBatchGpuPackMgr mgr; + return mgr; +} +#endif + typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); class DLManager { @@ -1126,7 +1390,13 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { public: SlotRecordInMemoryDataFeed() {} - virtual ~SlotRecordInMemoryDataFeed() {} + virtual ~SlotRecordInMemoryDataFeed() { +#if 
defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + if (pack_ != nullptr) { + pack_ = nullptr; + } +#endif + } virtual void Init(const DataFeedDesc& data_feed_desc); virtual void LoadIntoMemory(); void ExpandSlotRecord(SlotRecord* ins); @@ -1149,6 +1419,23 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { } bool ParseOneInstance(const std::string& line, SlotRecord* rec); virtual void PutToFeedVec(const SlotRecord* ins_vec, int num); + virtual void AssignFeedVar(const Scope& scope); +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + void BuildSlotBatchGPU(const int ins_num); + void FillSlotValueOffset(const int ins_num, const int used_slot_num, + size_t* slot_value_offsets, + const int* uint64_offsets, + const int uint64_slot_size, const int* float_offsets, + const int float_slot_size, + const UsedSlotGpuType* used_slots); + void CopyForTensor(const int ins_num, const int used_slot_num, void** dest, + const size_t* slot_value_offsets, + const uint64_t* uint64_feas, const int* uint64_offsets, + const int* uint64_ins_lens, const int uint64_slot_size, + const float* float_feas, const int* float_offsets, + const int* float_ins_lens, const int float_slot_size, + const UsedSlotGpuType* used_slots); +#endif float sample_rate_ = 1.0f; int use_slot_size_ = 0; int float_use_slot_size_ = 0; @@ -1157,6 +1444,10 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { std::vector used_slots_info_; size_t float_total_dims_size_ = 0; std::vector float_total_dims_without_inductives_; + +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + MiniBatchGpuPack* pack_ = nullptr; +#endif }; class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index c7852de00a18e..e167a39caa526 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -271,13 +271,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } timeline.Pause(); - VLOG(1) << "GpuPs task add keys cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); gpu_task->UniqueKeys(); timeline.Pause(); - VLOG(1) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; if (!multi_mf_dim_) { for (int i = 0; i < thread_keys_shard_num_; i++) { @@ -667,7 +667,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { if (!multi_mf_dim_) { for (int i = 0; i < device_num; i++) { feature_keys_count[i] = gpu_task->device_keys_[i].size(); - VLOG(1) << i << " card contains feasign nums: " << feature_keys_count[i]; + VLOG(0) << i << " card contains feasign nums: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } } else { @@ -675,7 +675,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { for (int j = 0; j < multi_mf_dim_; j++) { feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); } - VLOG(1) << i << " card with dynamic mf contains feasign nums: " + VLOG(0) << i << " card with dynamic mf contains feasign nums: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } @@ -685,7 +685,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_ = nullptr; } if (size_max <= 0) { - VLOG(1) << "Skip build gpu ps cause feasign nums = " << size_max; + VLOG(0) << "Skip build gpu ps 
cause feasign nums = " << size_max; return; } std::vector threads(device_num); @@ -707,7 +707,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() + VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; } @@ -749,7 +749,7 @@ void PSGPUWrapper::pre_build_thread() { // build cpu ps data process PreBuildTask(gpu_task); timer.Pause(); - VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() << "s"; buildcpu_ready_channel_->Put(gpu_task); } @@ -768,13 +768,13 @@ void PSGPUWrapper::build_task() { return; } - VLOG(1) << "BuildPull start."; + VLOG(0) << "BuildPull start."; platform::Timer timer; timer.Start(); BuildPull(gpu_task); BuildGPUTask(gpu_task); timer.Pause(); - VLOG(1) << "BuildPull + BuildGPUTask end, cost time: " << timer.ElapsedSec() + VLOG(0) << "BuildPull + BuildGPUTask end, cost time: " << timer.ElapsedSec() << "s"; current_task_ = gpu_task; diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index dc8935587e99c..d98deb0f188dc 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -119,6 +119,7 @@ void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { } void PSGPUWorker::TrainFiles() { + VLOG(0) << "Begin to train files"; platform::SetNumThreads(1); platform::Timer timeline; timeline.Start(); @@ -129,6 +130,8 @@ void PSGPUWorker::TrainFiles() { device_reader_->Start(); int cur_batch; int batch_cnt = 0; + + platform::SetDeviceId(thread_id_); while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { @@ -190,14 +193,14 @@ void PSGPUWorker::TrainFiles() { writer_.Flush(); } timeline.Pause(); - VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " + VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; return; } void PSGPUWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); - VLOG(1) << "Begin to train files with profiler"; + VLOG(0) << "Begin to train files with profiler"; device_reader_->Start(); std::vector op_total_time; std::vector op_name; @@ -225,6 +228,7 @@ void PSGPUWorker::TrainFilesWithProfiler() { int total_ins_num = 0; int cur_batch; timeline.Start(); + platform::SetDeviceId(thread_id_); while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; timeline.Pause(); @@ -260,13 +264,15 @@ void PSGPUWorker::TrainFilesWithProfiler() { total_time += timeline.ElapsedSec(); timeline.Start(); } - VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time + VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time << " seconds, ins_num: " << total_ins_num; for (size_t i = 0; i < op_name.size(); ++i) { - VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] + VLOG(0) << "card:" << thread_id_ << ", op: " << op_name[i] << ", mean time: " << op_total_time[i] / total_ins_num << "s, totol time:" << op_total_time[i] << "sec"; } + VLOG(0) << "card: " << thread_id_ << " read time: " << read_time + << ", percent: " << read_time / total_time * 100; return; } From b01314c560dc3324d62dbfa790b922d556c3314e Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 12 Apr 2022 13:09:24 +0800 Subject: [PATCH 092/211] fix search sort bug (#41664) --- 
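Note on the searchsorted fix above: the added "data_type : sorted_sequence" line in the hunk below makes kernel selection follow the dtype of the sorted sequence rather than the dtype of the values being searched. A minimal illustration of the user-facing call, assuming any Paddle build that ships paddle.searchsorted; the concrete numbers are only for demonstration:

    import paddle

    # Kernel dtype now follows sorted_sequence, so the dtype of `values`
    # no longer drives dispatch.
    sorted_sequence = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0])
    values = paddle.to_tensor([2.0, 6.0])
    idx = paddle.searchsorted(sorted_sequence, values)
    print(idx.numpy())  # [1 3]
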
python/paddle/utils/code_gen/api.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 96bb3aafa5085..7e6740b6cb178 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1713,6 +1713,7 @@ func : SearchsortedInferMeta kernel : func : searchsorted + data_type : sorted_sequence # segment_pool - api : segment_pool From 606848af4a9062209179b5424a34fc310fd85884 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 12 Apr 2022 13:25:39 +0800 Subject: [PATCH 093/211] [Eager] Fix multiprocessing eager mode global issue (#41645) --- .../fluid/tests/unittests/test_paddle_multiprocessing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py index 7825b13001f28..3fc06e3c8dff7 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py @@ -19,13 +19,16 @@ import time import paddle import paddle.incubate.multiprocessing as mp -from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, in_dygraph_mode, _enable_legacy_dygraph REPEAT = 20 HAS_SHM_FILES = os.path.isdir('/dev/shm') def fill_tensor(queue, event): + # make sure run in legacy dygraph + if in_dygraph_mode(): + _enable_legacy_dygraph() data = queue.get() with paddle.no_grad(): data[0][:] = 5 From 0835de7941a9363a6f23530a201436c9adee6014 Mon Sep 17 00:00:00 2001 From: Yanxing Shi <48111042+Yanxing-Shi@users.noreply.github.com> Date: Tue, 12 Apr 2022 14:30:38 +0800 Subject: [PATCH 094/211] add ParallelMode docs (#41326) --- python/paddle/distributed/__init__.py | 2 ++ python/paddle/distributed/fleet/base/topology.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index fdb7a3b2cb447..50e4f7285b169 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -25,6 +25,7 @@ from paddle.distributed.fleet.dataset import InMemoryDataset # noqa: F401 from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401 +from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401 from .collective import broadcast # noqa: F401 from .collective import all_reduce # noqa: F401 @@ -86,4 +87,5 @@ "wait", "get_rank", "ProbabilityEntry", + "ParallelMode", ] diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 5b8d185212c23..ef34fd144a703 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -27,6 +27,22 @@ class ParallelMode(object): + """ + There are all the parallel modes currently supported: + - DATA_PARALLEL: Distribute input data to different devices. + - TENSOR_PARALLEL: Shards tensors in the network to different devices. + - PIPELINE_PARALLEL: Place different layers of the network on different devices. + - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states + corresponding to the parameters to each device. + + Examples: + .. 
code-block:: python + + import paddle + parallel_mode = paddle.distributed.ParallelMode + print(parallel_mode.DATA_PARALLEL) # 0 + + """ DATA_PARALLEL = 0 TENSOR_PARALLEL = 1 PIPELINE_PARALLEL = 2 From de49a4b7534682bff8a34767cae11135d5361837 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 12 Apr 2022 14:39:14 +0800 Subject: [PATCH 095/211] exchange assign and assign_raw kernel name (#41625) * exchange assign and assign_raw kernel name * fix register error --- paddle/phi/kernels/assign_kernel.cc | 28 +++++++++++----------- paddle/phi/kernels/assign_kernel.h | 12 +++++----- paddle/phi/ops/compat/assign_sig.cc | 4 ++-- python/paddle/utils/code_gen/api.yaml | 2 +- python/paddle/utils/code_gen/backward.yaml | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 5eafc869fa551..720ebb5b78c9a 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -23,22 +23,22 @@ namespace phi { template -void AssignRawKernel(const Context& dev_ctx, - const DenseTensor& x, - DenseTensor* out) { +void AssignKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { Copy(dev_ctx, x, x.place(), false, out); } template -void AssignKernel(const Context& dev_ctx, - paddle::optional x, - DenseTensor* out) { +void AssignRawKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out) { if (x) { if (!x->IsInitialized()) { return; } auto& x_tensor = *x.get_ptr(); - AssignRawKernel(dev_ctx, x_tensor, out); + AssignKernel(dev_ctx, x_tensor, out); } } @@ -111,14 +111,14 @@ void AssignValueKernel(const Context& dev_ctx, } // namespace phi +PD_REGISTER_GENERAL_KERNEL( + assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} + PD_REGISTER_GENERAL_KERNEL(assign_raw, CPU, ALL_LAYOUT, phi::AssignRawKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL( - assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { + ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } PD_REGISTER_GENERAL_KERNEL(assign_array, @@ -136,13 +136,13 @@ PD_REGISTER_KERNEL(assign_value, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} PD_REGISTER_GENERAL_KERNEL(assign_raw, GPU, ALL_LAYOUT, phi::AssignRawKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { + ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } PD_REGISTER_GENERAL_KERNEL(assign_array, diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 437a2a0c189e8..6881ac9f0ee22 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -22,17 +22,17 @@ namespace phi { template -void AssignRawKernel(const Context& dev_ctx, - const DenseTensor& x, - DenseTensor* out); +void AssignKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // this looks weird template -void AssignKernel(const Context& dev_ctx, - paddle::optional x, - DenseTensor* out); +void AssignRawKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out); template void AssignArrayKernel(const Context& dev_ctx, diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc index 
d149e8e6a9aa0..c8cd9e44ff9ae 100644 --- a/paddle/phi/ops/compat/assign_sig.cc +++ b/paddle/phi/ops/compat/assign_sig.cc @@ -23,10 +23,10 @@ KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) { } else if (ctx.IsSelectedRowsInput("X")) { return KernelSignature("assign_sr", {"X"}, {}, {"Out"}); } else { - return KernelSignature("assign", {"X"}, {}, {"Out"}); + return KernelSignature("assign_raw", {"X"}, {}, {"Out"}); } } else { - return KernelSignature("assign", {"X"}, {}, {"Out"}); + return KernelSignature("assign_raw", {"X"}, {}, {"Out"}); } } diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 7e6740b6cb178..b20259d3ebd25 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -174,7 +174,7 @@ infer_meta : func : UnchangedInferMeta kernel : - func : assign_raw + func : assign backward : assign_grad # atan diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index e2c5a970af17f..78f4ac7c985fb 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -120,7 +120,7 @@ func : UnchangedInferMeta param : [out_grad] kernel : - func : assign_raw + func : assign - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) From be4a20774f1e07a9a174b68c074582d6eaa96dda Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Tue, 12 Apr 2022 14:47:34 +0800 Subject: [PATCH 096/211] add python share_data interface (#41626) * add python share_data interface * Update inference_api.cc * Update inference_api.cc * add python share_data interface --- paddle/fluid/inference/api/paddle_api.h | 1 + paddle/fluid/pybind/inference_api.cc | 40 ++++++++++++++++++++++++ python/paddle/fluid/inference/wrapper.py | 13 ++++++++ 3 files changed, 54 insertions(+) diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 657dd9b600cce..0f8f9e0a975ba 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -36,6 +36,7 @@ namespace paddle { using PaddleDType = paddle_infer::DataType; using PaddlePlace = paddle_infer::PlaceType; +using PaddleDataLayout = paddle_infer::DataLayout; /// \brief Memory manager for PaddleTensor. 
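The assign/assign_raw kernel-name exchange above is internal plumbing for the phi dispatch layer; the Python-facing paddle.assign API keeps its behaviour. A quick sanity sketch, illustrative only and not part of the patch:

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.arange(6, dtype="float32").reshape([2, 3]))
    y = paddle.assign(x)  # still performs a plain copy after the rename
    assert np.array_equal(y.numpy(), x.numpy())
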
/// diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index c8f0acd0b8a85..97f3722008769 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -76,6 +76,7 @@ using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::PaddleBuf; using paddle::PaddleDType; +using paddle::PaddleDataLayout; using paddle::PaddlePassBuilder; using paddle::PaddlePlace; using paddle::PaddlePredictor; @@ -85,6 +86,7 @@ using paddle::ZeroCopyTensor; namespace { void BindPaddleDType(py::module *m); +void BindPaddleDataLayout(py::module *m); void BindPaddleBuf(py::module *m); void BindPaddleTensor(py::module *m); void BindPaddlePlace(py::module *m); @@ -211,6 +213,34 @@ void PaddleInferTensorCreate( tensor.CopyFromCpu(static_cast(data.data())); } +paddle_infer::PlaceType ToPaddleInferPlace( + phi::AllocationType allocation_type) { + if (allocation_type == phi::AllocationType::CPU) { + return paddle_infer::PlaceType::kCPU; + } else if (allocation_type == phi::AllocationType::GPU) { + return paddle_infer::PlaceType::kGPU; + } else { + return paddle_infer::PlaceType::kCPU; + } +} + +void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT + framework::Tensor input_tensor) { + std::vector shape; + for (int i = 0; i < input_tensor.dims().size(); ++i) { + shape.push_back(input_tensor.dims()[i]); + } + if (input_tensor.dtype() == phi::DataType::FLOAT32) { + tensor.ShareExternalData( + static_cast(input_tensor.data()), shape, + ToPaddleInferPlace(input_tensor.place().GetType())); + } else if (input_tensor.dtype() == phi::DataType::FLOAT16) { + tensor.ShareExternalData( + static_cast(input_tensor.data()), shape, + ToPaddleInferPlace(input_tensor.place().GetType())); + } +} + /// \brief Experimental interface. /// Create the Strings tensor from data. /// \param tensor The tensor will be created and @@ -327,6 +357,7 @@ void CopyPaddleInferTensor(paddle_infer::Tensor &dst, // NOLINT void BindInferenceApi(py::module *m) { BindPaddleDType(m); + BindPaddleDataLayout(m); BindPaddleBuf(m); BindPaddleTensor(m); BindPaddlePlace(m); @@ -372,6 +403,14 @@ void BindPaddleDType(py::module *m) { .value("INT32", PaddleDType::INT32); } +void BindPaddleDataLayout(py::module *m) { + py::enum_(*m, "PaddleDataLayout") + .value("UNK", PaddleDataLayout::kUNK) + .value("Any", PaddleDataLayout::kAny) + .value("NHWC", PaddleDataLayout::kNHWC) + .value("NCHW", PaddleDataLayout::kNCHW); +} + void BindPaddleBuf(py::module *m) { py::class_(*m, "PaddleBuf") .def(py::init()) @@ -817,6 +856,7 @@ void BindPaddleInferTensor(py::module *m) { .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate) + .def("share_external_data_bind", &PaddleInferShareExternalData) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 6576ca785b6e1..950a89d08bcb9 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -14,6 +14,7 @@ from ..core import AnalysisConfig, PaddleDType, PaddlePlace from ..core import PaddleInferPredictor, PaddleInferTensor +from .. import core import numpy as np @@ -39,4 +40,16 @@ def tensor_copy_from_cpu(self, data): ) +def tensor_share_external_data(self, data): + ''' + Support input type check based on tensor.share_external_data. 
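The share_external_data binding added above lets an inference input handle reuse the memory of an existing LoDTensor instead of copying through copy_from_cpu; the C++ side currently covers float32 and float16 data. A hedged usage sketch: the model/param paths and the input shape are placeholders, not part of this patch:

    import numpy as np
    import paddle
    import paddle.inference as paddle_infer
    from paddle.fluid import core

    # Placeholder model files; substitute a real serialized model.
    config = paddle_infer.Config("model.pdmodel", "model.pdiparams")
    predictor = paddle_infer.create_predictor(config)
    handle = predictor.get_input_handle(predictor.get_input_names()[0])

    # Wrap host data in a LoDTensor and hand it over without an extra copy.
    data = np.random.rand(1, 3, 224, 224).astype("float32")
    tensor = core.LoDTensor()
    tensor.set(data, core.CPUPlace())
    handle.share_external_data(tensor)
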
+ ''' + if isinstance(data, core.LoDTensor): + self.share_external_data_bind(data) + else: + raise TypeError( + "In share_external_data, we only support LoDTensor data type.") + + Tensor.copy_from_cpu = tensor_copy_from_cpu +Tensor.share_external_data = tensor_share_external_data From 43d5cca64b1f3a7a3335270a6ac885eb50799fe4 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 12 Apr 2022 15:28:17 +0800 Subject: [PATCH 097/211] Add layer norm yaml (#41589) * add layer norm infermeta * add layer norm yaml * polish layer norm infer meta * add layer norm to black list --- paddle/phi/infermeta/ternary.cc | 97 +++++++++++++++++++ paddle/phi/infermeta/ternary.h | 18 ++++ .../phi/kernels/cpu/layer_norm_grad_kernel.cc | 4 +- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/layer_norm_grad_kernel.h | 4 +- paddle/phi/ops/compat/layer_norm_sig.cc | 2 +- python/paddle/fluid/dygraph/nn.py | 17 +++- .../tests/unittests/test_layer_norm_op_v2.py | 49 ++++++++-- python/paddle/nn/functional/norm.py | 8 +- python/paddle/utils/code_gen/api.yaml | 10 ++ python/paddle/utils/code_gen/backward.yaml | 12 +++ tools/infrt/skipped_phi_api.json | 2 +- 12 files changed, 205 insertions(+), 22 deletions(-) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index c692b6c8fcd13..ae8c7dd61c3bb 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -259,6 +259,103 @@ void GraphSendRecvInferMeta(const MetaTensor& x, } } +void LayerNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + MetaTensor* out, + MetaTensor* mean, + MetaTensor* variance, + MetaConfig config) { + auto x_dim = x.dims(); + PADDLE_ENFORCE_LT( + begin_norm_axis, + x_dim.size(), + phi::errors::InvalidArgument( + "'begin_norm_axis' must be less than the dimensions of X," + "But received 'begin_norm_axis' is [%d]," + "received the dimensions of X is [%d].", + begin_norm_axis, + x_dim.size())); + + auto matrix_dim = phi::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + if (scale.get_ptr() != nullptr) { + PADDLE_ENFORCE_EQ(scale->dims().size(), + 1, + phi::errors::InvalidArgument( + "The dimensions of Input(Scale) must be 1, but " + "received dimensions of" + "Input(Scale) is [%d]", + scale->dims().size())); + } + + if (config.is_runtime && scale.get_ptr() != nullptr) { + PADDLE_ENFORCE_EQ( + scale->dims()[0], + right, + phi::errors::InvalidArgument( + "The first dimension value of Input(Scale) must equal to be the" + "second dimension value of the flattened 2D matrix of Input(X)," + "But received the first dimension value of Input(Scale) is" + "[%d], the second dimension value of the flattened 2D matrix of" + " Input(Scale) is [%d].", + scale->dims()[0], + right)); + } + if (bias.get_ptr() != nullptr) { + PADDLE_ENFORCE_EQ(bias->dims().size(), + 1, + phi::errors::InvalidArgument( + "The dimensions of Input(Bias) must be 1, but " + "received dimensions of" + "Input(Bias) is [%d]", + bias->dims().size())); + } + if (config.is_runtime && bias.get_ptr() != nullptr) { + PADDLE_ENFORCE_EQ( + bias->dims()[0], + right, + phi::errors::InvalidArgument( + "The first dimension value of Input(Bias) must equal to be the" + "second dimension value of the flattened 2D matrix of Input(X)," + "But received the first dimension value of Input(Bias) is" + "[%d], the second dimension value of the 
flattened 2D matrix of" + " Input(Bias) is [%d].", + bias->dims()[0], + right)); + } + + out->set_dims(x_dim); + if (mean) { + mean->set_dims({left}); + } + if (variance) { + variance->set_dims({left}); + } + out->share_lod(x); +} + +void LayerNormGradInferMeta(const MetaTensor& x, + paddle::optional y, + paddle::optional z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz) { + if (dx) { + dx->share_meta(x); + } + if (dy && (y.get_ptr() != nullptr)) { + dy->share_meta(*y.get_ptr()); + } + if (dz && (z.get_ptr() != nullptr)) { + dz->share_meta(*z.get_ptr()); + } +} + void LerpInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 83505f2c2fada..4f561e0adf19d 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -60,6 +60,24 @@ void GraphSendRecvInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* dst_count); +void LayerNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + MetaTensor* out, + MetaTensor* mean, + MetaTensor* variance, + MetaConfig config = MetaConfig()); + +void LayerNormGradInferMeta(const MetaTensor& x, + paddle::optional y, + paddle::optional z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz); + void LerpInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index cee48ed96db1c..7c1b33f047b61 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -32,10 +32,10 @@ namespace phi { template void LayerNormGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& mean, - const DenseTensor& variance, paddle::optional scale_opt, paddle::optional bias_opt, + const DenseTensor& mean, + const DenseTensor& variance, const DenseTensor& out_grad, float epsilon, int begin_norm_axis, diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index c3f7a5261712a..146d307a59380 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -24,10 +24,10 @@ namespace phi { template void LayerNormGradKernel(const Context &dev_ctx, const DenseTensor &x, - const DenseTensor &mean, - const DenseTensor &variance, paddle::optional scale_opt, paddle::optional bias_opt, + const DenseTensor &mean, + const DenseTensor &variance, const DenseTensor &out_grad, float epsilon, int begin_norm_axis, diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h index c32be63db4178..65f19a11b94d6 100644 --- a/paddle/phi/kernels/layer_norm_grad_kernel.h +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -21,10 +21,10 @@ namespace phi { template void LayerNormGradKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& mean, - const DenseTensor& variance, paddle::optional scale, paddle::optional bias, + const DenseTensor& mean, + const DenseTensor& variance, const DenseTensor& out_grad, float epsilon, int begin_norm_axis, diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc index 17a81e9ec012f..4151b9e94fbdc 100644 --- a/paddle/phi/ops/compat/layer_norm_sig.cc +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -27,7 +27,7 @@ KernelSignature 
LayerNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "layer_norm_grad", - {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")}, + {"X", "Scale", "Bias", "Mean", "Variance", GradVarName("Y")}, {"epsilon", "begin_norm_axis", "is_test"}, {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); } diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 89fcbe1a5d18d..a3310f1a46ce4 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1827,11 +1827,18 @@ def forward(self, input): 1:] + ', but got input shape ' + str(input_shape)) if _non_static_mode(): - pre_act, _, _ = _C_ops.layer_norm( - input, self.weight, self.bias, 'epsilon', self._epsilon, - 'begin_norm_axis', self._begin_norm_axis) - return dygraph_utils._append_activation_in_dygraph( - pre_act, act=self._act) + if in_dygraph_mode(): + pre_act, _, _, = _C_ops.final_state_layer_norm( + input, self.weight, self.bias, self._epsilon, + self._begin_norm_axis, False) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=self._act) + else: + pre_act, _, _ = _C_ops.layer_norm( + input, self.weight, self.bias, 'epsilon', self._epsilon, + 'begin_norm_axis', self._begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=self._act) check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'LayerNorm') diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py index 987c3da4dd7be..85c6694324d25 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py @@ -19,7 +19,7 @@ from paddle.fluid.op import Operator import paddle.fluid as fluid from op_test import OpTest, _set_use_system_allocator -from paddle.fluid.framework import grad_var_name +from paddle.fluid.framework import grad_var_name, _test_eager_guard import paddle.fluid as fluid from paddle.fluid import Program, program_guard import paddle @@ -36,13 +36,13 @@ def test_dygraph(self): def compute_v1(x): with fluid.dygraph.guard(p): ln = fluid.dygraph.LayerNorm(shape[1:]) - y = ln(fluid.dygraph.to_variable(x)) + y = ln(paddle.to_tensor(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): ln = paddle.nn.LayerNorm(shape[1:]) - y = ln(fluid.dygraph.to_variable(x)) + y = ln(paddle.to_tensor(x)) return y.numpy() x = np.random.randn(*shape).astype("float32") @@ -50,6 +50,38 @@ def compute_v2(x): y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2)) + def test_eager(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + ln = fluid.dygraph.LayerNorm(shape[1:]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = ln(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_v2(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + ln = paddle.nn.LayerNorm(shape[1:]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = ln(x1) + y.backward() + return y.numpy(), x1.gradient() + + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_v1(x) + y2, g2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(g1, g2)) + def test_static(self): paddle.enable_static() places = [fluid.CPUPlace()] @@ -94,30 +126,30 
@@ def test_dygraph(self): def compute_v0(x): with fluid.dygraph.guard(p): ln = fluid.dygraph.LayerNorm(shape[1:]) - y = ln(fluid.dygraph.to_variable(x)) + y = ln(paddle.to_tensor(x)) return y.numpy() def compute_v1(x): with fluid.dygraph.guard(p): - x = fluid.dygraph.to_variable(x) + x = paddle.to_tensor(x) y = paddle.nn.functional.layer_norm(x, shape[1:]) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): - x = fluid.dygraph.to_variable(x) + x = paddle.to_tensor(x) y = paddle.nn.functional.layer_norm(x, tuple(shape[1:])) return y.numpy() def compute_v3(x): with fluid.dygraph.guard(p): ln = fluid.dygraph.LayerNorm(shape[-1]) - y = ln(fluid.dygraph.to_variable(x)) + y = ln(paddle.to_tensor(x)) return y.numpy() def compute_v4(x): with fluid.dygraph.guard(p): - x = fluid.dygraph.to_variable(x) + x = paddle.to_tensor(x) y = paddle.nn.functional.layer_norm(x, shape[-1]) return y.numpy() @@ -139,4 +171,5 @@ def compute_v4(x): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 1a5fc109805e0..e719099b4b39d 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -318,7 +318,13 @@ def layer_norm(x, str_normalized_shape[ 1:] + ', but got input shape ' + str(input_shape)) - if in_dynamic_mode(): + if in_dygraph_mode(): + pre_act, _, _, = _C_ops.final_state_layer_norm(x, weight, bias, epsilon, + begin_norm_axis, False) + + return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) + + if _in_legacy_dygraph(): pre_act, _, _ = _C_ops.layer_norm(x, weight, bias, 'epsilon', epsilon, 'begin_norm_axis', begin_norm_axis) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index b20259d3ebd25..e3d8e8f5f47a5 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1002,6 +1002,16 @@ optional : prior_dist backward : label_smooth_grad +- api : layer_norm + args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) + output : Tensor(out), Tensor(mean), Tensor(variance) + infer_meta : + func : LayerNormInferMeta + kernel : + func : layer_norm + backward : layer_norm_grad + optional : scale, bias + # leaky_relu - api : leaky_relu args : (Tensor x, float alpha) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 78f4ac7c985fb..f8366744bdbe6 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -723,6 +723,18 @@ func : label_smooth_grad optional : prior_dist +- backward_api : layer_norm_grad + forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) -> Tensor(out), Tensor(mean), Tensor(variance) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis, bool is_test) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : LayerNormGradInferMeta + param : [x, scale, bias] + kernel : + func : layer_norm_grad + data_type : out_grad + optional : scale, bias + - backward_api : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out_grad, float alpha) diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index 
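With the layer_norm forward and backward entries registered in api.yaml and backward.yaml above, nn.LayerNorm and F.layer_norm dispatch to the final-state kernels under eager mode. A small sketch mirroring the new dygraph test; the shapes are illustrative:

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.random.randn(4, 10, 4, 4).astype("float32"))
    x.stop_gradient = False

    ln = paddle.nn.LayerNorm(x.shape[1:])  # normalize over the trailing dims
    y = ln(x)
    y.backward()                           # exercises layer_norm_grad

    print(y.shape)                         # [4, 10, 4, 4]
    assert x.grad is not None
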
8e2dd0f65d7d5..b352240c6dcc5 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth"], +"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], "phi_kernels":["equal_all"] } From b861022aa55a397976329041378193caf35987c3 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 12 Apr 2022 15:33:57 +0800 Subject: [PATCH 098/211] strided_slice (#41573) * strided_slice * fix: compiler error because of size() * fix: warning * fix : warning * init input_shape * fix:forget punctuation --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/strided_slice_op.cc | 131 ++++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 2 + .../test_trt_convert_strided_slice.py | 120 ++++++++++++++++ 5 files changed, 255 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b7e811e4c64d6..d37e4a468cac0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1755,6 +1755,7 @@ USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(preln_skip_layernorm) +USE_TRT_CONVERTER(strided_slice) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 4f8aa4c14cd7b..f1800afcb1d26 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -23,6 +23,7 @@ nv_library(tensorrt_converter pool3d_op.cc deformable_conv_op.cc preln_emb_eltwise_layernorm.cc + strided_slice_op.cc preln_skip_layernorm.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc new file mode 100644 index 0000000000000..26046d38bcbd9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Stack converter from fluid to tensorRT. 
+ */ +class StridedSliceOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid StridedSlice op to tensorrt Slice layer"; + + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + nvinfer1::Dims input_dims = input->getDimensions(); + + std::vector axes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); + std::vector strides = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); + + nvinfer1::Dims start; + start.nbDims = input_dims.nbDims; + int axes_size = axes.size(); + for (int i = 0; i < start.nbDims; i++) { + start.d[i] = 0; + } + for (int i = 0; i < axes_size; i++) { + start.d[axes[i]] = starts[i]; + } + + nvinfer1::Dims stride; + stride.nbDims = input_dims.nbDims; + for (int i = 0; i < stride.nbDims; i++) { + stride.d[i] = 1; + } + for (int i = 0; i < axes_size; i++) { + stride.d[axes[i]] = strides[i]; + } + + nvinfer1::Dims size; + size.nbDims = input_dims.nbDims; + for (int i = 0; i < size.nbDims; i++) { + size.d[i] = 1; + } + + auto output_name = op_desc.Output("Out")[0]; + + auto create_weights = [&](const std::vector& data, + const std::string& type) -> int* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + + engine_->SetWeights(output_name + "_add_slice_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + + std::vector const_weight(input_dims.nbDims, 1); + for (int i = 0; i < axes_size; i++) { + const_weight[axes[i]] = strides[i]; + } + + int* weight_data = create_weights(const_weight, "size"); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + static_cast(weight_data), + static_cast(input_dims.nbDims)}; + + int input_dim_size = input_dims.nbDims; + nvinfer1::Dims input_shape; + input_shape.nbDims = 1; + input_shape.d[0] = input_dim_size; + + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + + auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + + auto size_layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *shape_layer->getOutput(0), + *const_layer->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); + + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); + layer->setInput(2, *size_layer->getOutput(0)); + + RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(strided_slice, StridedSliceOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6ccaf80c9f0dd..d9a874dd2b629 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -117,6 +117,7 @@ struct SimpleOpTypeSetTeller : public Teller { "multihead_matmul", "skip_layernorm", "slice", + "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm"}; std::unordered_set teller_set{ @@ -178,6 +179,7 @@ struct SimpleOpTypeSetTeller : public Teller { "multihead_matmul", 
"skip_layernorm", "slice", + "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", "multiclass_nms3"}; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py new file mode 100644 index 0000000000000..04eb3ab10ba7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertStridedSliceTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 56, 56, 192]).astype(np.float32) + + for axes in [[1, 2]]: + for starts in [[1, 1]]: + for ends in [[10000000, 10000000]]: + for decrease_axis in [[]]: + for infer_flags in [[1, 1]]: + for strides in [[2, 2]]: + dics = [{ + "axes": axes, + "starts": starts, + "ends": ends, + "decrease_axis": decrease_axis, + "infer_flags": infer_flags, + "strides": strides + }] + + ops_config = [{ + "op_type": "strided_slice", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Out": ["slice_output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, + dics)) + }, + outputs=["slice_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 56, 56, 192] + } + self.dynamic_shape.max_input_shape = { + "input_data": [8, 56, 56, 192] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 56, 56, 192] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + inputs = program_config.inputs + + if dynamic_shape: + for i in range(len(attrs[0]["starts"])): + if attrs[0]["starts"][i] < 0 or attrs[0]["ends"][i] < 0: + return 0, 3 + if not dynamic_shape: + for x in attrs[0]["axes"]: + if x == 0: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + 
] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() From af17c888d1cd15c5fe96220c4a8d5f7f34a8171d Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 12 Apr 2022 15:39:49 +0800 Subject: [PATCH 099/211] [Eager] Fix test_imperative_optimizer_v2 eager mode global issue (#41633) --- python/paddle/optimizer/lr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 528fab1c2b98d..d0d5eef03c42c 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1360,6 +1360,8 @@ def step(self, metrics, epoch=None): if not _in_legacy_dygraph(): tmp = core.eager.Tensor else: + # need to declarate explicitly + from paddle.framework import VarBase as Tensor tmp = Tensor # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (tmp, numpy.ndarray)): From bb427a3db4c6c05dec3ee9f334a9a2d97d87e697 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 12 Apr 2022 16:10:27 +0800 Subject: [PATCH 100/211] use standalone executor for test_nn_grad/test_norm_nn_grad (#41574) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 3f640a73a55c5..d0126013dcf82 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1037,7 +1037,7 @@ set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120) set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120) set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 120 ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) +set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 180) set_tests_properties(test_elementwise_sub_op PROPERTIES TIMEOUT 120) set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) @@ -1076,7 +1076,7 @@ set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200) set_tests_properties(test_dyn_rnn PROPERTIES TIMEOUT 120) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) -set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 120 ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) +set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 180) set_tests_properties(test_matrix_nms_op PROPERTIES TIMEOUT 120) set_tests_properties(test_generator_dataloader PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) From 18f569c3c7fcecc27137e376122b6f9eb8443b19 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 12 Apr 2022 19:25:09 +0800 Subject: [PATCH 101/211] Replaced cp with copy in xpu_cmake (#41542) --- cmake/xpu_kp.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index 9047e6a9261ec..9cddbe1496478 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -128,7 +128,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND 
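For reference on the strided_slice TensorRT converter and its test added earlier in this series: the converter maps the op's axes/starts/ends/strides onto a TensorRT Slice layer whose size is derived from the input shape divided by the stride. The Python-level op it covers can be sketched as follows, with the test's attribute values shrunk to a small tensor:

    import paddle

    x = paddle.arange(64, dtype="float32").reshape([1, 4, 4, 4])
    # axes=[1, 2], starts=[1, 1], large ends, strides=[2, 2], as in the test.
    y = paddle.strided_slice(
        x, axes=[1, 2], starts=[1, 1], ends=[10000000, 10000000],
        strides=[2, 2])
    print(y.shape)  # [1, 2, 2, 4]
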
${CMAKE_COMMAND} -E make_directory kernel_build COMMAND - cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu -rf + ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu @@ -151,7 +151,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build COMMAND - cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu -rf + ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu From 4819ab4dc2248540b178e5e7aaf5df722f35cae3 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 12 Apr 2022 19:43:52 +0800 Subject: [PATCH 102/211] [Yaml]add gaussian_random yaml and test case (#41312) * add guassian random yaml * add gaussian_random yaml and test case * fix error modify of full yaml * import in_dygraph_mode * import _in_legacy_dygraph * add place arg in api * import __current_expected_place * fix test_egr_python_api failed case * add test case * add cast for NormalInitializer * fix test error * fix test error * rm unsed check code * fix test error in test_initializer_nn * modify by review --- python/paddle/fluid/initializer.py | 75 ++++++++++++++++--- python/paddle/fluid/layers/nn.py | 11 ++- .../tests/unittests/test_egr_python_api.py | 3 - .../unittests/test_gaussian_random_op.py | 11 +++ .../fluid/tests/unittests/test_initializer.py | 64 +++++++++++++++- .../tests/unittests/test_initializer_nn.py | 2 +- python/paddle/tensor/random.py | 10 ++- python/paddle/utils/code_gen/api.yaml | 12 +++ 8 files changed, 170 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b3baedc401504..ba5e51c11dd65 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -331,22 +331,56 @@ def __call__(self, var, block=None): ["uint16", "float16", "float32", "float64"], "guassian_random") + # to be compatible of fp16 initalizers + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['normal_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._seed == 0: self._seed = block.program.random_seed - if framework._non_static_mode(): + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + var.shape, self._mean, self._std_dev, self._seed, out_dtype, + place) + out_var._share_underline_tensor_to(var) + + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.final_state_cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + + if _in_legacy_dygraph(): out_var = _C_ops.gaussian_random( - 'shape', var.shape, 'dtype', var.dtype, 'mean', self._mean, + 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, 
'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) - out_var._share_underline_tensor_to(var) + + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) return None else: op = block.append_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": var.dtype, + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, @@ -354,6 +388,13 @@ def __call__(self, var, block=None): }, stop_gradient=True) + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op @@ -567,9 +608,15 @@ def __call__(self, var, block=None): self._seed, 'dtype', out_dtype) else: std = math.sqrt(2.0 / float(fan_in + fan_out)) - out_var = _C_ops.gaussian_random( - 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, - 'std', std, 'seed', self._seed) + + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + out_var.shape, 0.0, std, self._seed, out_dtype, place) + else: + out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, + 'std', std, 'seed', self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not self._uniform): @@ -720,9 +767,15 @@ def __call__(self, var, block=None): int(out_dtype)) else: std = math.sqrt(2.0 / float(fan_in)) - out_var = _C_ops.gaussian_random( - 'shape', out_var.shape, 'dtype', - int(out_dtype), 'mean', 0.0, 'std', std, 'seed', self._seed) + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.final_state_gaussian_random( + out_var.shape, 0.0, std, self._seed, out_dtype, place) + else: + out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', + int(out_dtype), 'mean', 0.0, 'std', std, 'seed', + self._seed) if var.dtype == VarDesc.VarType.FP16 or ( var.dtype == VarDesc.VarType.BF16 and not self._uniform): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a405bf829fd48..47f40a2e6a5af 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -28,6 +28,7 @@ from paddle.fluid.framework import _in_legacy_dygraph from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder, _non_static_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags, _in_legacy_dygraph, in_dygraph_mode +from ..framework import _current_expected_place from .. 
import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -10970,7 +10971,15 @@ def gaussian_random(shape, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _non_static_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + place = _current_expected_place() + return _C_ops.final_state_gaussian_random(shape, + float(mean), + float(std), seed, dtype, + place) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index e7abed0964679..ae29c6c262a84 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -251,9 +251,6 @@ def constructor(self, place): self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) - egr_tensor13 = paddle.randn([2, 2]) - self.assertTrue("eager_tmp" in egr_tensor13.name) - with self.assertRaisesRegexp( ValueError, "The shape of Parameter should not be None"): eager_param = EagerParamBase(shape=None, dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 4fca8b9f2a118..4140ce44648fa 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -23,12 +23,14 @@ from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from paddle.fluid.tests.unittests.op_test import OpTest, convert_uint16_to_float +from paddle.fluid.framework import _test_eager_guard import paddle class TestGaussianRandomOp(OpTest): def setUp(self): self.op_type = "gaussian_random" + self.python_api = paddle.normal self.set_attrs() self.inputs = {} self.use_mkldnn = False @@ -50,6 +52,10 @@ def set_attrs(self): def test_check_output(self): self.check_output_customized(self.verify_output) + def test_eager(self): + with _test_eager_guard(): + self.test_check_output() + def verify_output(self, outs): self.assertEqual(outs[0].shape, (123, 92)) hist, _ = np.histogram(outs[0], range=(-3, 5)) @@ -70,6 +76,7 @@ def verify_output(self, outs): class TestGaussianRandomBF16Op(OpTest): def setUp(self): self.op_type = "gaussian_random" + self.python_api = paddle.normal self.set_attrs() self.inputs = {} self.use_mkldnn = False @@ -93,6 +100,10 @@ def test_check_output(self): self.check_output_with_place_customized( self.verify_output, place=core.CUDAPlace(0)) + def test_eager(self): + with _test_eager_guard(): + self.test_check_output() + def verify_output(self, outs): outs = convert_uint16_to_float(outs) self.assertEqual(outs[0].shape, (123, 92)) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 91c2800836c9d..3a9387082e680 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -244,7 +244,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - num_ops = 1 + num_ops = 2 if (dtype == "float16" or dtype == "uint16") else 1 
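# [Editor's note] The comment block below was added in editing as an
# illustration; it is not part of this patch or of test_initializer.py.
# The gaussian_random changes above all follow the same three-way dispatch,
# sketched roughly here (names are taken from the patch, details simplified):
#
#     if in_dygraph_mode():                      # new eager mode
#         place = _current_expected_place()
#         out = _C_ops.final_state_gaussian_random(
#             shape, float(mean), float(std), seed, dtype, place)
#     elif _in_legacy_dygraph():                 # old dygraph mode
#         out = _C_ops.gaussian_random('shape', shape, 'dtype', dtype,
#                                      'mean', float(mean), 'std', float(std),
#                                      'seed', seed)
#     else:                                      # static graph
#         block.append_op(type='gaussian_random', outputs={'Out': out}, ...)
#
# For float16/bfloat16 initializers the op runs in float32 and an extra cast op
# writes the result back, which is why num_ops is 2 for those dtypes above.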
self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -685,6 +685,68 @@ def test_uniform_initializer(self, dtype="float32"): self.func_uniform_initializer() +class TestXavierInitializerDygraph(unittest.TestCase): + def func_xvarier_initializer(self, dtype="float32"): + """ + In dygraph mode, we can use initializer directly to initialize a tensor. + """ + paddle.disable_static() + + tensor = paddle.zeros([1024, 1024, 16]) + tensor.stop_gradient = False + + xavier_ = paddle.fluid.initializer.XavierInitializer( + uniform=False, fan_in=3, fan_out=5) + xavier_(tensor) + + hist, _ = output_hist(tensor.numpy()) + + hist2, _ = output_hist( + np.random.normal(0, np.sqrt(2.0 / (3 + 5)), [1024, 1024, 16])) + + self.assertTrue( + np.allclose( + hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + paddle.enable_static() + + def test_xavier_initializer(self, dtype="float32"): + with framework._test_eager_guard(): + self.func_xvarier_initializer() + self.func_xvarier_initializer() + + +class TestMSRAInitializerDygraph(unittest.TestCase): + def func_msra_initializer(self, dtype="float32"): + """ + In dygraph mode, we can use initializer directly to initialize a tensor. + """ + paddle.disable_static() + + tensor = paddle.zeros([1024, 1024, 16]) + tensor.stop_gradient = False + + msra_ = paddle.fluid.initializer.MSRAInitializer( + uniform=False, fan_in=4) + msra_(tensor) + + hist, _ = output_hist(tensor.numpy()) + + hist2, _ = output_hist( + np.random.normal(0, np.sqrt(2.0 / (4)), [1024, 1024, 16])) + + self.assertTrue( + np.allclose( + hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + paddle.enable_static() + + def test_msra_initializer(self, dtype="float32"): + with framework._test_eager_guard(): + self.func_msra_initializer() + self.func_msra_initializer() + + class TesetconsistencyOfDynamicAndStaticGraph(unittest.TestCase): def func_order(self): paddle.set_device('cpu') diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 74686652044ec..9953681e0f5bd 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -400,7 +400,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 1 + num_ops = 2 if dtype in ["float16", "uint16"] else 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 82818d50510c9..3d0617e40d6b6 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -239,7 +239,15 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + place = _current_expected_place() + return _C_ops.final_state_gaussian_random(shape, + float(mean), + float(std), seed, dtype, + place) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index e3d8e8f5f47a5..6b58c84061384 100644 --- 
a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -782,6 +782,18 @@ kernel : func : gather_tree +- api : gaussian_random + args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) + output: Tensor + infer_meta : + func : GaussianRandomInferMeta + param : [shape, mean, std, seed, dtype] + kernel : + func : gaussian_random + param : [shape, mean, std, seed, dtype] + data_type : dtype + backend : place + - api : gelu args : (Tensor x, bool approximate) output : Tensor(out) From ce5e119696084cf8836a182df1b814c2dd80a256 Mon Sep 17 00:00:00 2001 From: Asthestarsfalll <72954905+Asthestarsfalll@users.noreply.github.com> Date: Tue, 12 Apr 2022 20:00:58 +0800 Subject: [PATCH 103/211] =?UTF-8?q?=E3=80=90Hackathon=20No.27=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20frac=20=E6=95=B0?= =?UTF-8?q?=E5=AD=A6=E8=AE=A1=E7=AE=97API=20=20(#41226)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/__init__.py | 2 + .../fluid/tests/unittests/test_frac_api.py | 118 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/math.py | 54 ++++++++ 4 files changed, 176 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_frac_api.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 227cf967642c1..63f16c4eb78f1 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -268,6 +268,7 @@ from .tensor.math import fmin # noqa: F401 from .tensor.math import inner # noqa: F401 from .tensor.math import outer # noqa: F401 +from .tensor.math import frac # noqa: F401 from .tensor.random import bernoulli # noqa: F401 from .tensor.random import poisson # noqa: F401 @@ -606,6 +607,7 @@ 'concat', 'check_shape', 'trunc', + 'frac', 'digamma', 'standard_normal', 'diagonal', diff --git a/python/paddle/fluid/tests/unittests/test_frac_api.py b/python/paddle/fluid/tests/unittests/test_frac_api.py new file mode 100644 index 0000000000000..4ee3096cde78f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_frac_api.py @@ -0,0 +1,118 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
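# [Editor's note] The short example below was added in editing as an
# illustration; it is not part of this patch or of test_frac_api.py.
# paddle.frac, introduced by this series, returns the fractional part
# x - trunc(x) (the sign follows the input) and accepts int32, int64,
# float32 and float64 tensors; other dtypes raise TypeError.
import paddle

x = paddle.to_tensor([1.25, -2.75, 3.0])
y = paddle.frac(x)  # expected values: [0.25, -0.75, 0.0]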
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard + + +def ref_frac(x): + return x - np.trunc(x) + + +class TestFracAPI(unittest.TestCase): + """Test Frac API""" + + def set_dtype(self): + self.dtype = 'float64' + + def setUp(self): + self.set_dtype() + self.x_np = np.random.uniform(-3, 3, [2, 3]).astype(self.dtype) + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_api_static(self): + paddle.enable_static() + with program_guard(Program()): + input = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = paddle.frac(input) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, res)) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.frac(x) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, out.numpy())) + + def test_api_eager(self): + paddle.disable_static(self.place) + with _test_eager_guard(): + x_tensor = paddle.to_tensor(self.x_np) + out = paddle.frac(x_tensor) + out_ref = ref_frac(self.x_np) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_api_dygraph() + + +class TestFracInt32(TestFracAPI): + """Test Frac API with data type int32""" + + def set_dtype(self): + self.dtype = 'int32' + + +class TestFracInt64(TestFracAPI): + """Test Frac API with data type int64""" + + def set_dtype(self): + self.dtype = 'int64' + + +class TestFracFloat32(TestFracAPI): + """Test Frac API with data type float32""" + + def set_dtype(self): + self.dtype = 'float32' + + +class TestFracError(unittest.TestCase): + """Test Frac Error""" + + def setUp(self): + self.x_np = np.random.uniform(-3, 3, [2, 3]).astype('int16') + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_error(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [5, 5], 'bool') + self.assertRaises(TypeError, paddle.frac, x) + + def test_dygraph_error(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np, dtype='int16') + self.assertRaises(TypeError, paddle.frac, x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index fc6c8f106ce4f..3c4647d4d6b68 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -228,6 +228,7 @@ from .math import fmin # noqa: F401 from .math import inner # noqa: F401 from .math import outer # noqa: F401 +from .math import frac # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -454,6 +455,7 @@ 'digamma', 'diagonal', 'trunc', + 'frac', 'bitwise_and', 'bitwise_or', 'bitwise_xor', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 3a2d08af88ff8..cfc9abb86984d 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4385,3 +4385,57 @@ def angle(x, name=None): outputs = {"Out": out} 
helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out + +def frac(x, name=None): + """ + This API is used to return the fractional portion of each element in input. + + Args: + x (Tensor): The input tensor, which data type should be int32, int64, float32, float64. + name: (str, optional): Name for operation (optional, default is None). For more + + Returns: + Tensor: The output Tensor of frac. + + Examples: + .. code-block:: Python + + import paddle + import numpy as np + + input = paddle.rand([3, 3], 'float32') + print(input.numpy()) + # [[ 1.2203873 -1.0035421 -0.35193074] + # [-0.00928353 0.58917075 -0.8407828 ] + # [-1.5131804 0.5850153 -0.17597814]] + + output = paddle.frac(input) + print(output.numpy()) + # [[ 0.22038734 -0.00354207 -0.35193074] + # [-0.00928353 0.58917075 -0.8407828 ] + # [-0.5131804 0.5850153 -0.17597814]] + """ + op_type = 'elementwise_sub' + axis = -1 + act = None + if x.dtype not in [paddle.int32, paddle.int64, paddle.float32, paddle.float64]: + raise TypeError( + "The data type of input must be one of ['int32', 'int64', 'float32', 'float64'], but got {}".format(x.dtype)) + if in_dygraph_mode(): + y = _C_ops.final_state_trunc(x) + return _C_ops.final_state_subtract(x, y) + else: + if _in_legacy_dygraph(): + y = _C_ops.trunc(x) + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + else: + inputs = {"X": x} + attrs = {} + + helper = LayerHelper("trunc", **locals()) + check_variable_and_dtype(x, "X", ['int32', 'int64', 'float32', 'float64'], 'trunc') + y = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="trunc", inputs=inputs, attrs=attrs, outputs={"Out": y}) + return _elementwise_op(LayerHelper(op_type, **locals())) From 78ef10712e93d09e2a0d8b47e259634eff28f19e Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 13 Apr 2022 08:42:25 +0800 Subject: [PATCH 104/211] [Phi&CustomOp] Remove deprecated enum PlaceType for custom op & add warning (#41647) * remove old custom op placetype * replace dist placetype using * add with gpu macro * fix mutable_data error * fix set value error * add comment --- paddle/fluid/distributed/collective/Common.cc | 9 +- .../fluid/distributed/collective/reducer.cc | 7 - paddle/fluid/distributed/collective/reducer.h | 2 - paddle/fluid/framework/custom_operator.cc | 9 +- paddle/fluid/pybind/eager_method.cc | 4 +- paddle/phi/api/all.h | 1 - paddle/phi/api/ext/place.h | 22 -- paddle/phi/api/ext/tensor_compat.h | 6 +- paddle/phi/api/include/tensor.h | 62 ++--- paddle/phi/api/lib/CMakeLists.txt | 8 +- paddle/phi/api/lib/ext_compat_utils.cc | 70 ------ paddle/phi/api/lib/ext_compat_utils.h | 31 --- paddle/phi/api/lib/tensor.cc | 212 +++++++++--------- paddle/phi/api/lib/tensor_method.cc | 31 ++- paddle/phi/common/place.cc | 13 ++ paddle/phi/common/place.h | 26 +++ paddle/phi/core/dense_tensor.cc | 19 +- paddle/phi/tests/api/test_pten_tensor.cc | 3 +- 18 files changed, 236 insertions(+), 299 deletions(-) delete mode 100644 paddle/phi/api/ext/place.h delete mode 100644 paddle/phi/api/lib/ext_compat_utils.cc delete mode 100644 paddle/phi/api/lib/ext_compat_utils.h diff --git a/paddle/fluid/distributed/collective/Common.cc b/paddle/fluid/distributed/collective/Common.cc index 02eab58478ccc..4a883f8196389 100644 --- a/paddle/fluid/distributed/collective/Common.cc +++ b/paddle/fluid/distributed/collective/Common.cc @@ -41,13 +41,14 @@ std::string GetKeyFromPlaces(const std::vector& places) { } static bool CheckTensorsInPlace(const std::vector& tensors, - const 
PlaceType type) { - return std::all_of(tensors.cbegin(), tensors.cend(), - [&](const Tensor& t) { return t.place() == type; }); + phi::AllocationType type) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place().GetType() == type; + }); } bool CheckTensorsInCudaPlace(const std::vector& tensors) { - return CheckTensorsInPlace(tensors, PlaceType::kGPU); + return CheckTensorsInPlace(tensors, phi::AllocationType::GPU); } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 71741515c90d5..02f7f25636410 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -414,20 +414,13 @@ void EagerReducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(phi::DenseTensor()); const auto &dtype = tensor.dtype(); - const auto &place = tensor.place(); const auto &inner_place = tensor.impl()->place(); if (index > 0) { PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, platform::errors::PreconditionNotMet( "Tensor %s has unexpected dtype.", tensor_name)); - PADDLE_ENFORCE_EQ(place, place_, - platform::errors::PreconditionNotMet( - "Tensor %s has different place. Expected place is " - "%s, but actual place is %s", - tensor_name, inner_place_, inner_place)); } else { p_group->dtype_ = dtype; - place_ = place; inner_place_ = inner_place; } } diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index 12c02509884e9..424bae0e5acd1 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -26,7 +26,6 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/utils/string/string_helper.h" @@ -121,7 +120,6 @@ class EagerReducer { std::vector groups_; std::vector variable_locators_; - PlaceType place_; platform::Place inner_place_; size_t next_group_ = 0; int64_t nranks_ = -1; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index df4879735bb82..fbcd920905c9d 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -36,7 +36,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/utils/any.h" @@ -627,8 +626,8 @@ class CustomGradOpMaker static void RegisterOperatorKernelWithPlace( const std::string& name, const OperatorWithKernel::OpKernelFunc& op_kernel_func, - const proto::VarType::Type type, const PlaceType& place) { - OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place)); + const proto::VarType::Type type, const platform::Place& place) { + OpKernelType key(type, place); VLOG(3) << "Custom Operator: op kernel key: " << key; OperatorWithKernel::AllOpKernels()[name][key] = op_kernel_func; } @@ -666,10 +665,10 @@ static void RegisterOperatorKernel(const std::string& name, op_kernel_func = func; } RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW, - PlaceType::kCPU); + platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW, - PlaceType::kGPU); + platform::CUDAPlace()); #endif } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 021899c5f3782..eb7f64a44126c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -921,7 +921,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, "please check the type of tensor.")); } - if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) { + if (!value_tensor_tmp.initialized()) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SetTensorFromPyArray( static_cast(value_tensor_tmp.impl().get()), @@ -1009,7 +1009,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, VLOG(4) << "index is not tensor"; self_numpy[_index] = py::object(py::handle(value_obj), true); } - if (self->tensor.place() == paddle::PlaceType::kUNK) { + if (!self->tensor.initialized()) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SetTensorFromPyArray(self_tensor, self_numpy, platform::Place(platform::CUDAPlace(0)), false); diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index ac8607597a436..5838e7b2eaab7 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -41,5 +41,4 @@ limitations under the License. */ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/place.h b/paddle/phi/api/ext/place.h deleted file mode 100644 index 91d4f41c21351..0000000000000 --- a/paddle/phi/api/ext/place.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle { - -// TODO(yangjiabin): Add other place support in next PR -enum class PlaceType { kUNK = -1, kCPU, kGPU }; - -} // namespace paddle diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index 530275de50ec7..e63390db06e82 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/tensor.h" // Note(chenweihang): In order to be compatible with the original custom @@ -21,5 +22,8 @@ limitations under the License. */ // cannot be includeed in paddle namespace paddle { -using Tensor = paddle::experimental::Tensor; +using Tensor = experimental::Tensor; +// using several Tensor initialize functions in paddle namespace +using experimental::empty; +using experimental::full; } // namespace paddle diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index ad3933e2b2b53..d3efb7ca1c21e 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,7 +29,6 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -#include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" @@ -109,21 +108,23 @@ class PADDLE_API Tensor final { /** * @brief Construct a new Tensor object on the target place. - * This is a deprecated method and may be removed in the future! + * + * This is a deprecated method and may be removed in the future!!! * * @param place */ - explicit Tensor(const PlaceType& place); + explicit Tensor(const Place& place); /** * @brief Construct a new Tensor object on the target place * with specified shape. - * This is a deprecated method and may be removed in the future! + * + * This is a deprecated method and may be removed in the future!!! * * @param place * @param shape */ - Tensor(const PlaceType& place, const std::vector& shape); + Tensor(const Place& place, const std::vector& shape); /** * @brief Construct a new Tensor object by a TensorBase pointer and name @@ -135,8 +136,9 @@ class PADDLE_API Tensor final { /** * @brief Construct a new Tensor object with name * - * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * @note Internal method, used to adapt original execution mechanism and + * debug analysis in the development of new dygraph. It may be removed in + * the future. * */ explicit Tensor(const std::string& name) : name_(name) {} @@ -151,6 +153,7 @@ class PADDLE_API Tensor final { /** * @brief Get the size of current tensor. + * * The compatible method of `Tensor::numel()`. * This is a deprecated method and may be removed in the future! * @@ -167,6 +170,7 @@ class PADDLE_API Tensor final { /** * @brief Return the shape (dimensions) of Tensor. + * * The compatible method of `Tensor::dims()`. * This is a deprecated method and may be removed in the future! * @@ -178,7 +182,7 @@ class PADDLE_API Tensor final { * @brief Reset the shape of the tensor. * @note: This method means Reset the shape of the tensor, * and must be called before calling mutable_data() or - * copy_to(const PlaceType& place), this is not a standard definition of + * copy_to(const Place& place), this is not a standard definition of * reshape behavior, so we will deprecated this feature in the future. 
* * @param shape @@ -194,6 +198,7 @@ class PADDLE_API Tensor final { /** * @brief Return the data type of Tensor. + * * The compatible method of `Tensor::dtype()`. * This is a deprecated method and may be removed in the future! * @@ -246,18 +251,18 @@ class PADDLE_API Tensor final { * @brief Return the place (device) of Tensor. * This is a deprecated method and may be removed in the future! * - * @return PlaceType + * @return Place */ - PlaceType place() const; + Place place() const; /** * @brief Return the place (device) of Tensor. - * Because the `place` method already exists, so we need to use a new name, - * here we temporarily use `inner_place`. * - * @return paddle::platform::Place + * This is a deprecated method and may be removed in the future!!! + * + * @return Place */ - phi::Place inner_place() const; + Place inner_place() const; /** * @brief Determine whether the tensor device is CPU @@ -287,7 +292,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer in CPU or GPU with specific data type. - * It's usually used to get the output data pointer. + * It's usually used to get the output data pointer, same as the T* data(). * * @tparam T * @return T* @@ -297,6 +302,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer in CPU or GPU with specific data type. + * * It's usually used to get the output data pointer. * This is a deprecated method and may be removed in the future! * @@ -305,7 +311,7 @@ class PADDLE_API Tensor final { * @return T* */ template - T* mutable_data(const PlaceType& place); + T* mutable_data(const Place& place); /** * @brief Get the const memory pointer directly. @@ -319,8 +325,7 @@ class PADDLE_API Tensor final { /** * @brief Get the memory pointer directly. - * It's usually used to get the output data pointer. - * This is a deprecated method and may be removed in the future! + * It's usually used to get the mutable output data pointer. * * @tparam T * @return T* @@ -409,7 +414,7 @@ class PADDLE_API Tensor final { * @return Tensor */ template - Tensor copy_to(const PlaceType& target_place) const; + Tensor copy_to(const Place& target_place) const; /** * @brief Transfer the current Tensor to the specified device and return. @@ -427,7 +432,8 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const phi::Place& target_place, bool blocking); + void copy_(const Tensor& src, const Place& target_place, bool blocking); + /** * @brief Cast datatype from one to another * @@ -489,11 +495,17 @@ class PADDLE_API Tensor final { /* Part 8: Autograd methods */ /** - * @brief Get the autograd meta object + * @brief Get the autograd meta object pointer * * @return AbstractAutogradMeta* */ AbstractAutogradMeta* get_autograd_meta() const; + + /** + * @brief Get the shared pointer of autograd meta object + * + * @return std::shared_ptr& + */ const std::shared_ptr& mutable_autograd_meta() const; /** @@ -524,7 +536,7 @@ class PADDLE_API Tensor final { /* Part 10: Auto generated Tensor methods */ - /* Part 11: Methods of converting SparseTensor and DenseTensor to each other + /* Part 11: Methods of converting underlying TensorType to each other */ /** * @brief Convert DenseTensor or SparseCsrTensor to SparseCooTensor @@ -587,12 +599,6 @@ class PADDLE_API Tensor final { * in the development of new dygraph. It may be removed in the future. */ std::string name_{""}; - - /** - * Place type: Return the expected memory location if the Tensor is - * uninitialized. 
- */ - PlaceType place_{PlaceType::kUNK}; }; } // namespace experimental diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 7dfe7d8cf4d20..9cc5d620280bc 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -1,13 +1,11 @@ add_subdirectory(utils) -cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) - if (WITH_GPU) - nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) elseif (WITH_ROCM) - hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) else() - cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) diff --git a/paddle/phi/api/lib/ext_compat_utils.cc b/paddle/phi/api/lib/ext_compat_utils.cc deleted file mode 100644 index 1d0f52b5f0b65..0000000000000 --- a/paddle/phi/api/lib/ext_compat_utils.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/api/lib/ext_compat_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - -namespace paddle { -namespace experimental { - -platform::Place ConvertExtPlaceToInnerPlace(PlaceType p) { - if (p == PlaceType::kCPU) { - return platform::Place(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - } else if (p == PlaceType::kGPU) { - return platform::Place(platform::CUDAPlace(platform::GetCurrentDeviceId())); -#endif - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type code(%d) when " - "casting enum place to paddle place.", - static_cast(p))); - } - return platform::Place(); -} - -PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p) { - if (platform::is_cpu_place(p)) { - return PlaceType::kCPU; - } else if (platform::is_gpu_place(p)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return PlaceType::kGPU; -#endif - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type `%s` when " - "casting paddle place to enum place.", - p)); - } - return PlaceType::kUNK; -} - -Backend ConvertExtPlaceToBackend(PlaceType p) { - switch (p) { - case PlaceType::kCPU: - return Backend::CPU; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case PlaceType::kGPU: - return Backend::GPU; -#endif - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported place type `%s` when " - "casting enum place to backend.", - static_cast(p))); - } -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/ext_compat_utils.h b/paddle/phi/api/lib/ext_compat_utils.h deleted file mode 100644 index 89f6f15b70ff2..0000000000000 --- a/paddle/phi/api/lib/ext_compat_utils.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/api/ext/place.h" -#include "paddle/phi/common/backend.h" - -namespace paddle { -namespace experimental { - -platform::Place ConvertExtPlaceToInnerPlace(PlaceType p); - -PlaceType ConvertInnerPlaceToExtPlace(const platform::Place& p); - -Backend ConvertExtPlaceToBackend(PlaceType p); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index ffc754feaed98..07204b7ffcf61 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -19,46 +19,41 @@ limitations under the License. 
*/ #include #include "glog/logging.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" + #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" -/** - * [ Why still include the fluid headers? ] - * - * We hope to organize the basic implementation of Tensor and the logic related - * to Tensor computation into an independent library, which we call - * [Tensor Operation Library, phi], so we extract or rewrite the original - * Kernels. - * - * In the future, the training library, inference library and custom operators - * will link to this Tensor Operation library. - * - * However, if we directly split the link relation, we need to make too many - * changes, which will affect the stability of the framework, so here we still - * rely on the implementation of the framework, which is a intermediate state. - * - * In the future, the necessary components will be moved to the this library, - * or the corresponding components will be re-implemented. - */ - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/place.h" + #include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" namespace paddle { namespace experimental { +namespace detail { +static Place GetCorrectPlaceByPlaceType(const Place &place_type) { + auto alloc_type = place_type.GetType(); + switch (alloc_type) { + case AllocationType::CPU: + return place_type; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + case AllocationType::GPU: + return phi::Place(AllocationType::GPU, + phi::backends::gpu::GetCurrentDeviceId()); +#endif + default: + PADDLE_THROW(phi::errors::Unavailable( + "The PlaceType is a legacy design, only supports CPU and GPU, " + "and will not support other place types in the future.")); + } +} +} // namespace detail /////// Tensor Methods //////// @@ -71,27 +66,41 @@ Tensor::Tensor(std::shared_ptr tensor_impl) phi::errors::InvalidArgument("TensorImpl with nullptr is not supported")); } -Tensor::Tensor(const PlaceType &place) - : impl_(std::move(std::make_shared( - std::move(phi::make_intrusive( - ConvertExtPlaceToInnerPlace(place))), - std::move(phi::DenseTensorMeta(phi::DataType::UNDEFINED, - phi::make_ddim({}), - phi::DataLayout::NCHW))))), - place_{place} {} - -Tensor::Tensor(const PlaceType &place, const std::vector &shape) - : impl_(std::move(std::make_shared( - std::move(phi::make_intrusive( - ConvertExtPlaceToInnerPlace(place))), - std::move(phi::DenseTensorMeta(phi::DataType::UNDEFINED, - phi::make_ddim(shape), - phi::DataLayout::NCHW))))), - place_{place} {} +Tensor::Tensor(const Place &place) { + LOG(WARNING) << "The Tensor(place) constructor is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. " + "Reason: A legal tensor cannot be constructed only based on " + "the `place`, and datatype, shape, layout, etc. 
is also " + "required."; + DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + impl_ = std::move(std::make_shared( + &alloc, + std::move(phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({}), phi::DataLayout::NCHW)))); +} + +Tensor::Tensor(const Place &place, const std::vector &shape) { + LOG(WARNING) << "The Tensor(place, shape) constructor is deprecated since " + "version 2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. " + "Reason: A legal tensor cannot be constructed only based on " + "the `place` and `shape`, and datatype, layout, etc. is also " + "required."; + DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + impl_ = std::move(std::make_shared( + &alloc, + std::move(phi::DenseTensorMeta(phi::DataType::FLOAT32, + phi::make_ddim({shape}), + phi::DataLayout::NCHW)))); +} Tensor::Tensor(std::shared_ptr tensor_impl, const std::string &name) : impl_(std::move(tensor_impl)), name_(std::move(name)) {} + /* Part 2: Dimension, DataType and DataLayout methods */ int64_t Tensor::numel() const { return impl_->numel(); } @@ -112,14 +121,13 @@ void Tensor::reshape(const std::vector &shape) { LOG(WARNING) << "The function of resetting the shape of the uninitialized " "Tensor of the `reshape` method is deprecated since version " "2.3, and will be removed in version 2.4, please use " - "`paddle::experimental::full` method to create a new Tensor " + "`paddle::empty/full` method to create a new Tensor " "instead. " "reason: `reshape` means changing the tensor shape without " "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - std::dynamic_pointer_cast(impl_)->Resize( - phi::make_ddim(shape)); + static_cast(impl_.get())->Resize(phi::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); @@ -146,15 +154,16 @@ bool Tensor::is_sparse_csr_tensor() const { } /* Part 3: Device and Backend methods */ -PlaceType Tensor::place() const { - if (!impl_->initialized()) { - return place_; - } else { - return ConvertInnerPlaceToExtPlace(impl_->place()); - } +Place Tensor::place() const { + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::place().")); + return impl_->place(); } -paddle::platform::Place Tensor::inner_place() const { +Place Tensor::inner_place() const { PADDLE_ENFORCE_NOT_NULL( impl_, phi::errors::PermissionDenied( @@ -179,9 +188,18 @@ bool Tensor::is_gpu_pinned() const { template T *Tensor::mutable_data() { + LOG(WARNING) << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. 
" + "Reason: When calling `mutable_data` to allocate memory, " + "the place, datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->mutable_data( - ConvertExtPlaceToInnerPlace(place())); + return static_cast(impl_.get()) + ->mutable_data(place()); } return nullptr; } @@ -202,51 +220,43 @@ template PADDLE_API phi::dtype::float16 * Tensor::mutable_data(); template -T *Tensor::mutable_data(const PlaceType &place) { - auto inner_place = ConvertExtPlaceToInnerPlace(place); - if (impl_->initialized()) { - PADDLE_ENFORCE_EQ( - platform::is_same_place(inner_place, impl_->place()), - true, - phi::errors::Unimplemented("Modification of tensor place through " - "mutable_data is not supported now")); - } +T *Tensor::mutable_data(const Place &place) { + LOG(WARNING) << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. " + "Reason: When calling `mutable_data` to allocate memory, " + "the datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->mutable_data( - inner_place); + return static_cast(impl_.get())->mutable_data(place); } return nullptr; } -template PADDLE_API float *Tensor::mutable_data(const PlaceType &place); -template PADDLE_API double *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int64_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int32_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API uint8_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int8_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API int16_t *Tensor::mutable_data( - const PlaceType &place); -template PADDLE_API bool *Tensor::mutable_data(const PlaceType &place); +template PADDLE_API float *Tensor::mutable_data(const Place &place); +template PADDLE_API double *Tensor::mutable_data(const Place &place); +template PADDLE_API int64_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int32_t *Tensor::mutable_data(const Place &place); +template PADDLE_API uint8_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int8_t *Tensor::mutable_data(const Place &place); +template PADDLE_API int16_t *Tensor::mutable_data(const Place &place); +template PADDLE_API bool *Tensor::mutable_data(const Place &place); template PADDLE_API phi::dtype::complex - *Tensor::mutable_data>(const PlaceType &place); + *Tensor::mutable_data>(const Place &place); template PADDLE_API phi::dtype::complex - *Tensor::mutable_data>(const PlaceType &place); + *Tensor::mutable_data>(const Place &place); template PADDLE_API phi::dtype::float16 * -Tensor::mutable_data(const PlaceType &place); +Tensor::mutable_data(const Place &place); template const T *Tensor::data() const { if (is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->data(); - } else if (phi::SelectedRows::classof(impl_.get())) { - return std::dynamic_pointer_cast(impl_) - ->value() - .data(); + return static_cast(impl_.get())->data(); + } else if (is_selected_rows()) { + return static_cast(impl_.get())->value().data(); } return nullptr; } @@ -271,9 +281,9 @@ Tensor::data() const; template T *Tensor::data() { if 
(is_dense_tensor()) { - return std::dynamic_pointer_cast(impl_)->data(); - } else if (phi::SelectedRows::classof(impl_.get())) { - return std::dynamic_pointer_cast(impl_) + return static_cast(impl_.get())->data(); + } else if (is_selected_rows()) { + return static_cast(impl_.get()) ->mutable_value() ->data(); } @@ -299,7 +309,7 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { if (is_dense_tensor()) { return Tensor(std::make_shared( std::move(phi::DenseTensorUtils::Slice( - *(std::dynamic_pointer_cast(impl_).get()), + *(static_cast(impl_.get())), begin_idx, end_idx)))); } else { @@ -331,6 +341,9 @@ bool Tensor::defined() const { return impl_ != nullptr; } bool Tensor::initialized() const { return defined() && impl_->initialized(); } bool Tensor::is_initialized() const { + LOG(WARNING) << "The `is_initialized` method is deprecated since version " + "2.3, and will be removed in version 2.4! " + "Please use `initialized` method instead."; return defined() && impl_->initialized(); } @@ -342,7 +355,6 @@ Tensor &Tensor::operator=(const Tensor &x) & { impl_ = x.impl_; autograd_meta_ = x.autograd_meta_; name_ = x.name_; - place_ = x.place_; return *this; } @@ -350,7 +362,6 @@ Tensor &Tensor::operator=(Tensor &&x) & { impl_ = std::move(x.impl_); autograd_meta_ = std::move(x.autograd_meta_); name_ = std::move(x.name_); - place_ = std::move(x.place_); return *this; } @@ -371,8 +382,7 @@ void Tensor::set_autograd_meta( void Tensor::bump_inplace_version() { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); inplace_version_counter.Bump(); } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -383,8 +393,7 @@ void Tensor::bump_inplace_version() { uint32_t Tensor::current_inplace_version() { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); return inplace_version_counter.CurrentVersion(); } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -397,8 +406,7 @@ void Tensor::reset_inplace_version(bool set_to_zero) { if (set_to_zero) { if (is_dense_tensor()) { auto &inplace_version_counter = - std::dynamic_pointer_cast(impl_) - ->InplaceVersionCounter(); + static_cast(impl_.get())->InplaceVersionCounter(); inplace_version_counter.SetInplaceVersionToZero(); } } diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index c4c77ab93790d..46ca457b2c10a 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" @@ -39,37 +38,37 @@ Tensor Tensor::copy_to(Place place, bool blocking) const { } template -Tensor Tensor::copy_to(const PlaceType &target_place) const { +Tensor Tensor::copy_to(const Place &target_place) const { LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " "2.3, and will be removed in version 2.4, please use " "`copy_to` method without template argument instead. 
" "reason: copying a Tensor to another device does not need " "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false); + return copy_to(target_place, /*blocking=*/false); } template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; +Tensor::copy_to(const Place &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to>(const Place &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to>(const Place &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const Place &target_place) const; void Tensor::copy_(const Tensor &src, const phi::Place &target_place, diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 2b5254d3d5f14..a77042757c7ba 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -109,3 +109,16 @@ uint32_t Place::Hash::operator()(const Place &place) const { } } // namespace phi + +namespace paddle { + +phi::Place PlaceType::kUNK = phi::Place(); +phi::Place PlaceType::kCPU = phi::Place(phi::AllocationType::CPU); +// GPU Place contains device id, here we use default value 0, so it cannot +// use for multi-casd cases, but because it is static variable, it is difficult +// to get the exact device id at all time. +// NOTE: Please DO NOT use this place in the framework!!! +// It only for external compatibility +phi::Place PlaceType::kGPU = phi::Place(phi::AllocationType::GPU); + +} // namespace paddle diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 390684366db71..d43fc497277c5 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -213,4 +213,30 @@ using GPUPinnedPlace = phi::GPUPinnedPlace; using XPUPlace = phi::XPUPlace; using NPUPlace = phi::NPUPlace; } // namespace experimental + +/* NOTE: In order to remove and be compatible with the enumeration type +`PlaceType` of custom operator, we define a temporary type. + +This type cannot add any new type!!! It is only used for compatibility with +historical writing and we will remove this temporary type in the future. +This Type cannot be used in framework! only used for custom operator! 
+ +The historical PlaceType define: + +- enum class PlaceType { kUNK = -1, kCPU, kGPU }; + +The historical PlaceType using: + +- PD_CHECK(x.place() == paddle::PlaceType::kCPU) +- auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + +The new type cannot be used as int value! If you use as int, please modify +the implementation. +*/ +struct PADDLE_API PlaceType { + static phi::Place kUNK; + static phi::Place kCPU; + static phi::Place kGPU; +}; + } // namespace paddle diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 8acdd8b34f7d1..1bfe29bc9d3ba 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -19,7 +19,24 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/compat/convert_utils.h" -// See Note [ Why still include the fluid headers? ] +/** + * [ Why still include the fluid headers? ] + * + * We hope to organize the basic implementation of Tensor and the logic related + * to Tensor computation into an independent library, which we call + * [Tensor Operation Library, phi], so we extract or rewrite the original + * Kernels. + * + * In the future, the training library, inference library and custom operators + * will link to this Tensor Operation library. + * + * However, if we directly split the link relation, we need to make too many + * changes, which will affect the stability of the framework, so here we still + * rely on the implementation of the framework, which is a intermediate state. + * + * In the future, the necessary components will be moved to the this library, + * or the corresponding components will be re-implemented. + */ #include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/tests/api/test_pten_tensor.cc b/paddle/phi/tests/api/test_pten_tensor.cc index 74ed648f3ee6e..590717b8d7b77 100644 --- a/paddle/phi/tests/api/test_pten_tensor.cc +++ b/paddle/phi/tests/api/test_pten_tensor.cc @@ -15,7 +15,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); @@ -201,7 +200,7 @@ void GroupTestDtype() { void TestInitilized() { experimental::Tensor test_tensor(paddle::PlaceType::kCPU, {1, 1}); - CHECK(test_tensor.is_initialized() == false); + CHECK(test_tensor.is_initialized() == true); test_tensor.mutable_data(paddle::PlaceType::kCPU); CHECK(test_tensor.is_initialized() == true); float* tensor_data = test_tensor.mutable_data(); From d84934da23b533e1cb59095d16aa033471daa957 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Wed, 13 Apr 2022 09:05:39 +0800 Subject: [PATCH 105/211] [CustomDevice] move member variable to dense_tensor.h (#41702) --- paddle/phi/core/dense_tensor.h | 19 +++++++++++++++++++ paddle/phi/core/dense_tensor.inl | 11 ----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index a740a9a5725d9..ef91319e1c961 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -190,6 +190,25 @@ class DenseTensor : public TensorBase, std::shared_ptr inplace_version_counter_{ std::make_shared()}; +/* @jim19930609: This is a hack +In general, it is badly designed to fuse MKLDNN-specific objects into a +generic Tensor. +We temporarily leave them here to unblock Tensor Unification progress. 
+In the final state, we should come up with a MKLDNN_Tensor and move the +following codes there. +*/ +#ifdef PADDLE_WITH_MKLDNN + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; +#endif + #ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" #endif diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 804360ea941ba..c6ca3c00cb558 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -133,17 +133,6 @@ inline void set_format(const dnnl::memory::format_tag format) { format_ = format; } -protected: -/** - * @brief the detail format of memory block which have layout as kMKLDNN - * - * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. - */ - -dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; #endif /* ------------------------------ */ From acd08a9b4dce03ebe9cedfbe8c98c823799feeea Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 13 Apr 2022 09:50:21 +0800 Subject: [PATCH 106/211] Add kernel sparse_mask_helper; sparse_coo_tensor_grad (#41586) --- .../phi/kernels/funcs/sparse/common_shape.h | 39 ++++ .../kernels/sparse/cpu/sparse_mask_kernel.cc | 101 +++++++++-- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 12 ++ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 166 +++++++++++++++++- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 12 ++ .../phi/kernels/sparse/sparse_mask_kernel.h | 6 + .../sparse/sparse_utils_grad_kernel.cc | 25 +++ .../kernels/sparse/sparse_utils_grad_kernel.h | 9 + .../phi/kernels/sparse/sparse_utils_kernel.h | 12 ++ .../tests/unittests/test_sparse_utils_op.py | 64 ++++++- python/paddle/sparse/creation.py | 36 +++- python/paddle/utils/code_gen/sparse_api.yaml | 8 + .../paddle/utils/code_gen/sparse_bw_api.yaml | 7 + 13 files changed, 476 insertions(+), 21 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index 3617e3cd2f406..e4c836d116252 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -40,6 +40,45 @@ inline const DDim InferDenseDims(const DDim& x_dims, return values_dims; } +template +inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int start, + const int stride, + IntT* out) { + for (int i = start; i < non_zero_num; i += stride) { + out[i] = + IndicesToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +// 1. 
indices.dims().size() == 2 +template +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, + std::vector* offsets) { + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + (*offsets)[i] = offset; + offset *= dims[i]; + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 0a5e145312e0e..a07a7fb2ecf44 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/api/ext/dispatch.h" @@ -38,12 +39,6 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - std::vector sparse_offsets(sparse_dim); - int64_t offset = 1; - for (int i = sparse_dim - 1; i >= 0; i--) { - sparse_offsets[i] = offset; - offset *= dims[i]; - } DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); DenseTensor out_values = phi::EmptyLike(dev_ctx, values); @@ -51,21 +46,25 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, // the out_indices is same as indices of mask phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - const IntT* indices_ptr = indices.data(); T* out_values_ptr = out_values.data(); const T* x_ptr = x.data(); const int64_t non_zero_num = mask.nnz(); auto dims_2d = flatten_to_2d(dims, sparse_dim); const int cols = dims_2d[1]; + const IntT* indices_ptr = indices.data(); + + std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, &sparse_offsets); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = 0; - for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j]; - } + int64_t index = phi::funcs::sparse::IndicesToIndex( + indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } + out->SetMember(out_indices, out_values, dims, true); } @@ -85,6 +84,73 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +template +void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), + mask_indexs(mask_indices.dims()[1]); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, &sparse_offsets); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + phi::funcs::sparse::FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + mask_indexs.data()); + + std::unordered_map x_indexs_map; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + x_indexs_map[x_indexs[i]] = i; + } + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + T* out_ptr = out->data(); + 
memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + const T* in_ptr = x.non_zero_elements().data(); + // TODO(zhangkaihuo): multithreading can be used for acceleration + for (uint64_t i = 0; i < mask_indexs.size(); i++) { + auto iter = x_indexs_map.find(mask_indexs[i]); + if (iter != x_indexs_map.end()) { + memcpy(out_ptr + i * stride, + in_ptr + iter->second * stride, + stride * sizeof(T)); + } + } +} + +/** + * @brief filter values from x.values() using mask_indices + */ +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { + SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -101,3 +167,16 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index acc834269663d..0499371a4dd17 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -394,3 +394,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index d206d6bbc195c..96ab56697b9b0 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/ddim.h" @@ -20,6 +22,7 @@ limitations under the License. 
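Note: the mask-helper kernel above works by flattening every COO index into a single row-major key (CalcOffsetsPerDim / IndicesToIndex), putting the keys of x into a hash map, and then gathering one value row per mask index, leaving unmatched rows at zero. A rough NumPy sketch of that logic (all names here are illustrative only, not part of the patch):

    import numpy as np

    def calc_offsets(dims, sparse_dim):
        # row-major strides over the first sparse_dim dims, e.g. dims=(3, 4) -> [4, 1]
        offsets = np.ones(sparse_dim, dtype=np.int64)
        for i in range(sparse_dim - 2, -1, -1):
            offsets[i] = offsets[i + 1] * dims[i + 1]
        return offsets

    def sparse_mask_helper(x_indices, x_values, mask_indices, dims):
        sparse_dim = x_indices.shape[0]
        offsets = calc_offsets(dims, sparse_dim)
        x_keys = offsets @ x_indices          # flatten: sum_j indices[j, i] * offsets[j]
        mask_keys = offsets @ mask_indices
        pos = {int(k): i for i, k in enumerate(x_keys)}   # CPU kernel: std::unordered_map
        out = np.zeros((mask_indices.shape[1],) + x_values.shape[1:], x_values.dtype)
        for i, k in enumerate(mask_keys):
            if int(k) in pos:                 # mask entries absent from x stay zero
                out[i] = x_values[pos[int(k)]]
        return out

The GPU variant further below computes the same flattened keys on device and replaces the hash lookup with thrust::lower_bound, i.e. a binary search, which presumes the flattened x keys are already ordered, as they are for coalesced COO indices.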
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" @@ -59,7 +62,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - DenseTensor sparse_offsets = phi::Empty( + DenseTensor sparse_offsets = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); std::vector h_sparse_offsets(sparse_dim); @@ -121,6 +124,153 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); +} + +template +__global__ void SparseMaskCopyKernel(const IntT* x_indexs, + const IntT* mask_indexs, + const IntT* bound_out, + const T* x_values, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + const IntT j = bound_out[i]; + if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { + for (int k = 0; k < stride; k++) { + out_values[i * stride + k] = x_values[j * stride + k]; + } + } + } +} + +template +void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); + + std::vector sparse_offsets(sparse_dim); + + DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indexs_meta( + indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); + DenseTensorMeta sparse_offset_meta( + indices_dtype, {sparse_dim}, DataLayout::NCHW); + + DenseTensor x_indexs = + phi::Empty(dev_ctx, std::move(x_indexs_meta)); + DenseTensor mask_indexs = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor bound_out = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + IntT* x_indexs_ptr = x_indexs.data(); + IntT* mask_indexs_ptr = mask_indexs.data(); + IntT* bound_out_ptr = bound_out.data(); + + // 1. calc the offsets of per dim + phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); + // 2. copy sparse_offsets to device + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 3. 
flatten x indices and mask indices + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + FlattenIndicesKernel<<>>(x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + FlattenIndicesKernel<<>>(mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); +// 4. call thrust::lower_bound +#ifdef PADDLE_WITH_HIP + thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), +#endif + x_indexs_ptr, + x_indexs_ptr + x_indexs.numel(), + mask_indexs_ptr, + mask_indexs_ptr + mask_indexs.numel(), + bound_out_ptr); + + // 5. copy value to out + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + T* out_ptr = out->data(); + + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + + SparseMaskCopyKernel<<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); +} + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { + SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -138,3 +288,17 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1109baf92e302..0b6ac1aed0147 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -665,3 +665,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/sparse_mask_kernel.h index 210412abd8620..88899e3dc672e 100644 --- a/paddle/phi/kernels/sparse/sparse_mask_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_mask_kernel.h @@ -26,5 +26,11 @@ void SparseMaskKernel(const Context& dev_ctx, const SparseCooTensor& mask, SparseCooTensor* out); +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out); + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 35329807e7798..15d78692f4f35 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -66,6 +66,19 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, 
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(coo_values_grad, GPU, @@ -95,4 +108,16 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} #endif diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index 0775582bf1fb8..a00b9c275c292 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { @@ -32,5 +33,13 @@ void SparseCooToDenseGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, SparseCooTensor* x_grad); +template +void SparseCooTensorGradKernel(const Context& dev_ctx, + const DenseTensor& indices, + const SparseCooTensor& out_grad, + DenseTensor* values_grad) { + SparseMaskHelperKernel(dev_ctx, out_grad, indices, values_grad); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 961cd9f829eb2..8cf9c0a28648a 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -15,6 +15,7 @@ limitations under the License. 
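Note: SparseCooTensorGradKernel above is simply SparseMaskHelperKernel applied to out_grad, so the gradient with respect to values keeps, for each forward index, the matching out_grad value and zero where out_grad has no entry at that index. A small NumPy mock-up of that rule (the flatten helper and names are illustrative, not from the patch), using the same numbers as the unit test added below:

    import numpy as np

    fwd_idx = np.array([[0, 1], [0, 1]])        # forward nonzeros at (0, 0) and (1, 1)
    g_idx   = np.array([[0, 1], [1, 1]])        # out_grad nonzeros at (0, 1) and (1, 1)
    g_vals  = np.array([2., 3.])
    key     = lambda ij: int(ij[0]) * 2 + int(ij[1])   # flatten for a [2, 2] dense shape
    lookup  = {key(g_idx[:, i]): g_vals[i] for i in range(g_idx.shape[1])}
    values_grad = np.array([lookup.get(key(fwd_idx[:, i]), 0.)
                            for i in range(fwd_idx.shape[1])])
    print(values_grad)                          # [0. 3.]  (what the test asserts)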
*/ #pragma once #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" @@ -147,5 +148,16 @@ void CsrValuesKernel(const Context& dev_ctx, *out = x.non_zero_elements(); } +template +void SparseCooTensorKernel(const Context& dev_ctx, + const DenseTensor& values, + const DenseTensor& indices, + const IntArray& dense_shape, + SparseCooTensor* out) { + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); + // TODO(zhangkaihuo): sort and merge the dumplicate indices +} + } // namespace sparse } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 04488ac58c5fb..89cfc711910ce 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -134,9 +134,11 @@ def test_to_sparse_coo(self): #test to_sparse_coo_grad backward out_grad_indices = [[0, 1], [0, 1]] out_grad_values = [2.0, 3.0] - out_grad = core.eager.sparse_coo_tensor( + out_grad = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(out_grad_indices), - paddle.to_tensor(out_grad_values), out.shape, True) + paddle.to_tensor(out_grad_values), + shape=out.shape, + stop_gradient=True) out.backward(out_grad) assert np.array_equal(dense_x.grad.numpy(), out_grad.to_dense().numpy()) @@ -145,9 +147,11 @@ def test_coo_to_dense(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) dense_tensor = sparse_x.to_dense() #test to_dense_grad backward out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], @@ -158,6 +162,17 @@ def test_coo_to_dense(self): assert np.array_equal(correct_x_grad, sparse_x.grad.values().numpy()) + paddle.device.set_device("cpu") + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + paddle.to_tensor(indices), + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) + dense_tensor_cpu = sparse_x_cpu.to_dense() + dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) + assert np.array_equal(correct_x_grad, + sparse_x_cpu.grad.values().numpy()) + def test_to_sparse_csr(self): with _test_eager_guard(): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] @@ -177,15 +192,52 @@ def test_coo_values_grad(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) values_tensor = sparse_x.values() out_grad = [2.0, 3.0, 5.0, 8.0, 9.0] # test coo_values_grad values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) + def test_sparse_coo_tensor_grad(self): + with _test_eager_guard(): + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + 
grad_indices = [[0, 1], [1, 1]] + grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + sparse_x.backward(sparse_out_grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, values.grad.numpy()) + + place = core.CPUPlace() + indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) + values_cpu = paddle.to_tensor( + values, dtype='float32', place=place, stop_gradient=False) + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + indices_cpu, + values_cpu, + shape=[2, 2], + place=place, + stop_gradient=False) + + sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2], place=place) + sparse_x_cpu.backward(sparse_out_grad_cpu) + assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index e29351e3d179c..ac9276f3142c0 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -14,6 +14,7 @@ from paddle import _C_ops from ..framework import core, dygraph_only +from ..framework import _current_expected_place, _get_paddle_place from ..tensor import to_tensor from ..tensor import max from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype @@ -38,6 +39,18 @@ def _infer_dense_shape(indices): return list(lens.numpy()) +def _get_place(place): + place = _get_paddle_place(place) + if place is None: + place = _current_expected_place() + elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, + core.CUDAPlace)): + raise ValueError( + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace" + ) + return place + + @dygraph_only def sparse_coo_tensor(indices, values, @@ -94,6 +107,8 @@ def sparse_coo_tensor(indices, # values=[1., 2., 3.]) """ + place = _get_place(place) + if not isinstance(indices, core.eager.Tensor): indices = to_tensor( indices, dtype=None, place=place, stop_gradient=True) @@ -101,13 +116,20 @@ def sparse_coo_tensor(indices, values = to_tensor(values, dtype, place, stop_gradient) if len(indices.shape) != 2: raise ValueError("'indices' must be 2-D.") - if place is not None: + + if not indices.place._equals(place): indices = indices._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient + if shape is None: shape = _infer_dense_shape(indices) - return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient) + + return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, + shape) #TODO: need to support shape is None @@ -171,6 +193,9 @@ def sparse_csr_tensor(crows, # cols=[1, 3, 2, 0, 1], # values=[1, 2, 3, 4, 5]) """ + + place = _get_place(place) + if not isinstance(crows, core.eager.Tensor): crows = to_tensor(crows, dtype=None, place=place, stop_gradient=True) if not isinstance(cols, core.eager.Tensor): @@ -182,10 +207,15 @@ def sparse_csr_tensor(crows, "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D." 
) - if place is not None: + if not crows.place._equals(place): crows = crows._copy_to(place, False) + + if not cols.place._equals(place): cols = cols._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient return core.eager.sparse_csr_tensor(crows, cols, values, shape, stop_gradient) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 7bdd77e27bcef..2187d4abb2d63 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -21,6 +21,14 @@ layout : x backward : coo_values_grad +- api : create_sparse_coo_tensor + args : (Tensor values, Tensor indices, IntArray dense_shape) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_tensor + layout : values + backward : create_sparse_coo_tensor_grad + - api : csr_values args : (Tensor x) output : Tensor(out@DenseTensor) diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 800145b06e0b6..e3946cbf72bc2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -19,6 +19,13 @@ kernel : func : coo_values_grad +- backward_api : create_sparse_coo_tensor_grad + forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out@SparseCooTensor) + args : (Tensor indices, Tensor out_grad) + output : Tensor(values_grad@DenseTensor) + kernel : + func : sparse_coo_tensor_grad + - backward_api : dense_to_coo_grad forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out@SparseCooTensor) args : (Tensor out_grad) From 45e43dfe36dbb43394b6b72f5959d2a0cfa4ba90 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 13 Apr 2022 09:54:49 +0800 Subject: [PATCH 107/211] [Eager]Fix final_state_frobenius_norm BUG with axis==None (#41693) --- python/paddle/tensor/linalg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2fcf9ff4213d4..b315e3e9673fc 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -347,7 +347,8 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None): if in_dygraph_mode(): if dim is None: - return _C_ops.final_state_frobenius_norm(input, keepdim, True) + return _C_ops.final_state_frobenius_norm(input, [], keepdim, + True) return _C_ops.final_state_frobenius_norm(input, dim, keepdim, False) if _in_legacy_dygraph(): if dim is None: From 6a486ec2090476311e8e3d55abfdc927cdb8cf95 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Wed, 13 Apr 2022 10:09:05 +0800 Subject: [PATCH 108/211] update (#41636) --- python/paddle/distributed/collective.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index fbad470cb3f13..d2bed171aa27a 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -849,7 +849,9 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group - out = paddle.concat(tensor_list) + tensor_shape = list(tensor.shape) + tensor_shape[0] *= group.nranks + out = paddle.empty(tensor_shape, tensor.dtype) task = group.process_group.all_gather(tensor, out) task.wait() tensor_list.clear() From 
a4d4c1161745935dc341d3a2b3ce66fc624ca7ce Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 13 Apr 2022 10:09:51 +0800 Subject: [PATCH 109/211] Update sign op xpu (#41685) * update sign op on xpu. test=kunlun * fix typo. test=kunlun --- cmake/external/xpu.cmake | 46 +++++++---- paddle/fluid/operators/sign_op_xpu.cc | 8 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + .../tests/unittests/xpu/test_sign_op_xpu.py | 82 +++++++++++++------ 4 files changed, 93 insertions(+), 44 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 90cb686700ef2..d453e9d2a2acd 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,46 +7,62 @@ SET(XPU_PROJECT "extern_xpu") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") +if(NOT DEFINED XPU_BASE_URL) + SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") +else() + SET(XPU_BASE_URL "${XPU_BASE_URL}") +endif() + +# ubuntu and centos: use output by XDNN API team +if(NOT DEFINED XPU_XDNN_BASE_URL) + SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220412") +else() + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") +endif() + IF(WITH_AARCH64) SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") + SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_SUNWAY) SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") + SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_BDCENTOS) SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_UBUNTU) SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_CENTOS) SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") - -ELSE () + # ubuntu and centos: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +ELSE() SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + # default: use output by XDNN API team + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() -if(NOT DEFINED XPU_BASE_URL) - SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") -else() - SET(XPU_BASE_URL "${XPU_BASE_URL}") -endif() - SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE 
STRING "" FORCE) -SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) -SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") +SET(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}") SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc index 22934cf482159..a00aa4bb7ce51 100644 --- a/paddle/fluid/operators/sign_op_xpu.cc +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { @@ -26,10 +27,9 @@ class SignXPUKernel : public framework::OpKernel { auto* in = context.Input("X"); out->mutable_data(in->place()); auto xpu_context = context.device_context().x_context(); - int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, - in->numel(), in->data(), out->data()); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU sign kernel error!")); + // int sign(Context* ctx, const T* x , T* y, int len); + int r = xpu::sign(xpu_context, in->data(), out->data(), in->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); } }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 08a7f08006957..6527371059806 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -316,6 +316,7 @@ XPUOpMap& get_kl2_ops() { {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py index ab07221a07071..9254a84ec4217 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py @@ -18,37 +18,69 @@ import numpy as np import sys sys.path.append("..") -from op_test import OpTest -import paddle -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard + import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUSignOp(OpTest): - def setUp(self): - self.op_type = "sign" - self.dtype = np.float32 - self.inputs = { - 'X': np.random.uniform(-10, 10, (10, 10)).astype(self.dtype) - } - self.outputs = {'Out': np.sign(self.inputs['X'])} - self.attrs = {'use_xpu': True} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - 
self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') +class XPUTestSignOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sign' + self.use_dynamic_create_class = False + + class TestSignOPBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.set_case() + + def set_case(self): + self.op_type = 'sign' + self.dtype = self.in_type + self.init_config() + self.x = np.random.uniform(-10, 10, + self.input_shape).astype(self.dtype) + self.inputs = {'X': self.x} + self.outputs = {'Out': np.sign(self.x)} + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_config(self): + self.input_shape = [864] + + class XPUTestSign1(TestSignOPBase): + def init_config(self): + self.input_shape = [2, 768] + + class XPUTestSign2(TestSignOPBase): + def init_config(self): + self.input_shape = [3, 8, 4096] + + class XPUTestSign3(TestSignOPBase): + def init_config(self): + self.input_shape = [1024] + + class XPUTestSign4(TestSignOPBase): + def init_config(self): + self.input_shape = [2, 2, 255] + +support_types = get_xpu_op_support_types('sign') +for stype in support_types: + create_test_class(globals(), XPUTestSignOP, stype) if __name__ == "__main__": unittest.main() From 8cbf79a3fc4db601b1f7fbdebc70bb2a115d0411 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 13 Apr 2022 10:13:23 +0800 Subject: [PATCH 110/211] [Yaml]Add adam yaml (#41561) * add adam yaml * add adam final_state api * add adam_impl --- paddle/phi/api/lib/api_custom_impl.cc | 181 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 18 ++ .../fluid/tests/unittests/test_adam_op.py | 13 ++ .../fluid/tests/unittests/test_optimizer.py | 6 + python/paddle/optimizer/adam.py | 18 +- python/paddle/utils/code_gen/api.yaml | 6 + 6 files changed, 241 insertions(+), 1 deletion(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 0f1cbc3f1910e..d7f148fff818b 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -33,6 +33,187 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { +std::tuple adam_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(param); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + std::string kernel_name = "adam"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_param = PrepareData(param, kernel.InputAt(0), {}); + auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); + auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {}); + auto input_moment1 = PrepareData(moment1, kernel.InputAt(3), {}); + auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); + auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); + auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); + paddle::optional input_master_param(paddle::none); + auto input_master_param_ptr = + PrepareData(master_param, kernel.InputAt(7), {}); + paddle::optional input_skip_update(paddle::none); + auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + + std::tuple api_output; + auto kernel_out_0 = input_param.get(); + auto kernel_out_1 = input_moment1.get(); + auto kernel_out_2 = input_moment2.get(); + auto kernel_out_3 = input_beta1_pow.get(); + auto kernel_out_4 = input_beta2_pow.get(); + phi::DenseTensor* kernel_out_5 = nullptr; + if (input_master_param_ptr) { + input_master_param = + paddle::make_optional(*input_master_param_ptr); + kernel_out_5 = + paddle::make_optional(*input_master_param_ptr) + .get_ptr(); + } + + if (input_skip_update_ptr) { + input_skip_update = + paddle::make_optional(*input_skip_update_ptr); + } + + paddle::optional input_meta_ref_master_param( + paddle::none); + phi::DenseTensor dt; + phi::MetaTensor input_meta_tmp_master_param(dt); + if (input_master_param_ptr) { + input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); + input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); + input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); + input_meta_ref_master_param = input_meta_tmp_master_param; + } + + paddle::optional input_meta_ref_skip_update( + paddle::none); + phi::DenseTensor dt1; + phi::MetaTensor input_meta_tmp_skip_update(dt1); + 
if (input_skip_update_ptr) { + input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); + input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); + input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); + input_meta_ref_skip_update = input_meta_tmp_skip_update; + } + + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + phi::MetaTensor meta_out_2(kernel_out_2); + phi::MetaTensor meta_out_3(kernel_out_3); + phi::MetaTensor meta_out_4(kernel_out_4); + phi::MetaTensor meta_out_5(kernel_out_5); + + phi::AdamInferMeta(MakeMetaTensor(*input_param), + MakeMetaTensor(*input_grad), + MakeMetaTensor(*input_lr), + MakeMetaTensor(*input_moment1), + MakeMetaTensor(*input_moment2), + MakeMetaTensor(*input_beta1_pow), + MakeMetaTensor(*input_beta2_pow), + input_meta_ref_master_param, + input_meta_ref_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + &meta_out_0, + &meta_out_1, + &meta_out_2, + &meta_out_3, + &meta_out_4, + &meta_out_5); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + paddle::optional, + const Scalar&, + const Scalar&, + const Scalar&, + bool, + int64_t, + bool, + bool, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, + *input_param, + *input_grad, + *input_lr, + *input_moment1, + *input_moment2, + *input_beta1_pow, + *input_beta2_pow, + input_master_param, + input_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + kernel_out_0, + kernel_out_1, + kernel_out_2, + kernel_out_3, + kernel_out_4, + kernel_out_5); + + return api_output; +} + ////////////////// Forward api impls ////////////////////// Tensor conv2d_impl(const Tensor& input, diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 0d1ba3e98e53e..5d46ed691816b 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -30,6 +30,24 @@ namespace experimental { ////////////////// Forward api impls ////////////////////// +std::tuple adam_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow); + std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d05c9a3c313bb..d254cd286e666 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -21,6 +21,7 @@ from paddle.fluid.op import Operator import paddle.fluid as fluid import paddle +from paddle.fluid.framework import _test_eager_guard class TestAdamOp1(OpTest): @@ -189,6 +190,10 @@ def test_check_output(self): 
self.inputs['Grad'] = np.random.uniform( -1, 1, (102, 105)).astype("float32") + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_check_output() + def adam_step(inputs, attributes): ''' @@ -732,6 +737,14 @@ def test_adam_op_with_sparse_input_and_weight_decay(self): adam.step() paddle.enable_static() + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_adam_op_dygraph() + self.test_adam_op_with_state_dict() + self.test_adam_with_grad_clip() + self.test_adam_op_with_set_lr() + self.test_adam_op_with_sparse_input_and_weight_decay() + class TestAdamOptimizer(unittest.TestCase): def _test(self, diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index e8820d5a8708e..ba1e9be815de6 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -24,6 +24,7 @@ import numpy as np from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_ +from paddle.fluid.framework import _test_eager_guard import paddle from paddle.io import Dataset import numpy @@ -1114,6 +1115,11 @@ def test_float64(self): def test_float32(self): self.check_with_dtype('float32') + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_float64() + self.test_float32() + class TestMasterWeightSaveForFP16(unittest.TestCase): ''' diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index aae3d97a79521..de09193ac798e 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -336,7 +336,23 @@ def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) # create the adam optimize op - if framework._non_static_mode(): + if framework.in_dygraph_mode(): + found_inf = self._get_auxiliary_var('found_inf') + + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + + _, _, _, _, _, _ = _C_ops.final_state_adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, + _beta2, self._epsilon, self._lazy_mode, 1000, find_master, + False) + + return None + + if framework._in_legacy_dygraph(): _beta1 = self._beta1 if not isinstance( self._beta1, Variable) else self._beta1.numpy().item(0) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6b58c84061384..08028ba17185c 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -45,6 +45,12 @@ kernel : func : adadelta +- api : adam + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) + output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) + optional : master_param, skip_update + invoke : adam_impl(param, grad, learning_rate, moment1, moment2, beta1_pow, beta2_pow, master_param, skip_update, beta1, beta2, epsilon, lazy_mode, min_row_size_to_use_multithread, multi_precision, use_global_beta_pow) + - 
api : adamax args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) From 325e5712b1865aaf068f0d2a67a606d2beabe609 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 13 Apr 2022 10:16:57 +0800 Subject: [PATCH 111/211] [Phi]fix split error when sections has 0 size and add test case (#41708) * fix split error when sections has 0 size and add test case * fix test case --- paddle/fluid/operators/strided_memcpy.h | 2 +- .../fluid/tests/unittests/test_split_op.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index af29aac6b9052..90cf4128aae94 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -134,7 +134,7 @@ inline void StridedMemcpyWithAxis0( for (size_t i = 0; i < outputs->size(); ++i) { auto out_stride = stride_numel(shape_refer[i]->dims()); auto out = outputs->at(i); - if (out != nullptr) { + if (out != nullptr && out->initialized()) { StridedNumelCopyWithAxis(dev_ctx, axis, out->data(), out_stride, input.data() + input_offset, in_stride, out_stride[axis]); diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index aac904dc2e15d..c826a0e1030f4 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -459,5 +459,24 @@ def test_axis_tensor_input(self): self.assertTrue(np.allclose(ex_x2, x2_out)) +class API_TestEmptySplit(unittest.TestCase): + def test_axis_input_empty_section(self): + with fluid.dygraph.guard(): + input_1 = np.random.random([8, 6, 6]).astype("float32") + # input is a variable which shape is [8, 6, 6] + input = paddle.to_tensor(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=[5, 0, 3]) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, [ + 5, + 5, + ]) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + if __name__ == '__main__': unittest.main() From 468c1ad751433c0dc2b75063981b73be910e64e3 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 13 Apr 2022 10:20:58 +0800 Subject: [PATCH 112/211] support bce_loss and bce_loss_grad in XPU, test=kunlun (#41610) --- paddle/fluid/operators/bce_loss_op_xpu.cc | 70 +++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 3 + .../unittests/xpu/test_bce_loss_op_xpu.py | 76 +++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 paddle/fluid/operators/bce_loss_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py diff --git a/paddle/fluid/operators/bce_loss_op_xpu.cc b/paddle/fluid/operators/bce_loss_op_xpu.cc new file mode 100644 index 0000000000000..8ec80efceb9ec --- /dev/null +++ b/paddle/fluid/operators/bce_loss_op_xpu.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class XPUBCELossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* labels = context.Input("Label"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto x_numel = x->numel(); + auto& dev_ctx = context.template device_context(); + int r = xpu::bce_loss(dev_ctx.x_context(), x->data(), + labels->data(), out->data(), x_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bce_loss"); + } +}; + +template +class XPUBCELossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* labels = context.Input("Label"); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + + auto x_numel = x->numel(); + auto& dev_ctx = context.template device_context(); + int r = xpu::bce_loss_grad(dev_ctx.x_context(), x->data(), + labels->data(), dout->data(), + dx->data(), x_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bce_loss_grad"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + bce_loss, ops::XPUBCELossKernel); +REGISTER_OP_XPU_KERNEL( + bce_loss_grad, + ops::XPUBCELossGradKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 6527371059806..3a047b8fce703 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -43,6 +43,9 @@ XPUOpMap& get_kl2_ops() { {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bce_loss_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bce_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2_grad", diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py new file mode 100644 index 0000000000000..a8173f054a133 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py @@ -0,0 +1,76 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def bce_loss(input, label): + return -1 * (label * np.log(input) + (1. - label) * np.log(1. - input)) + + +class XPUTestBceLossOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'bce_loss' + self.use_dynamic_create_class = False + + class TestBceLossOp(XPUOpTest): + def setUp(self): + self.op_type = "bce_loss" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.init_test_case() + input_np = np.random.uniform(0.1, 0.8, + self.shape).astype(self.dtype) + label_np = np.random.randint(0, 2, self.shape).astype(self.dtype) + output_np = bce_loss(input_np, label_np) + + self.inputs = {'X': input_np, 'Label': label_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.shape = [10, 10] + + class TestBceLossOpCase1(TestBceLossOp): + def init_test_cast(self): + self.shape = [2, 3, 4, 5] + + class TestBceLossOpCase2(TestBceLossOp): + def init_test_cast(self): + self.shape = [2, 3, 20] + + +support_types = get_xpu_op_support_types('bce_loss') +for stype in support_types: + create_test_class(globals(), XPUTestBceLossOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From c4d5a77fec998ea21870d6479a0584daccf4aa0e Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 13 Apr 2022 10:21:21 +0800 Subject: [PATCH 113/211] concat and relu sopport FP16 in XPU, test=kunlun (#41631) --- paddle/fluid/operators/activation_op_xpu.cc | 8 ++++- paddle/fluid/operators/concat_op_xpu.cc | 31 +++++++++++++------ .../fluid/platform/device/xpu/xpu2_op_list.h | 12 ++++--- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 4c2d3fc162f83..e950f952c24e6 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -490,7 +490,6 @@ REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor, XPULeakyReluGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor, XPUReciprocalGradFunctor) -REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, XPUSigmoidGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) @@ -500,6 +499,13 @@ REGISTER_ACTIVATION_XPU_KERNEL(softplus, XPUSoftPlusFunctor, REGISTER_ACTIVATION_XPU_KERNEL(swish, XPUSwishFunctor, XPUSwishGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(pow, XPUPowFunctor, XPUPowGradFunctor) +REGISTER_OP_XPU_KERNEL( + relu, ops::XPUActivationKernel>, + ops::XPUActivationKernel>); 
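Note: with the float16 registrations here and the op-list entries further below, half-precision tensors can reach the XPU relu/concat kernels directly. An illustrative dygraph snippet (not part of the patch or its tests; it assumes a KL2 XPU device is available):

    import paddle

    paddle.set_device('xpu')
    x = paddle.rand([2, 3]).astype('float16')
    y = paddle.nn.functional.relu(x)        # dispatches to the new FP16 relu kernel
    z = paddle.concat([x, y], axis=1)       # FP16 concat via the XPUType cast
    print(z.dtype)                          # paddle.float16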
+REGISTER_OP_XPU_KERNEL( + relu_grad, ops::XPUActivationGradKernel>, + ops::XPUActivationGradKernel< + ops::XPUReluGradFunctor>); REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, ops::XPUActivationKernel>); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index e4b0b0ee2e3b2..ba35098bbac10 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -26,6 +26,8 @@ using Tensor = framework::Tensor; template class ConcatXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); @@ -79,10 +81,10 @@ class ConcatXPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); out->mutable_data(place); std::vector> xdims_list; - std::vector ptrs; + std::vector ptrs; for (unsigned int i = 0; i < ins.size(); ++i) { if (ins[i] && ins[i]->numel() > 0) { - ptrs.push_back(ins[i]->data()); + ptrs.push_back(reinterpret_cast(ins[i]->data())); int size = ins[i]->dims().size(); std::vector tmp_dims(size); for (int j = 0; j < size; ++j) { @@ -96,8 +98,9 @@ class ConcatXPUKernel : public framework::OpKernel { "No tensor need concat")); auto& dev_ctx = ctx.template device_context(); - int r = xpu::concat(dev_ctx.x_context(), ptrs, out->data(), - xdims_list, axis); + int r = xpu::concat(dev_ctx.x_context(), ptrs, + reinterpret_cast(out->data()), + xdims_list, axis); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU concat kernel return wrong value[%d %s]", r, @@ -107,6 +110,8 @@ class ConcatXPUKernel : public framework::OpKernel { template class ConcatGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const { auto* out_grad = @@ -134,12 +139,12 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector ptrs(outs.size()); + std::vector ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - ptrs[j] = outs[j]->data(); + ptrs[j] = reinterpret_cast(outs[j]->data()); } else { ptrs[j] = nullptr; } @@ -173,8 +178,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { xdims_list[axis] = total_length; auto& dev_ctx = ctx.template device_context(); - int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, - xdims_list, split_list, axis); + int r = xpu::split( + dev_ctx.x_context(), + reinterpret_cast(out_grad->data()), ptrs, xdims_list, + split_list, axis); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External( @@ -189,9 +196,13 @@ class ConcatGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - concat, ops::ConcatXPUKernel); + concat, ops::ConcatXPUKernel, + ops::ConcatXPUKernel); REGISTER_OP_XPU_KERNEL( concat_grad, - ops::ConcatGradXPUKernel); + ops::ConcatGradXPUKernel, + ops::ConcatGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 3a047b8fce703..9915b4d8d34f8 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -56,8 +56,10 @@ XPUOpMap& get_kl2_ops() { 
pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), @@ -288,8 +290,10 @@ XPUOpMap& get_kl2_ops() { {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), From f4cc5def99caab5a95c34d681dd5434eedf08acd Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 13 Apr 2022 10:31:29 +0800 Subject: [PATCH 114/211] sharding_for_eager_tensor (#41415) --- paddle/fluid/pybind/eager_method.cc | 2 +- .../group_sharded_optimizer_stage2.py | 410 ++++++++ .../sharding/group_sharded_stage2.py | 536 ++++++++++ .../sharding/group_sharded_stage3.py | 912 ++++++++++++++++++ .../sharding/group_sharded_storage.py | 313 ++++++ .../sharding/group_sharded_utils.py | 227 +++++ .../distributed/sharding/group_sharded.py | 79 +- .../fluid/dygraph/varbase_patch_methods.py | 5 + .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/dygraph_group_sharded_api.py | 3 + .../dygraph_group_sharded_api_eager.py | 147 +++ .../unittests/dygraph_group_sharded_stage2.py | 229 +++++ .../dygraph_group_sharded_stage2_offload.py | 112 +++ .../unittests/dygraph_group_sharded_stage3.py | 283 ++++++ .../dygraph_group_sharded_stage3_offload.py | 205 ++++ .../dygraph_sharding_optimizer_stage2.py | 3 + .../unittests/dygraph_sharding_stage2.py | 3 + .../dygraph_sharding_stage2_offload.py | 3 + .../unittests/dygraph_sharding_stage3.py | 3 + .../dygraph_sharding_stage3_offload.py | 3 + .../test_dygraph_group_sharded_api.py | 1 + .../unittests/test_dygraph_sharding_stage2.py | 2 + .../unittests/test_dygraph_sharding_stage3.py | 2 + .../tests/unittests/test_egr_python_api.py | 2 +- 24 files changed, 3462 insertions(+), 25 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py create mode 100644 
python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index eb7f64a44126c..6dbed97a55f40 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -473,7 +473,7 @@ static PyObject* tensor__share_buffer_to(TensorObject* self, PyObject* args, } auto dst_tensor = static_cast(dst_ptr->impl().get()); - dst_tensor->ShareDataWith(*src_tensor); + dst_tensor->ShareBufferWith(*src_tensor); dst_tensor->ShareDataTypeWith(*src_tensor); Py_INCREF(Py_None); return Py_None; diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py new file mode 100644 index 0000000000000..9df68dc419efa --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -0,0 +1,410 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import copy +import logging +import numpy as np +from collections import OrderedDict + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group, new_group, broadcast, wait + +from .group_sharded_storage import ParamStorage, GradStorage +from .group_sharded_utils import Type, device_guard, GroupShardedClipGrad + +# CUDA alignment 256 bytes, cpu alignment 4096 bytes +alignment = {"gpu": 256, "cpu": 4096} +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + + +class GroupShardedOptimizerStage2(Optimizer): + """ + A wrapper for Sharding Stage2 Optimizer in Dygraph. + + .. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer. + + .. ZeRO: 1.https://arxiv.org/pdf/1910.02054.pdf 2.https://arxiv.org/pdf/1910.02054.pdf. + + """ + + # TODO (Baibaifan) + # Feature Notes: + # 1. Unified memory for parameters and parameters.grad to InternalStorage. + # 2. Support the segmentation of optimizer parameters and partial updating of parameters. + # 3. Dynamically adjust training parameters and models. + # 4. Support offload function. + # 5. Support the establishment of independent communication groups. + # 6. Broadcast_fp16 is not supported now. 
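    # A minimal sketch of the greedy parameter-to-rank segmentation that
    # _segment_params below performs; plain integers stand in for parameter
    # numels, and the helper name assign_to_ranks is illustrative only.
    #
    #   def assign_to_ranks(param_numels, world_size):
    #       """Greedily place each parameter on the rank with the smallest load."""
    #       buckets = [[] for _ in range(world_size)]
    #       sizes = [0] * world_size
    #       for numel in param_numels:
    #           rank = sizes.index(min(sizes))  # pick the currently least-loaded rank
    #           buckets[rank].append(numel)
    #           sizes[rank] += numel
    #       return buckets, sizes
    #
    #   # Example: five parameters sharded across two ranks.
    #   buckets, sizes = assign_to_ranks([1024, 4096, 256, 2048, 512], world_size=2)
    #   print(buckets)  # [[1024, 256, 2048, 512], [4096]]
    #   print(sizes)    # [3840, 4096]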
+ def __init__(self, + params, + optim, + group=None, + offload=False, + device="gpu", + pertrain_sync_models=True, + **kw): + + super().__init__(learning_rate=optim._learning_rate, parameters=params) + assert core.is_compiled_with_cuda(), "Only GPU is supported now" + + # Segmentation information + self._dtype_rank_params = OrderedDict( + ) # {dtype:[param1,param2]} device, rank, params + self._param2rank = {} + self.__segment_params = [] + self._rank_buffer_size = {} # {dtype: {rank: numel+alignment}} + self._param2align = {} # {param.name: align} + + # Default information + self._optim = optim + + assert hasattr(self._optim, "_master_weights" + ), "Must use optimizer with _master_weights attribute" + + # Support parameter group and parameter list + self._local_params = [] + if isinstance(params[0], dict): + for param_group in params: + self._local_params.extend(list(param_group["params"])) + else: + self._local_params.extend(list(params)) + + self._default_device = device + self._pfp16 = len( + list( + filter(lambda x: x.trainable and x.dtype == Type.fp16.value, + self._local_params))) > 0 + + self._group = new_group(_get_global_group() + .ranks) if group is None else group + + self.world_size = self._group.nranks + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[0] + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in GroupShardedOptimizerStage2, the grad clip of original optimizer will be changed." + ) + + self._optim._grad_clip = GroupShardedClipGrad( + self._optim._grad_clip, paddle.get_device(), self._group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = self._optim._grad_clip + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + + self.offload = offload # Using for offload + self.offload_device = "cpu" + self.offload_buffer_size = 0 + self.offload_param2align = {} + self.offload_params = None + self.offload_grads = None + self.dev_id = int(paddle.get_device().split(":")[1]) + + self._master_params = {} + + # Update optimizer parameters and adjust parameter storage and use according to rank. + self._update_opt_status() + + @paddle.autograd.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._local_params: + broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + def _generate_master_params(self, trainable_params): + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.eager.Tensor( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + master_tensor = paddle.cast(param, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.name] = master_tensor + + def _update_opt_status(self): + """Update optimizer status and parameter storage information, and special functions to be developed. 
+ """ + # func 1 + self._integration_params() + + # Segement helpers + + def _segment_params(self): + """ + Divide all optimizer parameters equally into rank. + """ + if len(self.__segment_params) == 0: + self.__segment_params, param_lists = [ + [] for _ in range(self.world_size) + ], [[] for _ in range(self.world_size)] + sizes = [0] * self.world_size + for param in self._local_params: + # Add this param to rank with smallest size. + rank = sizes.index(min(sizes)) + param_lists[rank].append(param) + + # Statistical real numels + sizes[rank] += param._numel() if param.trainable else 0 + + for rank, params in enumerate(param_lists): + self.__segment_params[rank].extend(params) + return self.__segment_params + + @property + def local_params(self): + return self._local_params + + @property + def param2rank(self): + """Map the params to the rank which owns them""" + if len(self._param2rank) == 0: + for rank, params in enumerate(self._segment_params()): + for param in params: + self._param2rank[param.name] = rank + return self._param2rank + + @property + def dtype_rank_params(self): + """ + Divide the parameters into groups according to rank and dtype. + """ + if len(self._dtype_rank_params) == 0: + # Assign the parameters of each rank according to the type + for param in self._local_params: + if param.dtype not in self._dtype_rank_params.keys(): + self._dtype_rank_params[ + param.dtype] = [[] for _ in range(self.world_size)] + self._dtype_rank_params[param.dtype][self.param2rank[ + param.name]].append(param) + + # Sort per rank params by size + for dtype in self._dtype_rank_params.keys(): + for rank_params in self._dtype_rank_params[dtype]: + rank_params.sort(key=lambda x: x._numel()) + + return self._dtype_rank_params + + @property + def rank_buffer_size(self): + """ + Count the memory size of the parameters corresponding to rank under the corresponding dtype. + """ + # CUDA alignment 256 bytes + if len(self._rank_buffer_size) == 0: + for dtype in self.dtype_rank_params.keys(): + if dtype not in self._rank_buffer_size.keys(): + self._rank_buffer_size[dtype] = {} + for dst_rank, per_rank_params in enumerate( + self.dtype_rank_params[dtype]): + if dst_rank not in self._rank_buffer_size[dtype].keys(): + self._rank_buffer_size[dtype][dst_rank] = 0 + for param in per_rank_params: + if not param.trainable: + continue + size = param._numel() * align[dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[dtype] + self._rank_buffer_size[dtype][dst_rank] += param._numel( + ) + align_ + self._param2align[param.name] = align_ + + return self._rank_buffer_size + + def _integration_params(self): + """ + Integrate the parameters into a continuous memory according to rank, and support the update of training parameters. 
+ """ + + for dtype, per_rank_params in self.dtype_rank_params.items(): + if dtype not in self.param_storages.keys(): + self.param_storages[dtype] = {} + + for dst_rank, params in enumerate(per_rank_params): + if len(params) > 0: + + # Merge all the trainable params in a single InternalStorage + trainable_params = list( + filter(lambda x: x.trainable, params)) + if self._pfp16 and dst_rank == self._rank: + self._generate_master_params(trainable_params) + if trainable_params: + param_storage = ParamStorage( + size=self.rank_buffer_size[dtype][dst_rank], + dtype=dtype, + device=self._default_device) + + param_storage.add_rank_params(trainable_params, + self._param2align) + self.param_storages[dtype][dst_rank] = param_storage + + # Clear the InternalStorage keys which are not in use anymore + dtype_in_use = list(self.dtype_rank_params.keys()) + dtype_to_pop = list( + filter(lambda x: x not in dtype_in_use, self.param_storages.keys())) + for d in dtype_to_pop: + self.param_storages.pop(d) + + if self.offload: + self._optim._master_weights = self._master_params + cpu_master_params = [p for p in self._master_params.values()] + for param in cpu_master_params: + size = param._numel() * align[Type.fp32.value] + remaining = size % alignment[self.offload_device] + ali = 0 if remaining == 0 else alignment[ + self.offload_device] - remaining + align_ = ali // align[Type.fp32.value] + self.offload_buffer_size += param._numel() + align_ + self.offload_param2align[param.name] = align_ + + if cpu_master_params: + with device_guard(self._rank, self.offload_device): + self.offload_params = ParamStorage( + size=self.offload_buffer_size, + dtype=Type.fp32.value, + device=self.offload_device) + self.offload_params.buffer.name = "offload_buffer" + self.offload_params.add_rank_params( + cpu_master_params, self.offload_param2align, False) + self.offload_params.buffer.stop_gradient = False + + self.offload_grads = GradStorage( + size=self.offload_buffer_size, + dtype=Type.fp32.value, + device=self.offload_device, + destination=self._rank, + parm2align=self.offload_param2align, + convert_cpu=True) + for p in cpu_master_params: + self.offload_grads.add_grad( + p, self.offload_param2align[p.name]) + + self._optim._master_weights[ + self.offload_params.buffer. + name] = self.offload_params.buffer + + def _offload_acc_grad(self, param_name, grad_fp32_cpu): + """accumulate grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + if param_name in self._master_params.keys(): + if self._master_params[param_name].grad is None: + self._master_params[param_name]._copy_gradient_from( + grad_fp32_cpu) + else: + self._master_params[param_name].grad.add_(grad_fp32_cpu) + + self.offload_params.buffer._copy_gradient_from( + self.offload_grads.buffer) + + def _offload_scale_grad(self, scale_size): + """scale grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + self.offload_grads.buffer.scale_(scale=scale_size) + + def _offload_clear_grad(self): + """clear grads with offload strategy""" + with device_guard(self._rank, self.offload_device): + self.offload_grads.buffer.zero_() + + def step(self): + """ + A wrapper for Optimizer's step function to finish the update operation of the optimizer. 
+ """ + + if self.offload: + params_list = [self.offload_params.buffer] + + #TODO(Baibaifan): Offload will support param_groups later + if not isinstance(self._optim._param_groups[0], dict): + self._optim._parameter_list = params_list + self._optim._param_groups = params_list + + # Run the optimizer of the current rank step + if self.offload: + with device_guard(device=self.offload_device): + self._optim.step() + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda( + self.dev_id).cast(dtype=param.dtype)) + else: + self._optim.step() + + # Synchronize all the updated shards in between the ranks + self._broadcast_params() + + def minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()") + + def set_state_dict(self, state_dict): + self._optim.set_state_dict(state_dict) + + def state_dict(self): + return self._optim.state_dict() + + def _clear_cache(self): + self.__segment_params.clear() + self._dtype_rank_params.clear() + self._param2rank.clear() + + @paddle.autograd.no_grad() + def _broadcast_params(self): + """Broadcast the parameters of the current rank to each rank""" + + # Exchange all the shards with the other ranks + for dtype_per_rank in self.param_storages.values(): + for dst_rank, internal_storage in dtype_per_rank.items(): + broadcast( + tensor=internal_storage.buffer, + src=self._group.ranks[dst_rank], + group=self._group, + use_calc_stream=True) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py new file mode 100644 index 0000000000000..5f39ea0fd900f --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -0,0 +1,536 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import logging +import time +import functools +import numpy as np +from functools import reduce +from collections import deque +from types import MethodType + +import paddle +from paddle import nn +from paddle.distributed import collective +from paddle.distributed.utils import get_logger + +from .group_sharded_storage import GradStorage +from .group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from .group_sharded_utils import Taskflow, Type, device_guard + +logger_ = get_logger(logging.INFO) + + +def _trainable(param): + return param.trainable + + +class GroupShardedStage2(nn.Layer): + """ + A wrapper for Sharding Stage2 Layer in Dygraph. + .. warning: GroupShardedStage2 encapsulates the layer strategy and integrates it into the nn.Layer. + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + # TODO (Baibaifan) + # Feature Notes:: + # 1. 
Unified memory for param and param.grad to InternalStorage. + # 2. Divide param.grad according to rank to centrally apply for and release GPU memory. + # 3. Dynamically adjust training parameters and models. + # 4. Support offload function. + # 5. Support the establishment of independent communication groups. + + def __init__( + self, + layer, + sharding_optimizer, + group=None, + sync_buffers=False, + buffer_max_size=2**23, #8MB + auto_refresh_trainable=True, + device="gpu"): + super().__init__() + + # training options + self._layer = layer + self._sharding_optimizers = [sharding_optimizer] if not isinstance( + sharding_optimizer, list) else sharding_optimizer + assert all( + list( + map(lambda opt: isinstance(opt, GroupShardedOptimizerStage2), + self._sharding_optimizers)) + ), "Please use GroupShardedOptimizerStage2 optimizer" + self._sync_buffers = sync_buffers + self._auto_refresh_trainable = auto_refresh_trainable + + # Communication related attributes + self._group = collective.new_group(collective._get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1" + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[ + 0] # picking ranks index 0 as the reference + self._default_device = device + + # Global statistical parameters + self._all_params = [] + for optim in self._sharding_optimizers: + self._all_params.extend(list(optim.local_params)) + + self._trainable_params = [] + self._grad_reduced = [] + self._trainable_param2rank = {} + self._trainable_param2align = {} + self._trainable_mask = list(map(_trainable, self._all_params)) + self._param_grads = [] + + # Set grad storage size & Display param sizes and model sizes + model_size = sum([p._numel() for p in self._layer.parameters()]) + assert buffer_max_size >= 0, "buffer_max_size must be GE than 0." + self._buffer_max_size = self._rank_buffer_size(buffer_max_size, + model_size) + self._use_grad_storage = buffer_max_size > 0 + self._grad_storages = {} # {dtype: {rank: GradStorage}} + self._has_grad_storage = [] + self._grad_storage_list = [] + + # Offload + # TODO(haohongxiang): Now it's not be supported for multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = len(self._offload_optims) > 0 + self._offload_device = "cpu" + + # Set backward pass hooks + self._bw_hooks = [] + + # TODO (Baibaifan) Set tasks flow support asynchronous communicate + # self._tasks_flow = deque() + + # Define optimizer step and clear_grad + self._redefine_opt_step() + self._redefine_opt_clear() + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage2 layer. 
+ - Fresh trainable params or rebuild grad storage + - Sync layer's buffer params + - Clear all flags states + - Forward for origin layers + """ + + # Whether to need to reset trainable parameters + needs_fresh = len(self._bw_hooks) == 0 and self.training + + if self._auto_refresh_trainable: + needs_fresh |= self._detect_train_change() + + # Front hook + self._init_internal_storage(needs_fresh) + + # Sync layer's buffers state + if self._sync_buffers: + self.__sync_buffers() + + # Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def set_state_dict(self, state_dict, use_structured_name=True): + self._layer.set_state_dict( + state_dict, use_structured_name=use_structured_name) + + def state_dict(self, + destination=None, + include_sublayers=True, + structured_name_prefix=""): + return self._layer.state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix) + + def _clear_gradients(self): + """ + Set zero to the gradient of the optimizer's current rank trainable parameters. + """ + # Release grad storages + for dtype in self._grad_storages.keys(): + if not self._offload and self._rank in self._grad_storages[ + dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.zero_() + + # Release grads of params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param._zero_grads() + + # Release grads of master params with offload strategy + if self._offload: + self._sharding_optimizers[0]._offload_clear_grad() + + def _grad_scale(self): + """ + Before the gradient accumulation, scale the gradient. + """ + # Scale grad storages + for dtype in self._grad_storages.keys(): + if not self._offload and self._rank in self._grad_storages[ + dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale grads of params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + # param._reset_grad_inplace_version(True) + + # Scale grads of master params with offload strategy + if self._offload: + self._sharding_optimizers[0]._offload_scale_grad( + self._world_size_scaling) + + def _init_internal_storage(self, needs_fresh): + """ + Judge Fresh trainable params or rebuild grad storage. + """ + if needs_fresh: + self._fresh_trainable() + else: + self._build_grad_storages() + + # Clear all flags state + self._clear_counters() + + def to(self, device=None, dtype=None, blocking=True): + """ + Synchronously or asynchronously convert the data type of the layer, the device is not supported now. + """ + assert isinstance(device, str), "Device must be type str" + assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + + def _fresh_trainable(self): + """ Whether to update training parameters. 
""" + + # Make sure that this is not done while gradients are waiting to be reduced (if no_sync context for instance) + if reduce(lambda x, y: x or y, self._grad_reduced, False): + logging.warning("Grads waiting to be reduced.") + + self._trainable_params = list( + filter(lambda x: x.trainable, self._all_params)) + self._trainable_params.sort(key=lambda x: x._numel()) + + self._trainable_param2rank = {} + for optim in self._sharding_optimizers: + # Need to be wrappered for Sharding Stage2 Optimizer + if len(optim.param_storages.keys()) == 0: + optim._update_opt_status() + + # Get the parameters split by the optimizer according to rank + for per_rank_params in optim.dtype_rank_params.values( + ): # all the params from all ranks + for params in per_rank_params: + for param in filter(lambda x: x.trainable, params): + self._trainable_param2rank[ + param.name] = optim.param2rank[param.name] + self._trainable_param2align[ + param.name] = optim._param2align[param.name] + + # Create grad_storage + self._setup_use_grad_storage() + # setup backward hooks + self._setup_backward_hooks() + + @paddle.autograd.no_grad() + def __sync_buffers(self): + """ + Sync all the param buffers from all ranks (exp: batch norm statistics). + """ + + for buffer in self._layer.buffers(include_sublayers=True): + collective.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + @paddle.autograd.no_grad() + def _clear_counters(self): + """Reset all the grad reduce and call counters.""" + if self.training: + self._grad_reduced = [True for _ in self._trainable_params] + + if self._use_grad_storage: + for grad_storage in self._grad_storage_list: + grad_storage.reset_checked_in() + + def _get_reduce_fn(self, index, param, dst_rank): + """ + There are two ways to reduce gradient. + - 1. Do not use self._use_grad_storage or exceeded buffer_max_size will be reduced separately. + - 2. Use grad_storage Reduce the storage to get the full gradient from different ranks. 
+ """ + + if not self._use_grad_storage or not self._has_grad_storage[index]: + # Direct reduction + @paddle.autograd.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + + # Clear the gradient that does not belong to the current rank through the callback function + def cleanup(): + if dst_rank != self._rank: + param.clear_gradient(False) + elif self._offload: + tmp_grad = param.grad.cast( + dtype=Type.fp32.value).cpu() + + self._sharding_optimizers[0]._offload_acc_grad( + param.name, tmp_grad) + del tmp_grad + param.clear_gradient(False) + + # Synchronize the reduce parameter gradient + collective.reduce( + tensor=param.grad, + dst=self._group.ranks[dst_rank], + group=self._group) + # TODO (Baibaifan) Asynchronous the reduce parameter gradient + + # Clear the task flow and trigger callback to clear the redundant gradient + # self._clear_task_flow() + + cleanup() + + else: + # Buffer reduction + @paddle.autograd.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + grad_storage = self._grad_storages[param.dtype][dst_rank] + grad_storage.params_checked_in += 1 + + if grad_storage.all_checked_in: + assert grad_storage.buffer is not None + + # Clearing up the grad_storage buffer + def cleanup(): + if dst_rank != self._rank: + for p in grad_storage._params: + p.clear_gradient(False) + + grad_storage.buffer._clear_data() + elif self._offload: + grad_storage.to(device=self._offload_device) + for p in grad_storage._params: + with device_guard(): + tmp_grad = p.grad.cast( + dtype=Type.fp32.value) + self._sharding_optimizers[ + 0]._offload_acc_grad(p.name, tmp_grad) + p.clear_gradient(False) + grad_storage._device = self._default_device + grad_storage.buffer._clear_data() + + # Reduce the bucket + grad_storage.sent = True + # Synchronize the reduce parameter gradient + collective.reduce( + tensor=grad_storage.buffer, + dst=self._group.ranks[grad_storage.destination], + group=self._group) + # TODO (Baibaifan) Asynchronous the reduce parameter gradient + + cleanup() + + # Clear the task flow and trigger callback to clear the redundant gradient + # self._clear_task_flow() + + return reduce + + def _setup_backward_hooks(self): + """ + Set the backward hook to synchronize the gradients of all rank by reduce group ranks. + """ + + # Remove previous backward hooks + while len(self._bw_hooks) > 0: + self._bw_hooks.pop().remove() + + # Go through the parameters, attach the hook + if not self.training: + return + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + reduce_function = self._get_reduce_fn(index, param, dst_rank) + + self._bw_hooks.append( + param._register_backward_hook(reduce_function)) + + def _setup_use_grad_storage(self): + """ + Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters. 
+ """ + + # According to parameters's numel sort, allocate memory of parameter gradient to continuous memory according to rank + self._grad_storages = {} + self._has_grad_storage = [False for _ in self._trainable_params] + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + if param.dtype not in self._grad_storages.keys(): + self._grad_storages[param.dtype] = {} + + if dst_rank not in self._grad_storages[param.dtype].keys(): + self._grad_storages[param.dtype][dst_rank] = GradStorage( + self._buffer_max_size[param.dtype], + dtype=param.dtype, + device=self._default_device, + destination=dst_rank, + parm2align=self._trainable_param2align) + + # Criteria to decide whether this parameter is to be put in GradStorage + if self._grad_storages[param.dtype][dst_rank].can_add_grad_view( + param, self._trainable_param2align[param.name]): + self._grad_storages[param.dtype][dst_rank].add_grad( + param, self._trainable_param2align[param.name]) + self._has_grad_storage[index] = True + else: + self._param_grads.append(param.name) + print( + "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, ". + format(param.name, param.shape, self._trainable_param2align[ + param.name], self._grad_storages[param.dtype][dst_rank] + ._fill)) + + for dtype in self._grad_storages.keys(): + self._grad_storage_list.extend( + list(self._grad_storages[dtype].values())) + + # def _clear_task_flow(self): + # """Try to consume the previous tasks.""" + # while len(self._tasks_flow) > 0: + # task = self._tasks_flow.popleft() + # task.wait() + # if task.callback is not None: + # task.callback() + + def _detect_train_change(self): + # Current trainable parameters + trainable_mask = list(map(_trainable, self._all_params)) + + # Whether parameters trainability changed + trainability_changed = trainable_mask != self._trainable_mask + + if trainability_changed: + logging.warning( + "Trainable params changed, because of eval/train mode or parameter freezing/unfreeze." + ) + self._trainable_mask = trainable_mask + + return trainability_changed + + def _build_grad_storages(self): + """ + Rebuild grad storages. + """ + # Rebuild fp16/fp32 grad storages + for dtype in self._grad_storages.keys(): + for dst_rank, grad_storage in self._grad_storages[dtype].items(): + if self._offload or dst_rank != self._rank: + grad_storage.manumal_relase() + grad_storage.rebuild() + + def _rank_buffer_size(self, buffer_max_size, model_size): + """ + Generate the minimum buffer size for each rank & Display param sizes and model sizes. + """ + + # Initialize buffer size + rank_buffer_size = {} + for shard_opt in self._sharding_optimizers: + if shard_opt.rank_buffer_size: + for dtype in shard_opt.rank_buffer_size.keys(): + sizes = max(shard_opt.rank_buffer_size[dtype].values()) + rank_buffer_size[dtype] = min(sizes, buffer_max_size) + + if Type.fp16.value in rank_buffer_size.keys(): + # FP16 GradStorage and model size + logger_.info( + "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". + format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2 + **19)) + if Type.fp32.value in rank_buffer_size.keys(): + # FP32 GradStorage and model size + logger_.info( + "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". 
+ format(rank_buffer_size[Type.fp32.value] / 2**18, model_size / 2 + **18)) + return rank_buffer_size + + def _redefine_opt_step(self): + grad_func = self._grad_scale + for opt in self._sharding_optimizers: + opt_step = opt.step + + def _opt_step(self): + grad_func() + opt_step() + + opt.step = MethodType(_opt_step, opt) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + for opt in self._sharding_optimizers: + opt.clear_grad = MethodType(_opt_clear, opt) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py new file mode 100644 index 0000000000000..049d3ffa3694f --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -0,0 +1,912 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import logging +import numpy as np +from types import MethodType +from collections import OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import EagerPyLayer +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import EagerParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed import collective + +from .group_sharded_storage import GradStorage +from .group_sharded_utils import Type, GroupShardedClipGrad, device_guard + + +def _all_gather(tensor, buffer_size, group): + """ + The main difference with paddle.distributed.all_gather: + no need to pass in tensor_list, the returned tensor is spliced + """ + + assert group is not None + if framework.in_dygraph_mode(): + out = paddle.zeros([buffer_size], dtype=tensor.dtype) + task = group.process_group.all_gather(tensor, out) + return out, task + + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class GroupShardedStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: GroupShardedStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + # TODO (Baibaifan) + # Feature Notes:: + # 1. The model supports the segmentation of parameters by global ranks in layers. + # 2. Support communication flow and computing flow. + # 3. Support offload function. + # 4. Support the establishment of independent communication groups. + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + segment_size=2**20, + pertrain_sync_models=True, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." 
+ self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._offload = offload + self._sync_comm = sync_comm + # segmentation size + assert segment_size >= 0, "segment_size must be GE than 0." + self._segment_size = segment_size + + global DEV + DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device( + ).split(":")[0] + global DEV_ID + DEV_ID = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + global param2dtype + param2dtype = dict() + + # Communication group establishment + self._group = collective.new_group(collective._get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = self._group.ranks[ + 0] # picking ranks index 0 as the reference + + # Parameter segmentation for global ranks + # After flatten -> self._param2buffer_size, self._param2buffer, self._trainable_params + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {id(layer): [trainable_params]} + self._unslice_params = set() # param's numel <= segment_size + self._unslice_params2align = dict() # {param.name: param's align} + self._grad_storages = dict() # {param.dtype: GradStorage} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in GroupShardedStage3, the grad clip of original optimizer will be changed." 
+ ) + self._optim._grad_clip = GroupShardedClipGrad( + self._optim._grad_clip, paddle.get_device(), self._group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = self._optim._grad_clip + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # Add unslice params to master_weight in fp16 + self._handle_unslice_params() + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = list() + + # Register task flow + self._task_flow = TaskFlow() + + # Register forward hooks + self._register_forward_hooks(self._layer) + + # Register backward parameter hooks + self._register_backward_hooks() + + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.autograd.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + collective.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + # 1.Handle param's slice + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + param.fw_storage.clear_gradient(False) + param.bw_storage._clear() + param.bw_storage = None + # 2.Handle unslice param + if not self._offload: + for grad_storage in self._grad_storages.values(): + grad_storage.buffer.zero_() + else: + for param in list(self._unslice_params): + param.clear_gradient(False) + tmp_var = param.cuda(DEV_ID) + param._clear_data() + if tmp_var.dtype == Type.fp32.value and param2dtype[ + param.name] == Type.fp16.value: + tmp_var = paddle.cast(tmp_var, Type.fp16.value) + tmp_var._share_buffer_to(param) + del tmp_var + for grad_storage in self._grad_storages.values(): + grad_storage.manumal_relase() + grad_storage.rebuild() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = slice_params + list( + self._unslice_params) + self._optim._param_groups = slice_params + list( + self._unslice_params) + else: + for param_group in self._optim._param_groups: + p_group = [] + for p in param_group['params']: + if hasattr(p, "fw_storage"): + p_group.append(p.fw_storage) + else: + p_group.append(p) + + param_group['params'] = p_group + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. 
+ """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def set_state_dict(self, state_dict, use_structured_name=True): + self._layer.set_state_dict( + state_dict, use_structured_name=use_structured_name) + + def state_dict(self, + destination=None, + include_sublayers=True, + structured_name_prefix=""): + return self._layer.state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix) + + def _handle_unslice_params(self): + buffer_size = dict() + buffer_size[Type.fp32.value] = 0 + buffer_size[Type.fp16.value] = 0 + for param in self._unslice_params: + # Updata optimizer master weights + if param.dtype == Type.fp16.value and not self._offload: + master_tensor = paddle.cast(param, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.name] = master_tensor + param2dtype[param.name] = param.dtype + p_align = self._param2align(param) + self._unslice_params2align[param.name] = p_align + buffer_size[param.dtype] += param._numel() + p_align + + # Create unslice_params'grad + for param in sorted(list(self._unslice_params), key=lambda p: p.name): + if param.dtype not in self._grad_storages.keys(): + self._grad_storages[param.dtype] = GradStorage( + buffer_size[param.dtype], + dtype=param.dtype, + device=self._default_device, + destination=self._rank, + parm2align=self._unslice_params2align) + self._grad_storages[param.dtype].add_grad( + param, self._unslice_params2align[param.name]) + + def _segment_rank_params(self, layer, name="last_layer"): + """ + Flatten parameters according to layer. + """ + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + """ + Parameter segmentation and memory integration. + """ + + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + current_params = list() + for p in current_layer_params: + if p.trainable and p._numel() > self._segment_size: + current_params.append(_add_manage_info(p)) + elif p.trainable: + self._unslice_params.add(_UnsliceParam(p)) + + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = current_params + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + align_ = self._param2align(param) + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # Record param's dtype + param2dtype[param.name] = param.dtype + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + """ + This is a function to simplify the handling of parameter InternalStorages. 
+ """ + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.eager.Tensor(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + with device_guard(): + tmp_var = buffer._slice(0, param._numel()) + param_cpu = param.cpu() + tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace()) + del tmp_var + param.get_tensor()._set_dims(param_shape) + param._clear_data() + + # Current rank param_storage + if self._offload: + with device_guard(): + tmp_tensor = buffer._slice(start, end) + param.fw_storage = core.eager.Tensor( + value=tmp_tensor, + place=core.CPUPlace(), + name="slice@" + param.name) + else: + param.fw_storage = core.eager.Tensor( + value=buffer._slice(start, end), name="slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value and not self._offload: + master_tensor = paddle.cast(param.fw_storage, Type.fp32.value) + master_tensor.name = param.name + self._optim._master_weights[param.fw_storage.name] = master_tensor + + def _register_forward_hooks(self, layer): + """ + Register EagerPyLayer to manage memory slices. + There are four stages: + FW + 1. Before the forward layers, synchronize the full parameters. + 2. After the forward layers, release the full parameter and keep the parameter slice. + BW + 3. Before the backward layers, synchronize the full parameters and create param's grad. + 4. After the gradient accumulation, release the full parameter and keep the parameter slice. + """ + current_layer_params = _current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, + self._param2buffer_size, self._group, + self._sync_comm, self._offload, task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, self._offload, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.autograd.no_grad() + def _sync_buffers(self): + """ + Sync all the param buffers from all ranks (exp: batch norm statistics). + """ + + for buffer in self._layer.buffers(include_sublayers=True): + collective.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + """ + Update parameters to optimizer memory slice. 
+ """ + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + # 1.Handle param's slice + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + # Gradient average + if self._offload: + with device_guard(): + param.bw_storage.scale_(scale=self._world_size_scaling) + else: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + assert param.fw_storage.grad is None + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + + # 2.Handle unslice param + for grad_storage in self._grad_storages.values(): + grad_storage.buffer.scale_(scale=self._world_size_scaling) + collective.all_reduce(tensor=grad_storage.buffer, group=self._group) + if self._offload: + for param in list(self._unslice_params): + tmp_var = _device2cpu(param, convert_dtype=True) + tmp_var._share_buffer_to(param) + del tmp_var + + for grad_storage in self._grad_storages.values(): + for p in grad_storage._params: + tmp_g = _device2cpu(p.grad, convert_dtype=True) + p.clear_gradient(False) + p._copy_gradient_from(tmp_g) + del tmp_g + grad_storage.buffer._clear() + + return update_list + + def get_all_parameters(self, convert2cpu=False): + """ + Get the full parameters and return the corresponding task flows. + """ + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + t_flow = _allgather_buffer( + trainable_params, + self._group, + param2buffer_size=self._param2buffer_size, + use_calc_stream=True, + task_flow=TaskFlow(), + sync_wait=True, + offload=self._offload, + convert2cpu=convert2cpu) + if convert2cpu: + for param in trainable_params: + t_flow.full_param[param.name][0]._share_buffer_to(param) + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda p: p.trainable and p not in self._unslice_params, + current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.autograd.no_grad() + def allreduce_(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + # Only support sync allreduce current rank's layer now + collective.all_reduce(tensor=full_grad, group=self._group) + + start, end = self._param2buffer[param.name][self._rank] + if param.bw_storage is None: + param.bw_storage = full_grad._slice(start, + end).detach().clone() + if self._offload: + param.bw_storage = _device2cpu(param.bw_storage, True) + else: + if self._offload: + cpu_grad = _device2cpu( + full_grad._slice(start, end).detach().clone(), True) + with device_guard(): + param.bw_storage = paddle.add(param.bw_storage, + cpu_grad) + else: + param.bw_storage = paddle.add( + param.bw_storage, + full_grad._slice(start, end).detach().clone()) + param.clear_gradient(False) + del self._task_flow.full_grad[param.name] + + if 
param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear_data() + start, end = self._param2buffer[param.name][self._rank] + param.fw_storage = self._task_flow.full_param[param.name][ + 0]._slice(start, end).detach().clone() + param.status = "part" + del self._task_flow.full_param[param.name] + + if self._offload: + param.fw_storage = _device2cpu(param.fw_storage, True) + + return allreduce_ + + def _param2align(self, param): + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + return align_ + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + + def _opt_step(self): + if not self.update_scaler: + params_slice_func() + if self.offload: + with device_guard(): + opt_step() + else: + opt_step() + + def _opt_minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()" + ) + + self._optim.step = MethodType(_opt_step, self._optim) + self._optim.minimize = MethodType(_opt_minimize, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size, + group, sync_comm, offload, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + else: + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + # wait current layer params + _wait_layer(trainable_params[layer_id], task_flow, group, + param2buffer_size, use_calc, offload) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + + _allgather_buffer( + trainable_params[layer_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + + return + + +class ForwardPostHooks(EagerPyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + offload, task_flow): + + layer_id = id(layer) + # release current layer full params + _release_param(trainable_params[layer_id], param2buffer, rank, + task_flow, offload) + + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + + #Record fw info + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer_id = layer_id + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + ctx.offload = offload + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer_id = ctx.layer_id + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + offload = ctx.offload + use_calc, sync_wait = False, False + + # Allgather 
params synchronization + if sync_comm: + use_calc, sync_wait = True, True + _allgather_buffer( + trainable_params[layer_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + else: + _wait_layer(trainable_params[layer_id], task_flow, group, + param2buffer_size, use_calc, offload) + + # Create params's grad + _create_params_grad(trainable_params[layer_id], param2buffer_size, + task_flow) + + # Whether to use calc stream + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + trainable_params[layer_next_id], + group, + param2buffer_size=param2buffer_size, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait, + offload=offload) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(trainable_params, + param2buffer, + rank, + task_flow, + offload=False): + for param in trainable_params: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear_data() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = task_flow.full_param[param.name][ + 0]._slice(start, end).detach().clone() + param.status = "part" + del task_flow.full_param[param.name] + + if offload: + param.fw_storage = _device2cpu(param.fw_storage) + return + + +def _wait_layer(trainable_params, + task_flow, + group, + param2buffer_size, + use_calc_stream, + offload=False): + + for param in trainable_params: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param, task = task_flow.full_param[param.name] + task.wait() + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + trainable_params, + group, + param2buffer_size=param2buffer_size, + use_calc_stream=True, + task_flow=task_flow, + sync_wait=True, + offload=offload) + break + return task_flow + + +def _allgather_buffer(trainable_params, + group, + param2buffer_size, + use_calc_stream, + task_flow, + sync_wait=False, + offload=False, + convert2cpu=False): + + for param in trainable_params: + if param.status == "all": + param.use_count += 1 + continue + + if offload: + param.fw_storage = _cpu2device(param) + + buffer_size = param2buffer_size[param.name] + with paddle.amp.auto_cast(enable=False): + full_param, task = _all_gather(param.fw_storage, buffer_size, group) + + # Allgather current layer in the 1st step synchronously + if sync_wait: + with paddle.amp.auto_cast(enable=False): + task.wait() + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = (full_param, task) + + # parameter converts to cpu + if convert2cpu: + p_name = param.name + param = _device2cpu(param) + del task_flow.full_param[p_name] + task_flow.full_param[p_name] = (param, None) + + return 
task_flow + + +@paddle.autograd.no_grad() +def _create_params_grad(trainable_params, param2buffer_size, task_flow): + for param in trainable_params: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + temp_tensor = temp_grad._slice(0, param._numel()) + temp_tensor.get_tensor()._set_dims(param.shape) + param._copy_gradient_from(temp_tensor) + del temp_tensor + task_flow.full_grad[param.name] = temp_grad + return task_flow + + +def _PartitionParam(param): + if not hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _UnsliceParam(param): + if not hasattr(param, "unslice"): + setattr(param, "unslice", True) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = EagerParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "_group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _device2cpu(trans_param, convert_dtype=False): + if convert_dtype: + trans_param = paddle.cast(trans_param, Type.fp32.value) + tmp_p = trans_param.cpu() + trans_param._clear_data() + return tmp_p + + +def _cpu2device(param): + tmp_p = param.fw_storage.cuda(DEV_ID) + if tmp_p.dtype == Type.fp32.value and param2dtype[ + param.name] == Type.fp16.value: + tmp_p = paddle.cast(tmp_p, Type.fp16.value) + return tmp_p + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py new file mode 100644 index 0000000000000..7a57fb29b9472 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py @@ -0,0 +1,313 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
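Editor's note: the stage-3 helpers above (`_PartitionParam`, `_allgather_buffer`, `_release_param`) keep only the local rank's slice of each flattened parameter and rebuild the full tensor with an all-gather right before it is needed. The standalone sketch below is illustrative only — `shard`, `unshard` and the 4-rank setup are invented for this note, not code from the patch — and shows just the pad/slice/concatenate arithmetic behind that round trip:

    import numpy as np

    def shard(flat, world_size):
        # Pad the flattened parameter so it splits evenly across ranks.
        pad = (-flat.size) % world_size
        padded = np.concatenate([flat, np.zeros(pad, flat.dtype)])
        shard_len = padded.size // world_size
        # Each rank keeps only its contiguous slice (its "fw_storage").
        return [padded[r * shard_len:(r + 1) * shard_len] for r in range(world_size)]

    def unshard(shards, numel):
        # The all-gather step: every rank contributes its slice, padding is dropped.
        return np.concatenate(shards)[:numel]

    param = np.arange(10, dtype=np.float32)
    slices = shard(param, world_size=4)
    assert np.allclose(unshard(slices, param.size), param)
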
+#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import os +import time +import numpy as np + +import paddle +from paddle.fluid import core +from .group_sharded_utils import Type, device_guard + + +class InternalStorage: + """ + This is a basic class, which is responsible for consolidating the basic storage tensor. + + """ + + # Support integration parameter tensor + def __init__(self, size, dtype, device, convert_cpu=False): + self._params = [] + self._param_ids = [] + self._fill = 0 + self._device = device + self._dtype = dtype + + # The flatten tensor + size = [size] if isinstance(size, int) else size + if convert_cpu: + value = np.zeros( + size, + dtype=np.float16) if Type.fp16.value == dtype else np.zeros( + size, dtype=np.float32) + self.buffer = core.eager.Tensor(value=value, place=core.CPUPlace()) + else: + self.buffer = paddle.zeros(size, dtype=dtype) + + self.dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" + assert (dtype == Type.fp32.value or + Type.fp16.value), "Conversion type is not supported now" + + if self._device != device: + tmp_buffer = self.buffer.cuda( + self.dev_id) if device == "gpu" else self.buffer.cpu() + for param in self._params: + param.clear_gradient(False) + + del self.buffer + self.buffer = tmp_buffer + self._device = device + + if dtype is not None: + self.buffer = self.buffer.cast(dtype=dtype) + self._dtype = dtype + + +class ParamStorage(InternalStorage): + """ + This is a basic class to simplify the handling of parameter InternalStorages. + """ + + def __init__(self, size, dtype, device): + super().__init__(size, dtype, device, convert_cpu=True) + self.param2align = None + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + + super().to(device, dtype) + + if keep_alignment: + self._array_params() + + @paddle.autograd.no_grad() + def add_rank_params(self, trainable_params, param2align, convert_gpu=True): + """ + Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer. 
+ """ + + assert all([ + id(param) not in self._param_ids for param in trainable_params + ]), "The same param cannot be checked in twice" + assert self.buffer is not None + + self.param2align = param2align + + cpu_param_shape = list() + for param in trainable_params: + p_shape = self._add_param_as_view(param, param2align[param.name], + convert_gpu) + cpu_param_shape.append(p_shape) + + if convert_gpu: + # buffer convert from cpu to cuda + self.buffer = self.buffer.cuda(self.dev_id) + + self._fill = 0 + + for idx, param in enumerate(trainable_params): + self._convert_buffer(param, cpu_param_shape[idx], + param2align[param.name]) + self._params.append(param) + self._param_ids.append(id(param)) + + @paddle.autograd.no_grad() + def _add_param_as_view(self, param, align, convert_gpu=True): + + assert ( + param.dtype == self.buffer.dtype + ), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format( + param.dtype, self.buffer.dtype) + + var_end = self._fill + param._numel() + offset = var_end + align + assert offset <= self.buffer._numel() + + p_shape = param.shape + + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + + # Copy the current param value + + with device_guard(self.dev_id, "cpu"): + tmp_var = self.buffer._slice(self._fill, var_end) + if convert_gpu: + param_cpu = param.cpu() + param._clear_data() + tmp_var.set_value(param_cpu) + else: + tmp_var.set_value(param) + del tmp_var + + self._fill = offset + return p_shape + + @paddle.autograd.no_grad() + def _convert_buffer(self, param, p_shape, align): + + var_end = self._fill + np.prod(p_shape).tolist() + offset = var_end + align + assert offset <= self.buffer._numel() + + # Convert the param value + with device_guard(self.dev_id, self._device): + tmp_tensor = self.buffer._slice(self._fill, var_end) + tmp_tensor._share_buffer_to(param) + param.get_tensor()._set_dims(p_shape) + + self._fill = offset + + @paddle.autograd.no_grad() + def _array_params(self): + """ + Given the parameters which have been registered previously, rebuild the whole InternalStorage. + """ + assert len(self._params) > 0 + assert self.param2align is not None + + self._fill = 0 + for p in self._params: + self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify + + +class GradStorage(InternalStorage): + """ + This is a basic class to simplify the handling of gradient InternalStorages + """ + + def __init__(self, + size, + dtype, + device, + destination, + parm2align, + convert_cpu=False): + if isinstance(size, np.int64): + size = size.tolist() + super().__init__(size, dtype, device, convert_cpu) + + self._max_size = size + self._release = False + + self.params_checked_in = 0 + self.destination = destination + self._parm2align = parm2align + self.sent = False + + def reset_checked_in(self): + """ Reset the counter of the parameter grads which have been checked in + """ + self.params_checked_in = 0 + self.sent = False + + @property + def all_checked_in(self): + """ Judge all the expected gradient check-in happened """ + return len(self._params) == self.params_checked_in + + def can_add_grad_view(self, param, align): + """ Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in. 
+ """ + return self._fill + param._numel() + align <= self._max_size and id( + param) not in self._param_ids + + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + if self._release: + self.rebuild() + + super().to(device, dtype) + + if keep_alignment: + self._array_grads() + + @paddle.autograd.no_grad() + def add_grad(self, param, align): + """ + Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer. + """ + + assert id( + param + ) not in self._param_ids, "The same gradients cannot be checked in twice" + + self._add_grad_as_view(param, align) + self._params.append(param) + self._param_ids.append(id(param)) + + @paddle.autograd.no_grad() + def manumal_relase(self): + """ + Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use. + """ + if not self._release: + for p in self._params: + if p.grad is not None: + p.clear_gradient(False) + + self.buffer = None + self._fill = 0 + self.params_checked_in = 0 + self._release = True + + @paddle.autograd.no_grad() + def rebuild(self): + """ + Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. + """ + + if self._release: + self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) + + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + + self._release = False + + @paddle.autograd.no_grad() + def _array_grads(self): + """ + Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. + """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + + @paddle.autograd.no_grad() + def _add_grad_as_view(self, param, align): + assert param._numel( + ) > 0, "Cannot add a gradient to a released InternalStorage, please rebuild" + assert param.dtype == self.buffer.dtype + + grad_end = self._fill + param._numel() + offset = grad_end + align + assert offset <= self.buffer._numel() + + # Copy the current grad value to InternalStorage + with device_guard(self.dev_id, self._device): + tmp_var = self.buffer._slice(self._fill, grad_end) + tmp_var.get_tensor()._set_dims(param.shape) + param._copy_gradient_from(tmp_var) + del tmp_var + + self._fill = offset diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py new file mode 100644 index 0000000000000..eae8f87b01420 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -0,0 +1,227 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import contextlib +from enum import Enum +import numpy as np +from types import MethodType + +import paddle +from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only + + +class Taskflow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, task, callback): + self.task = task + self.callback = callback + + +class Type(Enum): + """ + Type of trainable parameters + """ + fp16 = paddle.float16 + fp32 = paddle.float32 + + +class GroupShardedClipGrad: + def __init__(self, clip, device, group): + self._clip = clip + self._device = device + self._group = group + + @paddle.autograd.no_grad() + def _dygraph_clip(self, params_grads): + sum_square_fp32, sum_square_fp16 = [], [] + unslice_params_fp32, unslice_params_fp16 = [], [] + + for p, g in params_grads: + p_slice = True # using for slice parameter in sharding stage3 + if g is None or getattr(p, 'need_clip', True) is False: + continue + if hasattr(p, "unslice"): + p_slice = False + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + if p_slice: sum_square_fp16.append(sum_square) + else: unslice_params_fp16.append(sum_square) + elif p.dtype == paddle.float32: + if p_slice: sum_square_fp32.append(sum_square) + else: unslice_params_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP16 params_and_grads for unslice parameters + if len(unslice_params_fp16) == 0: + global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_unslice_fp16 = layers.concat(unslice_params_fp16) + global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16) + global_unslice_fp16 = paddle.cast( + global_unslice_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + # global norm of non-distributed FP32 params_and_grads for unslice parameters + global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( + unslice_params_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) + global_unslice_var = global_unslice_fp16 + global_unslice_fp32 + + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + if paddle.device.get_device() == "cpu": + global_norm_var = global_norm_var.cuda(dev_id) + + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + 
clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if getattr(p, 'need_clip', True) is False or g is None: + continue + origin_state = g.stop_gradient + g.stop_gradient = True + if p.dtype == paddle.float16: + g.scale_(clip_var_fp16.item()) + else: + g.scale_(clip_var.item()) + g.stop_gradient = origin_state + # p._reset_grad_inplace_version(True) + + return params_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + +@contextlib.contextmanager +def device_guard(dev_id=0, device="cpu"): + origin_device = paddle.device.get_device() + if device == "cpu": + paddle.set_device(device) + elif device == "gpu": + paddle.set_device("gpu:{}".format(dev_id)) + try: + yield + finally: + paddle.set_device(origin_device) + + +@dygraph_only +def GroupShardedScaler(scaler): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True + + if getattr(optimizer._optim, '_param_groups', None) and isinstance( + optimizer._optim._param_groups[0], dict): + + for group in optimizer._optim._param_groups: + for param in group['params']: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + else: + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + + device = "cpu" if optimizer.offload else "gpu" + dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[ + 1]) + + with device_guard(dev_id, device): + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=optimizer._group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 6fd4caa7b4a5c..4c22028b2304c 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -20,11 +20,20 @@ from paddle.optimizer import Optimizer from paddle.distributed.utils import get_logger +from paddle.fluid.framework import in_dygraph_mode + +# Old version from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from 
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler +# New version +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + logger_ = get_logger(logging.INFO) @@ -110,30 +119,56 @@ def check_dtype(param): logger_.info("*" * 30) logger_.info("Sharded level os uses sharded level os_g achieved now.") logger_.info("*" * 30) - optimizer = ShardingOptimizerStage2( - params=model.parameters(), - optim=optimizer, - group=group, - offload=offload) - model = ShardingStage2( - model, - optimizer, - group=group, - sync_buffers=sync_buffers, - buffer_max_size=buffer_max_size) + if in_dygraph_mode(): + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, + optim=optimizer, + group=group, + offload=offload) + model = GroupShardedStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size) + else: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size) elif level == 'p_g_os': - model = ShardingStage3( - model, - optimizer=optimizer, - group=group, - sync_buffers=sync_buffers, - segment_size=segment_size, - offload=offload, - sync_comm=sync_comm) + if in_dygraph_mode(): + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm) + else: + model = ShardingStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm) else: raise ValueError("Please enter the correct level.") if params_fp16 and isinstance(scaler, paddle.amp.GradScaler): - scaler = ShardingScaler(scaler) + if in_dygraph_mode(): + scaler = GroupShardedScaler(scaler) + else: + scaler = ShardingScaler(scaler) logger_.info("*" * 30) logger_.info( "If there is a communication hang using group sharded, please check whether the communication operations of each process are unified." 
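Editor's note: for context, the eager-mode test added later in this patch drives the dispatch above roughly as follows. This is a condensed usage sketch, not a drop-in script: it assumes a multi-GPU launch via `paddle.distributed.launch`, and the `Linear` model, batch data and checkpoint path are placeholders:

    import paddle
    from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model

    def main():
        paddle.distributed.init_parallel_env()

        model = paddle.nn.Linear(1000, 10)  # placeholder network
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        optimizer = paddle.optimizer.Momentum(
            parameters=model.parameters(), learning_rate=0.001,
            grad_clip=clip, multi_precision=True)
        model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
        scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

        # level is 'os', 'os_g' or 'p_g_os'; in eager mode this returns the
        # GroupSharded* wrappers, otherwise the older Sharding* implementations.
        model, optimizer, scaler = group_sharded_parallel(
            model=model, optimizer=optimizer, level='p_g_os', scaler=scaler)

        img = paddle.rand([4, 1000])
        label = paddle.ones([4, 1], dtype='int64')
        with paddle.amp.auto_cast(True, level='O2'):
            loss = paddle.nn.functional.cross_entropy(input=model(img), label=label)
        avg_loss = paddle.mean(loss.cast('float32'))
        scaler.scale(avg_loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.clear_grad()

        save_group_sharded_model(model, output="./sharded_ckpt", optimizer=optimizer)

    if __name__ == '__main__':
        main()
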
@@ -195,9 +230,9 @@ def save_group_sharded_model(model, output, optimizer=None): ), "Saving directory ({}) should be a directory, not a file".format(output) os.makedirs(output, exist_ok=True) output_model = os.path.join(output, "model.pdmodel") - if isinstance(model, ShardingStage2): + if isinstance(model, (ShardingStage2, GroupShardedStage2)): paddle.save(model._layer.state_dict(), output_model) - elif isinstance(model, ShardingStage3): + elif isinstance(model, (ShardingStage3, GroupShardedStage3)): convert2cpu = True if model._offload else False model.get_all_parameters(convert2cpu=convert2cpu) paddle.save(model._layer.state_dict(), output_model) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9bf245ff388b4..b2441e90fc9fb 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -819,6 +819,10 @@ def _slice(self, begin_idx, end_idx): def _numel(self): return self.get_tensor()._numel() + @framework.dygraph_only + def _clear_data(self): + self.get_tensor()._clear() + @framework.dygraph_only def _uva(self, device_id=0): ''' @@ -934,6 +938,7 @@ def to_sparse_coo(self, sparse_dim): setattr(core.eager.Tensor, "_slice", _slice) setattr(core.eager.Tensor, "_numel", _numel) setattr(core.eager.Tensor, "_uva", _uva) + setattr(core.eager.Tensor, "_clear_data", _clear_data) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d0126013dcf82..d947784e518c8 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1142,7 +1142,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index d4832782c329a..574a222ba18c9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -22,6 +22,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model epoch = 10 @@ -144,4 +145,6 @@ def test_sharding_api(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py new file mode 100644 index 0000000000000..85a5446cb6447 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py @@ -0,0 +1,147 @@ +# 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import shutil +import tempfile +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard +from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +batch_size = 100 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + "params": list(model.parameters()) + }] if opt_group else list(model.parameters()), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, shard_level, use_pure_fp16, output_dir): + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + + model, optimizer, scaler = group_sharded_parallel( + model=model, optimizer=optimizer, level=shard_level, scaler=scaler) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + + optimizer.clear_grad() + + save_group_sharded_model(model, output=output_dir, optimizer=optimizer) + return model.parameters() + + +def test_sharding_api(): + paddle.distributed.init_parallel_env() + 
mlp, mlp1, mlp2 = MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + + output_dir = tempfile.mkdtemp() + + # fp16 + stage2_params = train_mlp( + mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir) + stage3_params = train_mlp( + mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir) + + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + shutil.rmtree(output_dir) + + +if __name__ == '__main__': + with _test_eager_guard(): + test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py new file mode 100644 index 0000000000000..8c07734d513c4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py @@ -0,0 +1,229 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import numpy as np +import argparse +import tempfile +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters(), + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + batch_size=100, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + save_model=False, + test_minimize=False): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + if opt_group: + optimizer = optimizer_setting( + 
model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if sharding_stage == 2: + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group) + + model = GroupShardedStage2( + model, optimizer, group=group, buffer_max_size=2**21) + else: + model = paddle.DataParallel(model) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage2_optimizer.minimize() error ======") + return + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + if sharding_stage == 2: + model.to(device="gpu") + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if batch_size == 20: + avg_loss = avg_loss / 5 + avg_loss.backward() + + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if save_model: + return model, optimizer + return model.parameters() + + +def test_dp_stage2(): + paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + mlp1 = MLP() + mlp2 = MLP() + mlp3 = MLP() + mlp4 = MLP() + mlp5 = MLP() + mlp6 = MLP() + mlp7 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + + # DP VS stage2 + dp_params = train_mlp( + mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False) + stage2_params = train_mlp( + mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False) + for i in range(len(dp_params)): + np.testing.assert_allclose( + dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + + # stage2 accumulate grad + stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True) + stage2_accumulate_grad = train_mlp( + mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True) + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage2_accumulate_grad[i].numpy(), + rtol=1e-5, + atol=1e-5) + + # stage2 param list VS param group + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True) + for i in range(len(dp_params)): + np.testing.assert_allclose( + dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + + # save/load model + output_dir = tempfile.mkdtemp() + model_file = os.path.join(output_dir, "model.pdmodel") + optimizer_file = os.path.join(output_dir, "model.pdopt") + model_stage2, optimizer_stage2 = train_mlp( + mlp6, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False, + save_model=True) + paddle.save(model_stage2.state_dict(), model_file) + paddle.save(optimizer_stage2.state_dict(), optimizer_file) + m_state_dict = paddle.load(model_file) + opt_state_dict = paddle.load(optimizer_file) + model_stage2.set_state_dict(m_state_dict) + 
optimizer_stage2.set_state_dict(opt_state_dict) + shutil.rmtree(output_dir) + + # check optimizer.minimize() error + train_mlp(mlp7, sharding_stage=2, test_minimize=True) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py new file mode 100644 index 0000000000000..b09314ae9e31c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py @@ -0,0 +1,112 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +from dygraph_group_sharded_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = GroupShardedScaler(scaler) + + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, offload=offload) + model = GroupShardedStage2(model, optimizer, buffer_max_size=2**21) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + 
paddle.distributed.init_parallel_env() + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[i].numpy(), + rtol=5e-3, + atol=5e-3) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py new file mode 100644 index 0000000000000..6c350e63f444c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py @@ -0,0 +1,283 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + "params": list(model.parameters()) + }] if opt_group else list(model.parameters()), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + batch_size=100, + opt_group=False, + sync_comm=False, + test_minimize=False, + save_model=False): + group = paddle.distributed.new_group([0, 
1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = GroupShardedScaler(scaler) + if sharding_stage == 2: + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group) + model = GroupShardedStage2( + model, optimizer, group=group, buffer_max_size=2**21) + elif sharding_stage == 3: + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + sync_comm=sync_comm, + segment_size=2**15) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage3_optimizer.minimize() error ======") + return + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if batch_size == 20: + avg_loss = avg_loss / 5 + + if not use_pure_fp16: + avg_loss.backward() + else: + scaler.scale(avg_loss).backward() + + if not accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + + if save_model: + return model, optimizer + return model.parameters() + + +def test_stage2_stage3(): + paddle.distributed.init_parallel_env() + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9, mlp10 = MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + mlp9.set_state_dict(state_dict) + mlp10.set_state_dict(state_dict) + + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False) + + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-6, + atol=1e-6) + + # fp32 accumulate grad + stage3_params = train_mlp( + mlp3, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params_add = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + batch_size=20, + opt_group=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_add[i].numpy(), + 
rtol=1e-6, + atol=1e-4) + + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + + # fp16 sync_comm + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + sync_comm=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6) + + # save/load model + output_dir = tempfile.mkdtemp() + model_file = os.path.join(output_dir, "model.pdmodel") + optimizer_file = os.path.join(output_dir, "model.pdopt") + model_stage3, optimizer_stage3 = train_mlp( + mlp9, + sharding_stage=3, + use_pure_fp16=False, + opt_group=False, + save_model=True) + paddle.save(model_stage3.state_dict(), model_file) + paddle.save(optimizer_stage3.state_dict(), optimizer_file) + m_state_dict = paddle.load(model_file) + opt_state_dict = paddle.load(optimizer_file) + model_stage3.set_state_dict(m_state_dict) + optimizer_stage3.set_state_dict(opt_state_dict) + shutil.rmtree(output_dir) + + # check optimizer.minimize() error + train_mlp( + mlp10, + sharding_stage=3, + use_pure_fp16=False, + opt_group=False, + test_minimize=True) + + +if __name__ == '__main__': + with _test_eager_guard(): + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py new file mode 100644 index 0000000000000..5f9ec5c6e708e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py @@ -0,0 +1,205 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
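Editor's note: the offload variants covered by these tests keep an fp32 copy of each sharded parameter on the host and materialize an fp16 working copy on the device only when it is needed (see `_device2cpu` / `_cpu2device` earlier in this patch). A schematic, framework-free sketch of that precision round trip — the array size, learning rate, and gradient value are arbitrary — looks like this:

    import numpy as np

    master_fp32 = np.random.rand(8).astype(np.float32)  # host-side master copy

    working_fp16 = master_fp32.astype(np.float16)        # moved to device, cast down
    grad_fp16 = np.full_like(working_fp16, 0.01)         # gradient produced in fp16

    # The optimizer update is applied to the fp32 master copy, which is why the
    # offload/no-offload comparisons in these tests tolerate small fp16-level drift.
    master_fp32 -= 0.1 * grad_fp16.astype(np.float32)
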
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + use_pure_fp16=False, + accumulate_grad=False, + offload=False, + batch_size=100, + convert2cpu=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = GroupShardedScaler(scaler) + + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + offload=offload, + segment_size=2**15) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if accumulate_grad: + avg_loss = avg_loss / 5 + + if not use_pure_fp16: + avg_loss.backward() + else: + scaler.scale(avg_loss).backward() + + if not accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + optimizer.step() + else: + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if not convert2cpu: + model.get_all_parameters() + else: + model.get_all_parameters(convert2cpu) + return model.parameters() + + +def test_stage3_offload(): + paddle.distributed.init_parallel_env() + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6 = MLP(), MLP(), MLP(), MLP(), MLP( + ), MLP(), MLP() + 
state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + + # fp32 offload + stage3_params = train_mlp(mlp1, use_pure_fp16=False) + stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-6, + atol=1e-8) + + # fp16 offload + stage3_params = train_mlp(mlp3, use_pure_fp16=True) + stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-2, + atol=1e-2) + + # fp32 accumulate grad offload + stage3_params = train_mlp( + mlp5, use_pure_fp16=False, batch_size=20, accumulate_grad=True) + stage3_params_offload = train_mlp( + mlp6, + use_pure_fp16=False, + accumulate_grad=True, + offload=True, + batch_size=20, + convert2cpu=True) + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_offload[i].numpy(), + rtol=1e-6, + atol=1e-8) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 705831d50f171..0ed9b681fdcf5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.utils.internal_storage import GradStorage from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 @@ -138,4 +139,6 @@ def train_mlp(): if __name__ == '__main__': + with _test_eager_guard(): + pass train_mlp() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index fb01fd46c0d28..82edd1c17a541 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -222,4 +223,6 @@ def test_dp_stage2(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index 39ba44815d940..a7b16bbb75977 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn 
+from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -106,4 +107,6 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 82821cd7ee644..cdb1de020f56e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -274,4 +275,6 @@ def test_stage2_stage3(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index df7ba78d345a3..2cb327a29a3da 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler @@ -196,4 +197,6 @@ def test_stage3_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index 7c296c7e40e98..e664face0483a 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -25,6 +25,7 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index 669ab7d8f7f34..b7a5f9c9701c1 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -24,9 +24,11 @@ class TestDygraphShardingStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage2(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage2.py') self.run_mnist_2gpu('dygraph_sharding_stage2.py') def test_dygraph_sharding_stage2_offload(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage2_offload.py') 
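# --- Editor's note: illustrative sketch only, not part of the patch above or
# below. The launcher change here registers the new `dygraph_group_sharded_*`
# scripts alongside the legacy `dygraph_sharding_*` ones, and the per-script
# diffs in this commit wire in eager-mode coverage via `_test_eager_guard`.
# Two patterns recur across these files; a minimal sketch of both, assuming a
# hypothetical test entry point named `run_case()`:
#
#     from paddle.fluid.framework import _test_eager_guard
#
#     # Pattern A (new group_sharded scripts): run the case under the guard,
#     # so the whole test body executes in eager mode.
#     if __name__ == '__main__':
#         with _test_eager_guard():
#             run_case()
#
#     # Pattern B (legacy scripts): enter and leave the guard with `pass`,
#     # then run the case as before in legacy dygraph mode.
#     if __name__ == '__main__':
#         with _test_eager_guard():
#             pass
#         run_case()
# --- end editor's note ---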
self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py index c7da5d1e941b4..f69b52cae528a 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -24,9 +24,11 @@ class TestDygraphShardingStage3(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage3(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') self.run_mnist_2gpu('dygraph_sharding_stage3.py') def test_dygraph_sharding_stage3_offload(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py') diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index ae29c6c262a84..7e78b223b3f6a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -677,7 +677,7 @@ def test_share_buffer_to(self): tensor2 = None tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace()) - tensor3 = core.eager.Tensor() + tensor3 = core.eager.Tensor(value=tensor, place=core.CPUPlace()) if core.is_compiled_with_cuda(): tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0)) From b1adde3dc1d74208a8bd2484adb700dc18ec1b8c Mon Sep 17 00:00:00 2001 From: helen88 Date: Wed, 13 Apr 2022 11:23:45 +0800 Subject: [PATCH 115/211] use bilstm_train for rnn forward, * test=kunlun (#41671) --- paddle/fluid/operators/rnn_op_xpu.cc | 117 ++++++++++----------------- 1 file changed, 43 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index c75c24ab0abc2..a18d0ebfca946 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -51,41 +51,6 @@ void reset_parameter_vector(const std::vector& raw_params_vec, } } -template -void RunLSTMLayer(const framework::ExecutionContext& ctx, int seq_len, - int batch_size, int xdim, int hidden_size, const T* x, T* y, - const T* init_h, const T* init_c, T* last_h, T* last_c, - int state_offset, const std::vector& seq_len_tensor, - const std::vector& param_list, T* i_f_g_o, T* c, - bool is_bidirect, int layer_idx, int offset) { - bool is_reverse = false; - if (is_bidirect) { - layer_idx = 2 * layer_idx + offset; - if (offset > 0) { - is_reverse = true; - } - } - auto w_x = param_list[0 + offset * 4]; - auto w_h = param_list[1 + offset * 4]; - auto b_x = param_list[2 + offset * 4]; - auto b_h = param_list[3 + offset * 4]; - - auto h_0 = init_h + layer_idx * state_offset; - auto c_0 = init_c + layer_idx * state_offset; - auto last_h_ptr = last_h + layer_idx * state_offset; - auto last_c_ptr = last_c + layer_idx * state_offset; - auto& dev_ctx = ctx.template device_context(); - int r = xpu::lstm_train( - dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, - (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, - reinterpret_cast(y), reinterpret_cast(last_h_ptr), - reinterpret_cast(last_c_ptr), batch_size, xdim, hidden_size, seq_len, - seq_len_tensor, is_reverse, nullptr, nullptr, nullptr, nullptr, - reinterpret_cast(i_f_g_o), reinterpret_cast(c), - xpu::Activation_t::TANH, xpu::Activation_t::SIGMOID); - PADDLE_ENFORCE_XDNN_SUCCESS(r, 
"lstm_train"); -} - template class RnnXPUKernel : public framework::OpKernel { public: @@ -184,9 +149,9 @@ class RnnXPUKernel : public framework::OpKernel { auto y = output->data(); auto last_h_ptr = last_h->data(); auto last_c_ptr = last_c->data(); - auto i_f_g_o = reserve_data->data(); - auto c = - i_f_g_o + + auto i_f_g_o_ptr = reserve_data->data(); + auto c_ptr = + i_f_g_o_ptr + num_layers * direction_num * seq_len * batch_size * hidden_size * 4; std::vector seq_len_tensor(batch_size, seq_len); @@ -197,11 +162,12 @@ class RnnXPUKernel : public framework::OpKernel { int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; for (int i = 0; i < num_layers; i++) { + auto i_f_g_o = i_f_g_o_ptr + + i * direction_num * seq_len * batch_size * hidden_size * 4; + auto c = c_ptr + i * direction_num * seq_len * batch_size * hidden_size; + const T* cur_input_ptr = nullptr; int cur_xdim = -1; - i_f_g_o += i * direction_num * seq_len * batch_size * hidden_size * 4; - c += i * direction_num * seq_len * batch_size * hidden_size; - if (i == 0) { cur_input_ptr = x; cur_xdim = input_dim; @@ -222,41 +188,44 @@ class RnnXPUKernel : public framework::OpKernel { cur_output_ptr = internal_output_1_ptr; } + auto h_0 = init_h_ptr + direction_num * i * state_offset; + auto c_0 = init_c_ptr + direction_num * i * state_offset; + auto last_h = last_h_ptr + direction_num * i * state_offset; + auto last_c = last_c_ptr + direction_num * i * state_offset; + + auto w_x = parameter_lists[i][0]; + auto w_h = parameter_lists[i][1]; + auto b_x = parameter_lists[i][2]; + auto b_h = parameter_lists[i][3]; if (is_bidirec) { - std::vector output_vec(2); - std::vector output_ptr_vec(2); - for (int k = 0; k < 2; ++k) { - output_vec[k].Resize({seq_len, batch_size, output->dims()[2] / 2}); - output_ptr_vec[k] = output_vec[k].mutable_data(ctx.GetPlace()); - } - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - output_ptr_vec[0], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, - is_bidirec, i, 0); - - T* bw_i_f_g_o = i_f_g_o + seq_len * batch_size * hidden_size * 4; - T* bw_c = c + seq_len * batch_size * hidden_size; - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - output_ptr_vec[1], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], bw_i_f_g_o, bw_c, - is_bidirec, i, 1); - - // concat - int r = xpu::concat( - dev_ctx.x_context(), {output_ptr_vec[0], output_ptr_vec[1]}, - cur_output_ptr, {{seq_len, batch_size, hidden_size}, - {seq_len, batch_size, hidden_size}}, - 2); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat"); - xpu_wait(dev_ctx.x_context()->xpu_stream); + auto bw_x = parameter_lists[i][4]; + auto bw_h = parameter_lists[i][5]; + auto bb_x = parameter_lists[i][6]; + auto bb_h = parameter_lists[i][7]; + + int r = xpu::bilstm_train( + dev_ctx.x_context(), (const T*)cur_input_ptr, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)b_x, + (const T*)b_h, (const T*)bw_x, (const T*)bw_h, (const T*)bb_x, + (const T*)bb_h, reinterpret_cast(cur_output_ptr), + reinterpret_cast(last_h), reinterpret_cast(last_c), + batch_size, cur_xdim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c)); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bilstm_train"); } else { - RunLSTMLayer( - ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, - cur_output_ptr, 
init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, - state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, - is_bidirec, i, 0); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)cur_input_ptr, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)b_x, + (const T*)b_h, reinterpret_cast(cur_output_ptr), + reinterpret_cast(last_h), reinterpret_cast(last_c), + batch_size, cur_xdim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, reinterpret_cast(i_f_g_o), + reinterpret_cast(c), xpu::Activation_t::TANH, + xpu::Activation_t::SIGMOID); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lstm_train"); } } } From b8968390acbdc5578433ae9ce98ba256a6b40085 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 13 Apr 2022 11:36:17 +0800 Subject: [PATCH 116/211] Add yaml for deformable_conv and deformable_conv_v1 OPs (#41644) * Add yaml for deformable_conv and deformable_conv_v1 OPs * Add UT * Add to skipped_phi_api list for infrt --- paddle/phi/infermeta/backward.cc | 21 +++++++++++ paddle/phi/infermeta/backward.h | 16 +++++++++ .../fluid/tests/unittests/CMakeLists.txt | 4 +-- .../tests/unittests/test_deform_conv2d.py | 9 +++++ .../unittests/test_deformable_conv_op.py | 35 ++++++++++++++++--- .../unittests/test_deformable_conv_v1_op.py | 35 ++++++++++++++++--- python/paddle/utils/code_gen/api.yaml | 10 ++++++ python/paddle/utils/code_gen/backward.yaml | 10 ++++++ python/paddle/vision/ops.py | 10 +++++- tools/infrt/skipped_phi_api.json | 2 +- 10 files changed, 139 insertions(+), 13 deletions(-) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 81d3cb9ddf0f4..efbf02e331433 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -169,6 +169,27 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, logits_grad->set_dtype(softmax.dtype()); } +void DeformableConvGradInferMeta(const MetaTensor& x, + const MetaTensor& offset, + const MetaTensor& filter, + paddle::optional mask, + const MetaTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + MetaTensor* dx, + MetaTensor* offset_grad, + MetaTensor* filter_grad, + MetaTensor* mask_grad) { + GeneralTernaryGradInferMeta(x, offset, filter, dx, offset_grad, filter_grad); + if (mask) { + UnchangedInferMeta(mask.get(), mask_grad); + } +} + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 058ff7541cd8b..6e730c83d1d50 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -79,6 +79,22 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, MetaTensor* logits_grad, MetaConfig config = MetaConfig()); +void DeformableConvGradInferMeta(const MetaTensor& x, + const MetaTensor& offset, + const MetaTensor& filter, + paddle::optional mask, + const MetaTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + MetaTensor* dx, + MetaTensor* offset_grad, + MetaTensor* filter_grad, + MetaTensor* mask_grad); + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 
d947784e518c8..f7f88ab76f227 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -968,7 +968,7 @@ set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120) set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) @@ -1045,7 +1045,7 @@ set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_executor_transformer_auto_growth PROPERTIES TIMEOUT 120) set_tests_properties(test_py_reader_using_executor PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py index 508fc1705218a..f5f1479d07d2f 100644 --- a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py @@ -17,6 +17,7 @@ import paddle.nn.initializer as I import numpy as np import unittest +from paddle.fluid.framework import _test_eager_guard from unittest import TestCase @@ -183,6 +184,10 @@ def test_identity(self): self.place = paddle.CUDAPlace(0) self._test_identity() + def test_identity_with_eager_guard(self): + with _test_eager_guard(): + self.test_identity() + class TestDeformConv2DFunctional(TestCase): batch_size = 4 @@ -418,6 +423,10 @@ def test_identity(self): self.place = paddle.CUDAPlace(0) self._test_identity() + def test_identity_with_eager_guard(self): + with _test_eager_guard(): + self.test_identity() + # testcases for DeformConv2D class TestDeformConv2DWithPadding(TestDeformConv2D): diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index 45a23231945ec..5fc849575b659 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -14,13 +14,15 @@ from __future__ import print_function +import paddle import unittest import numpy as np - -import paddle import paddle.fluid.core as core import paddle.fluid as fluid from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard + +paddle.enable_static() def dmc_bilinear(data_im, height, width, h, w): @@ -108,8 +110,24 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): return out +def deform_conv2d_wrapper(x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1): + return paddle.vision.ops.deform_conv2d(x, offset, weight, None, stride, + padding, dilation, deformable_groups, + groups, mask) + + class TestModulatedDeformableConvOp(OpTest): def 
setUp(self): + self.python_api = deform_conv2d_wrapper self.op_type = "deformable_conv" self.init_type() self.init_group() @@ -148,13 +166,14 @@ def setUp(self): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( {'Input', 'Offset', 'Mask', 'Filter'}, 'Output', - max_relative_error=0.05) + max_relative_error=0.05, + check_eager=True) def init_test_case(self): self.pad = [1, 1] @@ -327,6 +346,10 @@ def test_invalid_filter(): self.assertRaises(ValueError, test_invalid_filter) + def test_error_with_eager_guard(self): + with _test_eager_guard(): + self.test_error() + class TestDeformConv2DAPI(unittest.TestCase): def test_api(self): @@ -358,6 +381,10 @@ def test_deform_conv2d_v2(): test_deform_conv2d_v2() + def test_api_with_eager_guard(self): + with _test_eager_guard(): + self.test_api() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index e8b18d601afae..304a151c4d3bf 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -14,12 +14,13 @@ from __future__ import print_function +import paddle import unittest import numpy as np - -import paddle.fluid.core as core import paddle.fluid as fluid +import paddle.fluid.core as core from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard def dmc_bilinear(data_im, height, width, h, w): @@ -105,8 +106,24 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): return out +def deform_conv2d_wrapper(x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1): + return paddle.vision.ops.deform_conv2d(x, offset, weight, None, stride, + padding, dilation, deformable_groups, + groups, mask) + + class TestModulatedDeformableConvOp(OpTest): def setUp(self): + self.python_api = deform_conv2d_wrapper self.op_type = "deformable_conv_v1" self.init_type() self.init_group() @@ -142,18 +159,22 @@ def setUp(self): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( - ['Input', 'Offset', 'Filter'], 'Output', max_relative_error=0.05) + ['Input', 'Offset', 'Filter'], + 'Output', + max_relative_error=0.05, + check_eager=True) def test_check_grad_no_filter(self): self.check_grad( ['Input', 'Offset'], 'Output', max_relative_error=0.1, - no_grad_set=set(['Filter'])) + no_grad_set=set(['Filter']), + check_eager=True) def init_test_case(self): self.pad = [1, 1] @@ -292,6 +313,10 @@ def test_invalid_offset(): self.assertRaises(TypeError, test_invalid_offset) + def test_error_with_eager_guard(self): + with _test_eager_guard(): + self.test_error() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 08028ba17185c..6387525fa26f1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -451,6 +451,16 @@ func : cumsum backward : cumsum_grad +- api : deformable_conv + args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) + output : Tensor(out) + infer_meta : + func : 
DeformableConvInferMeta + kernel : + func : deformable_conv + optional : mask + backward : deformable_conv_grad + - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index f8366744bdbe6..d243b4d160d57 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -339,6 +339,16 @@ output : Tensor(x_grad) invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) +- backward_api : deformable_conv_grad + forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) + args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, Tensor out_grad, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) + output : Tensor(x_grad), Tensor(offset_grad), Tensor(filter_grad), Tensor(mask_grad) + infer_meta : + func : DeformableConvGradInferMeta + kernel : + func : deformable_conv_grad + optional : mask + - backward_api : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 2ed01d42cfb8c..8fa51df9ac10d 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -558,7 +558,15 @@ def deform_conv2d(x, use_deform_conv2d_v1 = True if mask is None else False - if _non_static_mode(): + if in_dygraph_mode(): + pre_bias = _C_ops.final_state_deformable_conv( + x, offset, weight, mask, stride, padding, dilation, + deformable_groups, groups, 1) + if bias is not None: + out = nn.elementwise_add(pre_bias, bias, axis=1) + else: + out = pre_bias + elif _in_legacy_dygraph(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'deformable_groups', deformable_groups, 'groups', groups, 'im2col_step', 1) diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index b352240c6dcc5..2502e248c5c48 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], +"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"], "phi_kernels":["equal_all"] } From d95280c70d634a88527a9bcad6c69f36d82f5125 Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Wed, 13 Apr 2022 11:40:25 +0800 Subject: [PATCH 117/211] optimize hbm (#41623) * optimize hbm * format * format --- paddle/fluid/framework/fleet/heter_context.h | 26 ++- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 197 +++++++++++------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 5 + 3 files changed, 154 insertions(+), 74 deletions(-) mode change 100755 => 100644 
paddle/fluid/framework/fleet/heter_context.h mode change 100755 => 100644 paddle/fluid/framework/fleet/ps_gpu_wrapper.cc diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h old mode 100755 new mode 100644 index 8e51f0e2405bf..6d3a4c5d9c0b9 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS +#include #include #include #include @@ -38,7 +39,7 @@ namespace framework { class HeterContext { public: - ~HeterContext() { + virtual ~HeterContext() { if (!multi_mf_dim_) { for (size_t i = 0; i < mutex_.size(); ++i) { delete mutex_[i]; @@ -56,9 +57,12 @@ class HeterContext { Scope* scope_{nullptr}; std::vector> feature_keys_; std::vector>> feature_dim_keys_; + std::vector>> device_task_keys_; #ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; + std::vector>> + device_task_ptr_; std::vector>> value_dim_ptr_; std::vector>> @@ -68,6 +72,8 @@ class HeterContext { std::vector> value_ptr_; std::vector>> value_dim_ptr_; + std::vector>> + device_task_ptr_; std::vector>> device_dim_ptr_; #endif @@ -93,6 +99,12 @@ class HeterContext { shard_num_ = shard_num; feature_keys_.resize(shard_num_); value_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } device_values_.resize(device_num); device_keys_.resize(device_num); @@ -108,6 +120,12 @@ class HeterContext { feature_dim_keys_.resize(shard_num_); value_ptr_.resize(shard_num_); value_dim_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } for (size_t i = 0; i < feature_dim_keys_.size(); i++) { feature_dim_keys_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num); @@ -151,6 +169,12 @@ class HeterContext { for (size_t i = 0; i < device_keys_.size(); ++i) { device_keys_[i].clear(); } + for (size_t i = 0; i < device_task_ptr_.size(); ++i) { + for (size_t j = 0; j < device_task_ptr_[i].size(); ++j) { + device_task_ptr_[i][j].clear(); + device_task_keys_[i][j].clear(); + } + } } else { VLOG(3) << "Reset gpu task with dynamic mf dimention"; for (size_t i = 0; i < feature_dim_keys_.size(); i++) { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc old mode 100755 new mode 100644 index e167a39caa526..115ec4d0102cc --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -298,6 +298,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; + std::vector> task_futures; int device_num = heter_devices_.size(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; @@ -316,7 +317,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_ptr[dev].resize(multi_mf_dim_); } } - auto& device_mutex = gpu_task->mutex_; + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); #ifdef PADDLE_WITH_PSLIB @@ -502,6 +503,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { table_id_, pass_id, pass_values); } #endif + auto& device_task_keys = 
gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, &local_dim_ptr, &device_dim_keys, &device_dim_ptr, @@ -534,17 +537,14 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, - &local_ptr, &device_keys, &device_vals, - &device_mutex](int i) { - std::vector> task_keys(device_num); + &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; #ifdef PADDLE_WITH_PSLIB - std::vector> task_ptrs( - device_num); + auto& task_ptrs = device_task_ptrs[i]; #endif #ifdef PADDLE_WITH_PSCORE - std::vector> task_ptrs( - device_num); + auto& task_ptrs = device_task_ptrs[i]; #endif for (size_t j = 0; j < local_keys[i].size(); j++) { @@ -569,88 +569,139 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif - for (int dev = 0; dev < device_num; dev++) { - device_mutex[dev]->lock(); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; + } + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; + } + auto calc_prefix_func = [this, &prefix_sum, &device_keys, &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, &prefix_sum, &device_keys, + &device_vals, &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { + auto& task_keys = device_task_keys[shard_id]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[dev]; +#endif - int len = task_keys[dev].size(); - int cur = device_keys[dev].size(); - device_keys[dev].resize(device_keys[dev].size() + len); - device_vals[dev].resize(device_vals[dev].size() + len); + int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + int cur = prefix_sum[dev][shard_id]; #ifdef PADDLE_WITH_PSLIB - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - - val.delta_score = ptr_val[1]; - val.show = ptr_val[2]; - val.clk = ptr_val[3]; - val.slot = ptr_val[6]; - val.lr = ptr_val[4]; - val.lr_g2sum = ptr_val[5]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } + for (int j = 0; j < len; ++j) { + 
device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; } } + } #endif #ifdef PADDLE_WITH_PSCORE - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - val.delta_score = ptr_val[2]; - val.show = ptr_val[3]; - val.clk = ptr_val[4]; - val.slot = ptr_val[0]; - val.lr = ptr_val[5]; - val.lr_g2sum = ptr_val[6]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + val.delta_score = ptr_val[2]; + val.show = ptr_val[3]; + val.clk = ptr_val[4]; + val.slot = ptr_val[0]; + val.lr = ptr_val[5]; + val.lr_g2sum = ptr_val[6]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; } } + } #endif - VLOG(3) << "GpuPs build hbmps done"; + VLOG(3) << "GpuPs build hbmps done"; - device_mutex[dev]->unlock(); - } }; - if (!multi_mf_dim_) { - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); - } - } else { + if (multi_mf_dim_) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { threads[i * multi_mf_dim_ + j] = std::thread(build_dynamic_mf_func, i, j); } } - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); + } + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); } timeline.Pause(); VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() @@ -750,7 +801,7 @@ void PSGPUWrapper::pre_build_thread() { PreBuildTask(gpu_task); timer.Pause(); VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() - << "s"; + << " s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9b7d6de082d1c..9551e49b6b77b 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -83,6 +83,10 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; + hbm_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; 
i < hbm_thread_pool_.size(); i++) { + hbm_thread_pool_[i].reset(new ::ThreadPool(1)); + } } void PullSparse(const paddle::platform::Place& place, const int table_id, @@ -399,6 +403,7 @@ class PSGPUWrapper { std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; bool running_ = false; + std::vector> hbm_thread_pool_; protected: static bool is_initialized_; From 5f2c5b9e06ce1c531d59155b35b91a1ea5bd8764 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Wed, 13 Apr 2022 11:42:31 +0800 Subject: [PATCH 118/211] fix moe apis (#41650) --- .../paddle/incubate/distributed/models/moe/gate/gshard_gate.py | 2 +- python/paddle/incubate/distributed/models/moe/utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py index b1c0cd4214dbb..3ab3cf6901402 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py @@ -62,6 +62,6 @@ def forward(self, x): if self.random_routing: rand_routing_prob = paddle.rand( shape=[gate_score.shape[0]], dtype="float32") - topk_idx = paddle.distributed.utils.random_routing( + topk_idx = paddle.distributed.models.moe.utils._random_routing( topk_idx, topk_val, rand_routing_prob) return topk_val, topk_idx diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index 99e31a16273bf..0e87fe3e31360 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
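# --- Editor's note: illustrative sketch only, not part of the patch. The
# hunk below replaces the wildcard import with the specific private helpers
# this module actually wraps (and adds `import paddle`), while the
# gshard_gate.py change above switches random routing to the helper's real
# module path. A minimal sketch of that routing call, assuming `gate_score`,
# `topk_val` and `topk_idx` are shaped as in GShardGate.forward:
#
#     import paddle
#     from paddle.distributed.models.moe import utils as moe_utils
#
#     rand_routing_prob = paddle.rand(
#         shape=[gate_score.shape[0]], dtype="float32")
#     topk_idx = moe_utils._random_routing(
#         topk_idx, topk_val, rand_routing_prob)
# --- end editor's note ---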
-from paddle.distributed.models.moe.utils import * +from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos +import paddle def _alltoall(in_tensor_list, group=None, use_calc_stream=True): From b2390438b2c70fa13897e0edb263512e89bd3ccf Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 13 Apr 2022 11:56:54 +0800 Subject: [PATCH 119/211] Fix problem of infermeta with vector output (#41646) * remove stack_grad infershape * fix bug of output with null * fix bug --- paddle/fluid/framework/infershape_utils.cc | 16 +++++-- .../new_executor/new_executor_defs.cc | 17 +++++--- .../new_executor/new_executor_defs.h | 3 +- paddle/fluid/framework/op_desc.cc | 19 +++++--- paddle/fluid/framework/operator.cc | 16 ++++--- paddle/fluid/framework/shape_inference.h | 3 +- paddle/fluid/imperative/infer_shape_context.h | 20 ++++++--- paddle/fluid/operators/stack_op.cc | 43 ++----------------- paddle/phi/infermeta/backward.cc | 6 ++- 9 files changed, 76 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index ecc5fbdcf945d..17acbde2a09e7 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -597,7 +597,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } for (auto& out_name : output_names) { - if (ctx->HasOutputs(out_name)) { + if (ctx->HasOutputs(out_name, true)) { auto output_var = ctx->GetOutputVarPtrs(out_name); if (output_var.size() == 1) { infer_meta_context.EmplaceBackOutput(std::make_shared( @@ -606,8 +606,18 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, paddle::SmallVector> outputs; outputs.reserve(output_var.size()); for (const auto& out : output_var) { - outputs.emplace_back( - std::make_shared(out, ctx->IsRuntime())); + if (ctx->IsRuntime()) { + if (BOOST_GET_CONST(Variable*, out)) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + continue; + } + } else if (BOOST_GET_CONST(VarDesc*, out)) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + continue; + } + outputs.emplace_back(nullptr); } infer_meta_context.EmplaceBackOutputs(std::move(outputs)); } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index ccdd9dc9d50ce..089e68fe48c52 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -93,19 +93,24 @@ bool InterpretercoreInferShapeContext::HasInputs( return true; } -bool InterpretercoreInferShapeContext::HasOutputs( - const std::string& name) const { +bool InterpretercoreInferShapeContext::HasOutputs(const std::string& name, + bool allow_null) const { const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) return true; + } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) return false; } + return true; } - return true; } AttrReader InterpretercoreInferShapeContext::Attrs() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 5704fa414bbb2..aab32cfa06d40 100644 --- 
a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -58,7 +58,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool HasInputs(const std::string& name) const override; - bool HasOutputs(const std::string& name) const override; + bool HasOutputs(const std::string& name, + bool allow_null = false) const override; AttrReader Attrs() const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f31fefcfade89..15b979086d1eb 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -39,7 +39,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasInputs(const std::string &name) const override; - bool HasOutputs(const std::string &name) const override; + bool HasOutputs(const std::string &name, + bool allow_null = false) const override; AttrReader Attrs() const override; @@ -882,7 +883,8 @@ bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { return true; } -bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name, + bool allow_null) const { if (op_.Outputs().find(name) == op_.Outputs().end()) { return false; } @@ -890,10 +892,17 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { if (output_names.empty()) { return false; } - for (auto &output : output_names) { - if (!block_.HasVarRecursive(output)) return false; + if (allow_null) { + for (auto &output : output_names) { + if (block_.HasVarRecursive(output)) return true; + } + return false; + } else { + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; } - return true; } AttrReader CompileTimeInferShapeContext::Attrs() const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e6577f662ae7b..d9704d70b45ec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -718,18 +718,24 @@ class RuntimeInferShapeContext : public InferShapeContext { return true; } - bool HasOutputs(const std::string& name) const override { + bool HasOutputs(const std::string& name, + bool allow_null = false) const override { const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) return true; + } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) return false; } + return true; } - return true; } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 31e3929362a04..6ba60590cf8f3 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -69,7 +69,8 @@ class InferShapeContext { const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0; - virtual bool HasOutputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name, + bool allow_null = false) const = 0; virtual DDim GetInputDim(const std::string &name) const = 0; virtual std::vector GetInputsDim(const std::string &name) const = 0; diff --git 
a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index f871e77fdf6e2..1e5b112ece21f 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -95,17 +95,27 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return true; } - bool HasOutputs(const std::string& name) const override { + bool HasOutputs(const std::string& name, + bool allow_null = false) const override { auto it = var_map_out_->find(name); if (it == var_map_out_->end() || it->second.empty()) { return false; } - for (auto& output : it->second) { - if (output == nullptr) { - return false; + if (allow_null) { + for (auto& output : it->second) { + if (output != nullptr) { + return true; + } } + return false; + } else { + for (auto& output : it->second) { + if (output == nullptr) { + return false; + } + } + return true; } - return true; } framework::AttrReader Attrs() const override { diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index a9fa78c4e4943..6fc80ca379f3f 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/multiary.h" namespace plat = paddle::platform; @@ -68,44 +69,6 @@ Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inp class StackOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Y")), true, - platform::errors::InvalidArgument("Input(Y@Grad) not exist.")); - - int axis = ctx->Attrs().Get("axis"); - auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y")); - int rank = dy_dim.size(); - PADDLE_ENFORCE_GE( - axis, -rank, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-rank, rank), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - PADDLE_ENFORCE_LT( - axis, rank, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-rank, rank), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - - if (axis < 0) axis += rank; - PADDLE_ENFORCE_EQ( - ctx->Outputs(framework::GradVarName("X")).size(), - static_cast(dy_dim[axis]), - platform::errors::InvalidArgument( - "Number of Outputs(X@Grad) is equal to dy dim at axis, but" - " received outputs size is:%d, dy dims is:%d.", - ctx->Outputs(framework::GradVarName("X")).size(), - static_cast(dy_dim[axis]))); - - auto vec = phi::vectorize(dy_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim( - framework::GradVarName("X"), - std::vector(dy_dim[axis], phi::make_ddim(vec))); - } }; template @@ -127,8 +90,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker { DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor, PD_INFER_META(phi::StackInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(stack_grad, StackGradInferMetaFunctor, + PD_INFER_META(phi::StackGradInferMeta)); REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ops::StackGradOpMaker, ops::StackGradOpMaker, StackInferMetaFunctor); -REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); +REGISTER_OPERATOR(stack_grad, ops::StackOpGrad, StackGradInferMetaFunctor); diff --git a/paddle/phi/infermeta/backward.cc 
b/paddle/phi/infermeta/backward.cc index efbf02e331433..84db67978fc23 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -541,8 +541,10 @@ void StackGradInferMeta(const MetaTensor& out_grad, vec.erase(vec.begin() + axis); for (size_t i = 0; i < x_grad.size(); ++i) { - x_grad[i]->set_dims(phi::make_ddim(vec)); - x_grad[i]->set_dtype(out_grad.dtype()); + if (x_grad[i]) { + x_grad[i]->set_dims(phi::make_ddim(vec)); + x_grad[i]->set_dtype(out_grad.dtype()); + } } } From fe214af2733fd7cb14c2adc6bca3251917472039 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 13 Apr 2022 12:20:58 +0800 Subject: [PATCH 120/211] [Phi] Support construct Scalar by using Non-CPU Tensosr (#41528) * support construct scalar using non-cpu tensor * fix bugs when run unittest * fix compile bugs * fix bugs when run ci * fix compile bugs * fix bugs when move copy * perfect unit test * perfect unittest * update according to comment * add target dependency --- paddle/fluid/platform/CMakeLists.txt | 6 +- paddle/phi/CMakeLists.txt | 2 +- paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/lib/api_custom_impl.cc | 30 +--- paddle/phi/api/lib/scalar.cc | 48 ++++++ paddle/phi/api/lib/tensor_copy.cc | 57 +++++++ paddle/phi/api/lib/tensor_copy.h | 25 +++ paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/common/CMakeLists.txt | 2 +- paddle/phi/common/scalar.cc | 23 ++- paddle/phi/common/scalar.h | 90 +++++------ paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/selected_rows.cc | 26 +++ paddle/phi/core/selected_rows.h | 5 +- paddle/phi/core/utils/type_registry.h | 2 +- paddle/phi/tests/api/CMakeLists.txt | 4 +- paddle/phi/tests/common/CMakeLists.txt | 6 + paddle/phi/tests/common/test_scalar.cu | 205 ++++++++++++++++++++++++ paddle/phi/tests/core/CMakeLists.txt | 2 +- 19 files changed, 446 insertions(+), 95 deletions(-) create mode 100644 paddle/phi/api/lib/scalar.cc create mode 100644 paddle/phi/api/lib/tensor_copy.cc create mode 100644 paddle/phi/api/lib/tensor_copy.h create mode 100644 paddle/phi/core/selected_rows.cc create mode 100644 paddle/phi/tests/common/test_scalar.cu diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 46059100b3802..f29546c5210d9 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -192,13 +192,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 
724b1ba556d4b..d43e327393f25 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 9cc5d620280bc..e10ae8254a79e 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -164,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_conte cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) @@ -173,3 +173,5 @@ cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) +cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) +cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index d7f148fff818b..81e7faeb87015 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -424,35 +425,8 @@ std::vector> conv2d_grad_impl( } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = - kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); - - VLOG(6) << "copy API kernel key: " << kernel_key; - VLOG(6) << "copy API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - - auto dense_x = TensorToDenseTensor(x); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); - phi::MetaTensor meta_out(kernel_out); - phi::UnchangedInferMeta(*dense_x, &meta_out); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - phi::Place, - bool, - phi::DenseTensor*); - - auto* kernel_fn = kernel.GetVariadicKernelFn(); - - (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); - + copy(x, place, blocking, &out); return out; } diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc new file mode 100644 index 0000000000000..981487df86be4 --- /dev/null +++ b/paddle/phi/api/lib/scalar.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +template <> +ScalarBase::ScalarBase(const Tensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + tensor_in.numel())); + if (tensor_in.place() == PlaceType::kGPU) { + Tensor dst_tensor; + copy(tensor_in, phi::CPUPlace(), true, &dst_tensor); + GetDataFromTensor(dst_tensor); + } else if (tensor_in.place() == PlaceType::kCPU) { + GetDataFromTensor(tensor_in); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Now, it is not supported to construct Scalar using tensor that its " + "PlaceType is (%d)", + static_cast(tensor_in.place()))); + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc new file mode 100644 index 0000000000000..57e3c28d8cb1f --- /dev/null +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace experimental { + +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); + + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto dense_x = TensorToDenseTensor(src); + + auto kernel_out = SetKernelOutput(kernel_key.backend(), dst); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h new file mode 100644 index 0000000000000..3ce45853319ec --- /dev/null +++ b/paddle/phi/api/lib/tensor_copy.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/api/include/tensor.h" + +namespace paddle { +namespace experimental { + +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 94a16da2b7720..de97e7516f619 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 9bf692703860f..aa839eab587cb 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 5cd55c1e88bed..41f1c9541823d 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,21 +14,32 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace experimental { -// NOTE(xiongkun): why we put definition here? -// test_custom_op can't include enforce.h, because enforce.h includes gflags. -// so we decouple the include dependence of enforce.h by link. -void ThrowTensorConvertError(int num) { - PADDLE_ENFORCE_EQ(num, +// The Tensor must have one dim +template <> +ScalarBase::ScalarBase(const phi::DenseTensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), 1, phi::errors::InvalidArgument( "The Scalar only supports Tensor with 1 element, but " "now Tensor has `%d` elements", - num)); + tensor_in.numel())); + auto cpu_place = phi::CPUPlace(); + if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) { + phi::DenseTensor tensor; + framework::TensorCopySync(tensor_in, cpu_place, &tensor); + GetDataFromTensor(tensor); + } else { + GetDataFromTensor(tensor_in); + } } } // namespace experimental diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 5134f4eb72639..c28f6185a556a 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { -void ThrowTensorConvertError(int); - template class ScalarBase { public: @@ -105,50 +103,7 @@ class ScalarBase { } // The Tensor must have one dim - ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT - is_from_tensor_ = true; - ThrowTensorConvertError(tensor.numel()); - switch (dtype_) { - case DataType::FLOAT32: - data_.f32 = tensor.template data()[0]; - break; - case DataType::FLOAT64: - data_.f64 = tensor.template data()[0]; - break; - case DataType::FLOAT16: - data_.f16 = tensor.template data()[0]; - break; - case DataType::BFLOAT16: - data_.bf16 = tensor.template data()[0]; - break; - case DataType::INT32: - data_.i32 = tensor.template data()[0]; - break; - case DataType::INT64: - data_.i64 = tensor.template data()[0]; - break; - case DataType::INT16: - data_.i16 = tensor.template data()[0]; - break; - case DataType::INT8: - data_.i8 = tensor.template data()[0]; - break; - case DataType::UINT8: - data_.ui8 = tensor.template data()[0]; - break; - case DataType::BOOL: - data_.b = tensor.template data()[0]; - break; - case DataType::COMPLEX64: - data_.c64 = tensor.template data()[0]; - break; - case DataType::COMPLEX128: - data_.c128 = tensor.template data()[0]; - break; - default: - PD_THROW("Invalid tensor data type `", dtype_, "`."); - } - } + ScalarBase(const T& tensor_in); // NOLINT template ScalarBase(const ScalarBase& other) { @@ -200,6 +155,49 @@ class ScalarBase { private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); + void GetDataFromTensor(const T& tensor) { + is_from_tensor_ = true; + switch (dtype_) { + case DataType::FLOAT32: + data_.f32 = tensor.template data()[0]; + break; + case DataType::FLOAT64: + data_.f64 = tensor.template data()[0]; + break; + case DataType::FLOAT16: + data_.f16 = tensor.template data()[0]; + break; + case DataType::BFLOAT16: + data_.bf16 = tensor.template data()[0]; + break; + case DataType::INT32: + data_.i32 = tensor.template data()[0]; + break; + case DataType::INT64: + data_.i64 = tensor.template data()[0]; + break; + case DataType::INT16: + data_.i16 = tensor.template data()[0]; + break; + case DataType::INT8: + data_.i8 = tensor.template data()[0]; + break; + case DataType::UINT8: + data_.ui8 = tensor.template data()[0]; + break; + case DataType::BOOL: + data_.b = tensor.template data()[0]; + break; + case DataType::COMPLEX64: + data_.c64 = tensor.template data()[0]; + break; + case DataType::COMPLEX128: + data_.c128 = tensor.template data()[0]; + break; + default: + PD_THROW("Invalid tensor data type `", dtype_, "`."); + } + } private: bool is_from_tensor_{false}; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index b42b4388c2ce1..23574e98fbf17 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc new 
file mode 100644 index 0000000000000..dcf9c4182157a --- /dev/null +++ b/paddle/phi/core/selected_rows.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +SelectedRows::SelectedRows(const std::vector& rows, + const int64_t& height) + : impl_(std::make_shared(rows, height)) {} + +SelectedRows::SelectedRows() + : impl_(std::make_shared()) {} + +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 7ee475b4d5d9e..a71c0471cc431 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,10 +42,9 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height) - : impl_(std::make_shared(rows, height)) {} + SelectedRows(const std::vector& rows, const int64_t& height); - SelectedRows() : impl_(std::make_shared()) {} + SelectedRows(); const DenseTensor& value() const { return impl_->value(); } diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index 8d9f9167242c8..f27c3db2275c3 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -51,7 +51,7 @@ TypeInfo TypeRegistry::RegisterType(const std::string& type) { std::lock_guard guard(mutex_); assert(name_to_id_.find(type) == name_to_id_.end()); assert(names_.size() < std::numeric_limits::max()); - int8_t id = names_.size(); + int8_t id = static_cast(names_.size()); names_.emplace_back(type); name_to_id_[type] = id; return TypeInfo(id); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 94378aceff58c..dd4b7e62ec52f 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS 
${COMMON_API_TEST_DEPS}) cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index 710ea3c066472..ca6d20045d171 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,3 +2,9 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) +if (WITH_GPU) + nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() +if(WITH_ROCM) + hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu new file mode 100644 index 0000000000000..6b0caa175dc04 --- /dev/null +++ b/paddle/phi/tests/common/test_scalar.cu @@ -0,0 +1,205 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); + +namespace phi { +namespace tests { + +using DDim = phi::DDim; +using float16 = phi::dtype::float16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +__global__ void FillTensor(float* data) { data[0] = 1; } + +TEST(Scalar, ConstructFromDenseTensor1) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromDenseTensor2) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor3) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor4) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = true; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(true, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor5) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX64, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex64 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor6) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX128, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex128 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor7) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromTensor) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + auto dense_x = std::make_shared( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + auto* dense_x_data = dev_ctx.Alloc(dense_x.get()); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Scalar scalar_test(x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 824d188457815..7d2fd90e6bb7b 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) From f48a37ef2400d5897ee9714c322754f5fa0da92c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 13 Apr 2022 13:38:56 +0800 Subject: [PATCH 121/211] Fix declarations header block (#41718) * test declarations block, test=document_fix * fix check error, test=document_fix * remove test code, test=document_fix --- tools/check_file_diff_approvals.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 49b84da01b9bb..ce67912eb2266 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -231,10 +231,10 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6888866 39303645 fi -HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels/declarations.h" || true` +HAS_MODIFIED_DECLARATIONS=`git diff -U0 upstream/$BRANCH |grep "^+" |grep "paddle/phi/kernels/declarations.h" || true` if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n" - check_approval 1 22561442 + echo_line="You must be approved by chenwhql or zyfncg for paddle/phi/kernels/declarations.h using. 
Thanks!\n" + check_approval 1 chenwhql zyfncg fi ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` From e53d1837f701bf69faaf54a47de3938b04b715ca Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 13 Apr 2022 13:42:09 +0800 Subject: [PATCH 122/211] Add expand equal all yaml (#41540) * add expand, poisson * add poison grad * add expand equal_all poisson triangular solve yaml --- paddle/fluid/operators/expand_v2_op.cc | 74 +++---------------- paddle/phi/infermeta/unary.cc | 72 ++++++++++++++++++ paddle/phi/infermeta/unary.h | 4 + .../kernels/impl/poisson_grad_kernel_impl.h | 4 +- paddle/phi/kernels/poisson_grad_kernel.h | 4 +- paddle/phi/ops/compat/poisson_sig.cc | 3 +- .../tests/unittests/test_compare_reduce_op.py | 12 ++- .../tests/unittests/test_expand_v2_op.py | 4 +- .../fluid/tests/unittests/test_poisson_op.py | 18 +++-- .../unittests/test_triangular_solve_op.py | 5 +- python/paddle/tensor/linalg.py | 4 + python/paddle/tensor/logic.py | 3 + python/paddle/tensor/manipulation.py | 3 + python/paddle/utils/code_gen/api.yaml | 23 +++++- python/paddle/utils/code_gen/backward.yaml | 30 ++++++++ 15 files changed, 180 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 981cd11035129..292f706cb186b 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -16,7 +16,11 @@ limitations under the License. */ #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #define MAX_RANK_SUPPORTED 6 @@ -29,70 +33,6 @@ class ExpandV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandV2"); - auto x_dims = ctx->GetInputDim("X"); - auto expand_shape = ctx->Attrs().Get>("shape"); - - if (expand_shape.size() == 0) { - expand_shape = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_GE( - expand_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "expand_v2 op must be greater than or equal to the rank " - "(%d) of the input.", - expand_shape.size(), static_cast(x_dims.size()))); - PADDLE_ENFORCE_LE(expand_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "must not be greater than %d.", - expand_shape.size(), MAX_RANK_SUPPORTED)); - PADDLE_ENFORCE_GE(expand_shape.size(), 1, - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "must be a positive integer.", - expand_shape.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), expand_shape.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - auto diff = expand_shape.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - for (size_t i = 0; i < expand_shape.size(); ++i) { - if (x_dims[i] == -1) { - out_shape[i] = -1; - } else if (expand_shape[i] == -1) { - if (static_cast(x_dims.size()) > i) { - out_shape[i] = x_dims[i]; - } else { - out_shape[i] = -1; - } - } else if 
(expand_shape[i] == -2) { - // We use -2 to represent the element in expand_shape is a var. - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - expand_shape[i], 0, - platform::errors::InvalidArgument( - "The %uth element of 'shape' for expand_v2 op must be " - "greater than 0, but the value given is %d.", - i, expand_shape[i])); - out_shape[i] = expand_shape[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -291,10 +231,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(expand_v2, ExpandInferShapeFunctor, + PD_INFER_META(phi::ExpandInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ops::ExpandV2GradOpMaker, - ops::ExpandV2GradOpMaker); + ops::ExpandV2GradOpMaker, + ExpandInferShapeFunctor); REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, ops::ExpandV2DoubleGradOpMaker, ops::ExpandV2DoubleGradOpMaker, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a47fc698777f7..7b50a37ac149f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -405,6 +405,78 @@ void EighInferMeta(const MetaTensor& x, out_v->set_dims(input_dim); } +void ExpandInferMeta(const MetaTensor& x, + const IntArray& shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + auto expand_shape = shape.GetData(); + + if (expand_shape.size() == 0) { + expand_shape = std::vector(x_dims.size(), -1); + } + + PADDLE_ENFORCE_GE( + expand_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The number of elements (%d) of 'shape' for " + "expand_v2 op must be greater than or equal to the rank " + "(%d) of the input.", + expand_shape.size(), + static_cast(x_dims.size()))); + PADDLE_ENFORCE_LE( + expand_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument("The number of elements (%d) of 'shape' for " + "must not be greater than %d.", + expand_shape.size(), + MAX_RANK_SUPPORTED)); + PADDLE_ENFORCE_GE( + expand_shape.size(), + 1, + phi::errors::InvalidArgument("The number of elements (%d) of 'shape' for " + "must be a positive integer.", + expand_shape.size())); + + auto out_rank = + std::max(static_cast(x_dims.size()), expand_shape.size()); + std::vector out_shape(out_rank); + auto x_dim_vec = phi::vectorize(x_dims); + auto diff = expand_shape.size() - x_dim_vec.size(); + x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + for (size_t i = 0; i < expand_shape.size(); ++i) { + if (x_dims[i] == -1) { + out_shape[i] = -1; + } else if (expand_shape[i] == -1) { + if (static_cast(x_dims.size()) > i) { + out_shape[i] = x_dims[i]; + } else { + out_shape[i] = -1; + } + } else if (expand_shape[i] == -2) { + // We use -2 to represent the element in expand_shape is a var. 
+ out_shape[i] = -1; + } else { + PADDLE_ENFORCE_GT( + expand_shape[i], + 0, + phi::errors::InvalidArgument( + "The %uth element of 'shape' for expand_v2 op must be " + "greater than 0, but the value given is %d.", + i, + expand_shape[i])); + out_shape[i] = expand_shape[i]; + } + } + + out->set_dims(make_ddim(out_shape)); + out->set_dtype(x.dtype()); + if (out_shape[0] == x_dims[0]) { + out->share_lod(x); + } +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index c49e4c88dd899..ac5040388b334 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -85,6 +85,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void ExpandInferMeta(const MetaTensor& x, + const IntArray& shape, + MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h index 4e82cccac3422..17b6d7516e070 100644 --- a/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h @@ -20,7 +20,9 @@ namespace phi { template -void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad) { +void PoissonGradKernel(const Context& ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad) { ctx.template Alloc(x_grad); phi::funcs::SetConstant functor; functor(ctx, x_grad, static_cast(0)); diff --git a/paddle/phi/kernels/poisson_grad_kernel.h b/paddle/phi/kernels/poisson_grad_kernel.h index 21720474f4a12..3ef60d7a51676 100644 --- a/paddle/phi/kernels/poisson_grad_kernel.h +++ b/paddle/phi/kernels/poisson_grad_kernel.h @@ -20,6 +20,8 @@ namespace phi { template -void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad); +void PoissonGradKernel(const Context& ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/ops/compat/poisson_sig.cc b/paddle/phi/ops/compat/poisson_sig.cc index cb6ae28804669..e45640c11b6ee 100644 --- a/paddle/phi/ops/compat/poisson_sig.cc +++ b/paddle/phi/ops/compat/poisson_sig.cc @@ -18,7 +18,8 @@ namespace phi { KernelSignature PoissonGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("poisson_grad", {}, {}, {GradVarName("X")}); + return KernelSignature( + "poisson_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py index 2da5b770d052c..29e3436948e98 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py @@ -28,12 +28,13 @@ def setUp(self): x = np.random.random(size=(10, 7)).astype(typename) y = np.random.random(size=(10, 7)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'not_equal_all') Cls.__name__ = cls_name @@ -46,12 +47,13 @@ def setUp(self): x = np.random.random(size=(10, 7)).astype(typename) y = np.random.random(size=(10)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type 
def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'not_shape_equal_all') Cls.__name__ = cls_name @@ -63,12 +65,13 @@ class Cls(op_test.OpTest): def setUp(self): x = y = np.random.random(size=(10, 7)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'equal_all') Cls.__name__ = cls_name @@ -82,12 +85,13 @@ def setUp(self): x = np.array([True, False, True]).astype(typename) x = np.array([False, False, True]).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'equal_all') Cls.__name__ = cls_name diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 70b3fda79b50f..fd46b41c5f07e 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -40,10 +40,10 @@ def init_data(self): self.expand_times = [1] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1): diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index f8183bb5f8db2..7dd3841fe4bcb 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import math import os +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() paddle.seed(100) @@ -96,11 +97,18 @@ def test_static(self): self.assertTrue(np.min(y_np) >= 0) def test_dygraph(self): - paddle.disable_static() - x = paddle.randn([10, 10], dtype='float32') - y = paddle.poisson(x) - self.assertTrue(np.min(y.numpy()) >= 0) - paddle.enable_static() + with paddle.fluid.dygraph.base.guard(): + x = paddle.randn([10, 10], dtype='float32') + y = paddle.poisson(x) + self.assertTrue(np.min(y.numpy()) >= 0) + + with _test_eager_guard(): + x = paddle.randn([10, 10], dtype='float32') + x.stop_gradient = False + y = paddle.poisson(x) + y.backward() + self.assertTrue(np.min(y.numpy()) >= 0) + self.assertTrue(np.array_equal(np.zeros_like(x), x.gradient())) def test_fixed_random_number(self): # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' diff --git a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py index 45e88d681d8e0..4e79e8dca138e 100644 --- a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py +++ b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py @@ -47,6 +47,7 @@ def set_output(self): def setUp(self): self.op_type = "triangular_solve" + self.python_api = paddle.tensor.linalg.triangular_solve self.config() self.inputs = { @@ -62,10 +63,10 @@ def setUp(self): self.outputs = {'Out': self.output} def test_check_output(self): - self.check_output() 
+ self.check_output(check_eager=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) # 2D(broadcast) + 3D, test 'transpose' diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index b315e3e9673fc..a00ae8046ed68 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2834,6 +2834,10 @@ def triangular_solve(x, print(out) # [7, -2, -5] """ + if in_dygraph_mode(): + return _C_ops.final_state_triangular_solve(x, y, upper, transpose, + unitriangular) + if paddle.in_dynamic_mode(): return _C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', transpose, 'unitriangular', diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 6a18e1201785a..d99b9973b485e 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -301,6 +301,9 @@ def equal_all(x, y, name=None): result2 = paddle.equal_all(x, z) print(result2) # result2 = [False ] """ + if in_dygraph_mode(): + return _C_ops.final_state_equal_all(x, y) + if paddle.in_dynamic_mode(): return _C_ops.equal_all(x, y) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 389b5dbd7dbec..3a79abd2dc06e 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2000,6 +2000,9 @@ def expand(x, shape, name=None): print(out) # [[1, 2, 3], [1, 2, 3]] """ + if in_dygraph_mode(): + return _C_ops.final_state_expand(x, shape) + if paddle.in_dynamic_mode(): return _C_ops.expand_v2(x, 'shape', shape) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6387525fa26f1..329882317ee2b 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -603,6 +603,14 @@ kernel : func : equal +- api : equal_all + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : CompareAllInferMeta + kernel : + func : equal_all + # erf - api : erf args : (Tensor x) @@ -633,6 +641,16 @@ func : exp backward : exp_grad +# expand +- api : expand + args : (Tensor x, IntArray shape) + output : Tensor + infer_meta : + func : ExpandInferMeta + kernel : + func : expand + backward : expand_grad + # expand_as - api : expand_as args : (Tensor x, Tensor y, int[] target_shape) @@ -1513,7 +1531,7 @@ func : pixel_shuffle backward : pixel_shuffle_grad -# poisson // no need grad +# poisson - api : poisson args : (Tensor x) output : Tensor @@ -1521,6 +1539,7 @@ func : UnchangedInferMeta kernel : func : poisson + backward : poisson_grad - api : pool2d args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) @@ -2066,7 +2085,7 @@ func : TriangularSolveInferMeta kernel : func : triangular_solve - # backward : triangular_solve_grad + backward : triangular_solve_grad - api : tril_triu args : (Tensor x, int diagonal, bool lower) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index d243b4d160d57..337e4f3e38c81 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -492,6 +492,16 @@ func : expand_as_grad no_need_buffer : x +- backward_api : expand_grad + forward : expand (Tensor x, IntArray shape) -> Tensor(out) + args : (Tensor x, Tensor out_grad, IntArray shape) + output : Tensor(x_grad) + infer_meta : + func : 
UnchangedInferMeta + param : [x] + kernel : + func : expand_grad + - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1159,6 +1169,16 @@ kernel : func : pixel_shuffle_grad +- backward_api : poisson_grad + forward : poisson (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : poisson_grad + - backward_api : pool2d_grad forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) @@ -1685,6 +1705,16 @@ kernel : func : transpose_grad +- backward_api : triangular_solve_grad + forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : triangular_solve_grad + - backward_api : tril_triu_grad forward : tril_triu(Tensor x, int diagonal, bool lower) -> Tensor(out) args : (Tensor out_grad, int diagonal, bool lower) From 150875529156888ff1a1c04cfea3e6763c1a51b5 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 13 Apr 2022 14:09:43 +0800 Subject: [PATCH 123/211] [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode (#41612) * [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad * Fixed elementwise issue * Addressed CI failures * [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode * Fixed minor issues --- .../auto_code_generator/eager_generator.cc | 11 +++--- .../final_state_generator/codegen_utils.py | 3 +- .../final_state_generator/eager_gen.py | 24 ++++++++----- paddle/fluid/eager/backward.cc | 21 ++++++------ .../custom_operator/custom_operator_node.h | 2 +- paddle/fluid/eager/grad_node_info.cc | 8 ++++- paddle/fluid/eager/grad_tensor_holder.cc | 1 + paddle/fluid/eager/tensor_wrapper.h | 34 +++++++++++++------ .../tensor_wrapper_test.cc | 8 ++--- paddle/fluid/eager/utils.cc | 9 +++-- paddle/fluid/eager/utils.h | 6 ++-- .../unittests/test_imperative_triple_grad.py | 7 ++-- python/paddle/utils/code_gen/backward.yaml | 2 +- 13 files changed, 81 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index de44a833f6e73..3ed17b67b842a 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2011,8 +2011,7 @@ static std::string GenerateSingleOpBase( "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" "RecoverTensorWrapper(" "&" - "this->%s, " - "nullptr)) },"; + "this->%s)) },"; ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name); @@ -2058,15 +2057,15 @@ static std::string GenerateSingleOpBase( const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = " if(this->%s.size() > 0) %s[\"%s\"] = " "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" - 
"RecoverTensorWrapper(&this->%s, nullptr));\n"; + "RecoverTensorWrapper(&this->%s));\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, ins_name, grad_input_name, struct_fwd_input_name); } else { const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = - " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, " - "nullptr);\n if(%s.initialized()) %s[\"%s\"] = " - "egr::EagerUtils::TrySyncToVars(%s);\n"; + " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n" + " if(%s.initialized()) %s[\"%s\"] = " + " egr::EagerUtils::TrySyncToVars(%s);\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 0081dbb595df3..ea7b4a21a2c54 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -23,7 +23,8 @@ ######################## ops_to_fill_zero_for_empty_grads = set([ "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", - "sigmoid_triple_grad, add_double_grad" + "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", + "add_triple_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index be6dda270093b..d6505ebaa1e68 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -236,7 +236,7 @@ class {} : public egr::GradNodeBase {{ {} // SetAttributes {} - // SetTensorWrappers + // Set TensorWrappers for Forward Inputs {} // SetGradOutMeta & SetEdges {} @@ -245,6 +245,8 @@ class {} : public egr::GradNodeBase {{ {} {} {} +{} + // Set TensorWrappers for Forward Outputs {} }} """ @@ -720,7 +722,8 @@ def GenerateNodeCreationCodes(self): set_attributes_str = "\n".join(set_attributes_list) # SetTensorWrappers - set_tensor_wrappers_list = [] + set_input_tensor_wrappers_list = [] + set_output_tensor_wrappers_list = [] num_fwd_outputs = len(forward_outputs_position_map.keys()) for name, (atype, is_fwd_input, pos) in backward_forward_inputs_map.items(): @@ -732,6 +735,7 @@ def GenerateNodeCreationCodes(self): set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, {need_input_data});" + set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: if num_fwd_outputs > 1: # Aligned with forward output position @@ -743,8 +747,11 @@ def GenerateNodeCreationCodes(self): set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);" - set_tensor_wrappers_list.append(set_tensor_wrappers) - set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) + set_output_tensor_wrappers_list.append(set_tensor_wrappers) + set_input_tensor_wrappers_str = "\n".join( + set_input_tensor_wrappers_list) + 
set_output_tensor_wrappers_str = "\n".join( + set_output_tensor_wrappers_list) # SetGradOutMeta & SetEdges set_grad_out_meta_list = [] @@ -801,9 +808,10 @@ def GenerateNodeCreationCodes(self): self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, - node_construction_str, set_attributes_str, set_tensor_wrappers_str, - set_grad_out_meta_str, set_edges_str, set_out_rank_str, - set_history_str, set_grad_in_meta_str, set_retain_grad_str) + node_construction_str, set_attributes_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, + set_out_rank_str, set_history_str, set_grad_in_meta_str, + set_retain_grad_str, set_output_tensor_wrappers_str) def run(self): # Basic Validation Check @@ -1296,7 +1304,7 @@ def GenerateNodeDefinition(self, grad_node_creation_str): transformed_tensor_name = self.TransformToNextGradName(name) is_optional = (name in self.optional_inputs) - tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, this->shared_from_this());" + tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" if is_optional: tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 60c5e52767a00..974acb8646ca5 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -731,16 +731,6 @@ std::vector RunBackward( continue; } - auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - const auto& input_meta = next_node->InputMeta(); - auto grad_tensor_holder = - std::make_unique(input_meta); - VLOG(6) << "Construct GradTensorHolder for grad node: " - << next_node->name(); - node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); - } - PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -760,8 +750,19 @@ std::vector RunBackward( << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); + auto* next_node = next_node_shared.get(); + if (!node_input_buffers_dict.count(next_node)) { + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); + } + VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; + node_input_buffers_dict[next_node]->add( edge_rank.first, edge_rank.second, grad_output_tensor); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index c483dc0ebd177..6db410fa0f1af 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -59,7 +59,7 @@ class RunCustomOpNode : public GradNodeBase { std::vector* fwd_var) { std::vector res; for (size_t i = 0; i < fwd_var->size(); i++) { - res.emplace_back(fwd_var->at(i).recover(nullptr)); + res.emplace_back(fwd_var->at(i).recover()); } return res; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 23c7ea7c5e9b4..6afdd854344eb 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ 
b/paddle/fluid/eager/grad_node_info.cc @@ -61,6 +61,10 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << meta->GetMutableGradNode()->name() + << " (addr: " << meta->GetMutableGradNode().get() << ")"; adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); @@ -84,7 +88,9 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { meta->SetGradNode(std::make_shared(meta)); } VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); + << this->name() << " (addr: " << this << ") " + << " to " << meta->GetMutableGradNode()->name() + << " (addr: " << meta->GetMutableGradNode().get() << ")"; adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 2dacb588ff847..183282d6f87b2 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -110,6 +110,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, "got tensor: %s is empty please check you network " "and make sure it creates grads.", t.name())); + if (t.is_dense_tensor()) { if (buffer_tensor.is_dense_tensor()) { buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor); diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 3d5d3139de14c..b5dd6b960b23a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -77,16 +77,17 @@ class TensorWrapper { intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - // If an output is marked "intermedaite", we won't create - // autograd_meta for it. 
- // In that case, simply skip OutRankInfo Copy - if (EagerUtils::nullable_autograd_meta(tensor)) { - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); + if (tensor_autograd_meta) { + auto autograd_meta = std::make_shared( + Edge(nullptr, EagerUtils::OutRankInfo(tensor))); + autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient()); + intermidiate_tensor_.set_autograd_meta(autograd_meta); + weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode(); } } - paddle::experimental::Tensor recover( - const std::shared_ptr& grad_node) { + paddle::experimental::Tensor recover() { VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name() << " for wrapper"; if (!intermidiate_tensor_.defined()) { @@ -99,9 +100,20 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; if (!full_reserved_) { - std::shared_ptr new_grad_node = grad_node; - auto p_ab_autograd_meta = - std::make_shared(Edge(new_grad_node, out_rank_info_)); + std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); + } else { + VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); + auto p_ab_autograd_meta = std::make_shared( + Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo())); + p_ab_autograd_meta->SetStopGradient( + intermediate_autograd_meta->StopGradient()); + recovered_tensor.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); @@ -149,8 +161,8 @@ class TensorWrapper { private: bool full_reserved_ = false; bool no_need_buffer_ = false; - std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + std::weak_ptr weak_grad_node_; uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index a0c75c0200137..5f563edee39f1 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -41,7 +41,7 @@ TEST(TensorWrapper, Basic) { et1.set_autograd_meta(auto_grad0); et1.set_name("et1"); auto tw0 = egr::TensorWrapper(et1, true); - auto recover_et1 = tw0.recover(std::make_shared()); + auto recover_et1 = tw0.recover(); CHECK_EQ(recover_et1.name(), std::string("et1")); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first, egr::EagerUtils::OutRankInfo(et1).first); @@ -67,7 +67,7 @@ TEST(TensorWrapper, Basic) { auto auto_grad1 = std::make_shared(edge1); et2.set_autograd_meta(auto_grad1); auto tw1 = egr::TensorWrapper(et2, false); - auto recover_et2 = tw1.recover(grad_test_node1); + auto recover_et2 = tw1.recover(); CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first, egr::EagerUtils::OutRankInfo(et2).first); @@ -76,7 +76,5 @@ TEST(TensorWrapper, Basic) { // Test Raw recover paddle::experimental::Tensor et3; auto tw2 = egr::TensorWrapper(et3, true); - CHECK( - tw2.recover(std::make_shared()).initialized() == - false); + CHECK(tw2.recover().initialized() == false); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 
bcf4a4627bb76..756563df4dfe7 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -360,16 +360,15 @@ void EagerUtils::Output2Result( } paddle::experimental::Tensor EagerUtils::RecoverTensorWrapper( - TensorWrapper* tw, const std::shared_ptr& grad_node) { - return tw->recover(grad_node); + TensorWrapper* tw) { + return tw->recover(); } std::vector EagerUtils::RecoverTensorWrapper( - std::vector* tw, - const std::shared_ptr& grad_node) { + std::vector* tw) { std::vector ret; for (auto& t : *tw) { - ret.emplace_back(t.recover(grad_node)); + ret.emplace_back(t.recover()); } return ret; } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index be534d4440561..51a322c8524ac 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -174,11 +174,9 @@ class EagerUtils { const std::shared_ptr& view_output_var); // TensorWrapper Utils - static paddle::experimental::Tensor RecoverTensorWrapper( - TensorWrapper* tw, const std::shared_ptr& grad_node); + static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw); static std::vector RecoverTensorWrapper( - std::vector* tw, - const std::shared_ptr& grad_node); + std::vector* tw); // Intermidate needed remove this once we don't need legacy // Inner Method diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index f0c5316412f1e..3a8a3a96e9a33 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -209,7 +209,9 @@ def func_example_with_gradient_and_create_graph(self): self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected)) def test_all_cases(self): - if _in_legacy_dygraph(): + self.func_exception() + self.func_example_with_gradient_and_create_graph() + with _test_eager_guard(): self.func_exception() self.func_example_with_gradient_and_create_graph() @@ -296,7 +298,8 @@ def func_example_with_gradient_and_create_graph(self): self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected)) def test_all_cases(self): - if _in_legacy_dygraph(): + self.func_example_with_gradient_and_create_graph() + with _test_eager_guard(): self.func_example_with_gradient_and_create_graph() diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 337e4f3e38c81..d0f337cb054f4 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1458,7 +1458,7 @@ func : GeneralTernaryGradInferMeta param : [out, fwd_grad_out, grad_grad_x] kernel : - func : sigmoid_double_grad + func : sigmoid_triple_grad - backward_api : silu_grad forward : silu (Tensor x) -> Tensor(out) From 404c4a6bf0bc6f422020142338583f77f7e52886 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 13 Apr 2022 14:17:06 +0800 Subject: [PATCH 124/211] Revert "[Phi] Support construct Scalar by using Non-CPU Tensosr (#41528)" (#41740) This reverts commit fe214af2733fd7cb14c2adc6bca3251917472039. 
--- paddle/fluid/platform/CMakeLists.txt | 6 +- paddle/phi/CMakeLists.txt | 2 +- paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/lib/api_custom_impl.cc | 30 +++- paddle/phi/api/lib/scalar.cc | 48 ------ paddle/phi/api/lib/tensor_copy.cc | 57 ------- paddle/phi/api/lib/tensor_copy.h | 25 --- paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/common/CMakeLists.txt | 2 +- paddle/phi/common/scalar.cc | 23 +-- paddle/phi/common/scalar.h | 90 ++++++----- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/selected_rows.cc | 26 --- paddle/phi/core/selected_rows.h | 5 +- paddle/phi/core/utils/type_registry.h | 2 +- paddle/phi/tests/api/CMakeLists.txt | 4 +- paddle/phi/tests/common/CMakeLists.txt | 6 - paddle/phi/tests/common/test_scalar.cu | 205 ------------------------ paddle/phi/tests/core/CMakeLists.txt | 2 +- 19 files changed, 95 insertions(+), 446 deletions(-) delete mode 100644 paddle/phi/api/lib/scalar.cc delete mode 100644 paddle/phi/api/lib/tensor_copy.cc delete mode 100644 paddle/phi/api/lib/tensor_copy.h delete mode 100644 paddle/phi/core/selected_rows.cc delete mode 100644 paddle/phi/tests/common/test_scalar.cu diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f29546c5210d9..46059100b3802 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -192,13 +192,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index d43e327393f25..724b1ba556d4b 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index e10ae8254a79e..9cc5d620280bc 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -164,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS 
phi_tensor_raw phi_conte cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) @@ -173,5 +173,3 @@ cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) -cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) -cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 81e7faeb87015..d7f148fff818b 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -425,8 +424,35 @@ std::vector> conv2d_grad_impl( } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); + + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - copy(x, place, blocking, &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); + return out; } diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc deleted file mode 100644 index 981487df86be4..0000000000000 --- a/paddle/phi/api/lib/scalar.cc +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/common/scalar.h" - -#include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace experimental { - -template <> -ScalarBase::ScalarBase(const Tensor& tensor_in) - : dtype_(tensor_in.dtype()) { // NOLINT - PADDLE_ENFORCE_EQ(tensor_in.numel(), - 1, - phi::errors::InvalidArgument( - "The Scalar only supports Tensor with 1 element, but " - "now Tensor has `%d` elements", - tensor_in.numel())); - if (tensor_in.place() == PlaceType::kGPU) { - Tensor dst_tensor; - copy(tensor_in, phi::CPUPlace(), true, &dst_tensor); - GetDataFromTensor(dst_tensor); - } else if (tensor_in.place() == PlaceType::kCPU) { - GetDataFromTensor(tensor_in); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Now, it is not supported to construct Scalar using tensor that its " - "PlaceType is (%d)", - static_cast(tensor_in.place()))); - } -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc deleted file mode 100644 index 57e3c28d8cb1f..0000000000000 --- a/paddle/phi/api/lib/tensor_copy.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/api/lib/api_gen_utils.h" -#include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace experimental { - -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { - auto kernel_key_set = ParseKernelKeyByInputArgs(src); - kernel_key_set.backend_set = - kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); - - VLOG(6) << "copy API kernel key: " << kernel_key; - VLOG(6) << "copy API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - - auto dense_x = TensorToDenseTensor(src); - - auto kernel_out = SetKernelOutput(kernel_key.backend(), dst); - phi::MetaTensor meta_out(kernel_out); - phi::UnchangedInferMeta(*dense_x, &meta_out); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - phi::Place, - bool, - phi::DenseTensor*); - - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h deleted file mode 100644 index 3ce45853319ec..0000000000000 --- a/paddle/phi/api/lib/tensor_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace experimental { - -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index de97e7516f619..94a16da2b7720 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index aa839eab587cb..9bf692703860f 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 41f1c9541823d..5cd55c1e88bed 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,32 +14,21 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace experimental { -// The Tensor must have one dim -template <> -ScalarBase::ScalarBase(const phi::DenseTensor& tensor_in) - : dtype_(tensor_in.dtype()) { // NOLINT - PADDLE_ENFORCE_EQ(tensor_in.numel(), +// NOTE(xiongkun): why we put definition here? +// test_custom_op can't include enforce.h, because enforce.h includes gflags. +// so we decouple the include dependence of enforce.h by link. +void ThrowTensorConvertError(int num) { + PADDLE_ENFORCE_EQ(num, 1, phi::errors::InvalidArgument( "The Scalar only supports Tensor with 1 element, but " "now Tensor has `%d` elements", - tensor_in.numel())); - auto cpu_place = phi::CPUPlace(); - if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) { - phi::DenseTensor tensor; - framework::TensorCopySync(tensor_in, cpu_place, &tensor); - GetDataFromTensor(tensor); - } else { - GetDataFromTensor(tensor_in); - } + num)); } } // namespace experimental diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index c28f6185a556a..5134f4eb72639 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -23,6 +23,8 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { +void ThrowTensorConvertError(int); + template class ScalarBase { public: @@ -103,7 +105,50 @@ class ScalarBase { } // The Tensor must have one dim - ScalarBase(const T& tensor_in); // NOLINT + ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT + is_from_tensor_ = true; + ThrowTensorConvertError(tensor.numel()); + switch (dtype_) { + case DataType::FLOAT32: + data_.f32 = tensor.template data()[0]; + break; + case DataType::FLOAT64: + data_.f64 = tensor.template data()[0]; + break; + case DataType::FLOAT16: + data_.f16 = tensor.template data()[0]; + break; + case DataType::BFLOAT16: + data_.bf16 = tensor.template data()[0]; + break; + case DataType::INT32: + data_.i32 = tensor.template data()[0]; + break; + case DataType::INT64: + data_.i64 = tensor.template data()[0]; + break; + case DataType::INT16: + data_.i16 = tensor.template data()[0]; + break; + case DataType::INT8: + data_.i8 = tensor.template data()[0]; + break; + case DataType::UINT8: + data_.ui8 = tensor.template data()[0]; + break; + case DataType::BOOL: + data_.b = tensor.template data()[0]; + break; + case DataType::COMPLEX64: + data_.c64 = tensor.template data()[0]; + break; + case DataType::COMPLEX128: + data_.c128 = tensor.template data()[0]; + break; + default: + PD_THROW("Invalid tensor data type `", dtype_, "`."); + } + } template ScalarBase(const ScalarBase& other) { @@ -155,49 +200,6 @@ class ScalarBase { private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); - void GetDataFromTensor(const T& tensor) { - is_from_tensor_ = true; - switch (dtype_) { - case DataType::FLOAT32: - data_.f32 = tensor.template data()[0]; - break; - case DataType::FLOAT64: - data_.f64 = tensor.template data()[0]; - break; - case DataType::FLOAT16: - data_.f16 = tensor.template data()[0]; - break; - case DataType::BFLOAT16: - data_.bf16 = tensor.template data()[0]; - break; - case DataType::INT32: - data_.i32 = tensor.template data()[0]; - break; - case DataType::INT64: - data_.i64 = tensor.template data()[0]; - break; - case DataType::INT16: - data_.i16 = tensor.template data()[0]; - break; - case DataType::INT8: - data_.i8 = tensor.template data()[0]; - break; - case DataType::UINT8: - data_.ui8 = tensor.template data()[0]; - break; - case DataType::BOOL: - data_.b = tensor.template data()[0]; - break; - case DataType::COMPLEX64: - data_.c64 = tensor.template data()[0]; - break; - case DataType::COMPLEX128: - data_.c128 = tensor.template data()[0]; - break; - default: - PD_THROW("Invalid tensor data type `", dtype_, "`."); - } - } private: bool is_from_tensor_{false}; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 23574e98fbf17..b42b4388c2ce1 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc 
deleted file mode 100644 index dcf9c4182157a..0000000000000 --- a/paddle/phi/core/selected_rows.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/selected_rows.h" - -namespace phi { - -SelectedRows::SelectedRows(const std::vector& rows, - const int64_t& height) - : impl_(std::make_shared(rows, height)) {} - -SelectedRows::SelectedRows() - : impl_(std::make_shared()) {} - -} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index a71c0471cc431..7ee475b4d5d9e 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,9 +42,10 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height); + SelectedRows(const std::vector& rows, const int64_t& height) + : impl_(std::make_shared(rows, height)) {} - SelectedRows(); + SelectedRows() : impl_(std::make_shared()) {} const DenseTensor& value() const { return impl_->value(); } diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index f27c3db2275c3..8d9f9167242c8 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -51,7 +51,7 @@ TypeInfo TypeRegistry::RegisterType(const std::string& type) { std::lock_guard guard(mutex_); assert(name_to_id_.find(type) == name_to_id_.end()); assert(names_.size() < std::numeric_limits::max()); - int8_t id = static_cast(names_.size()); + int8_t id = names_.size(); names_.emplace_back(type); name_to_id_[type] = id; return TypeInfo(id); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index dd4b7e62ec52f..94378aceff58c 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS 
${COMMON_API_TEST_DEPS}) cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index ca6d20045d171..710ea3c066472 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,9 +2,3 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) -if (WITH_GPU) - nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) -endif() -if(WITH_ROCM) - hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) -endif() diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu deleted file mode 100644 index 6b0caa175dc04..0000000000000 --- a/paddle/phi/tests/common/test_scalar.cu +++ /dev/null @@ -1,205 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include "gtest/gtest.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" - -PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); - -namespace phi { -namespace tests { - -using DDim = phi::DDim; -using float16 = phi::dtype::float16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -__global__ void FillTensor(float* data) { data[0] = 1; } - -TEST(Scalar, ConstructFromDenseTensor1) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -TEST(Scalar, ConstructFromDenseTensor2) { - // 1. 
create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(1, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor3) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(1, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor4) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = true; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(true, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor5) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x(alloc.get(), - phi::DenseTensorMeta(phi::DataType::COMPLEX64, - phi::make_ddim({1}), - phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - complex64 expected_value(1, 0); - EXPECT_TRUE(expected_value == scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor6) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x(alloc.get(), - phi::DenseTensorMeta(phi::DataType::COMPLEX128, - phi::make_ddim({1}), - phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - complex128 expected_value(1, 0); - EXPECT_TRUE(expected_value == scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor7) { - // 1. 
create tensor - const auto alloc = - std::make_unique(phi::GPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::GPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::GPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); - dev_ctx.Wait(); - phi::Scalar scalar_test(dense_x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -TEST(Scalar, ConstructFromTensor) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::GPUPlace()); - auto dense_x = std::make_shared( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); - - phi::GPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::GPUPlace()) - .get()); - dev_ctx.Init(); - auto* dense_x_data = dev_ctx.Alloc(dense_x.get()); - FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); - dev_ctx.Wait(); - paddle::experimental::Tensor x(dense_x); - paddle::experimental::Scalar scalar_test(x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -} // namespace tests -} // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 7d2fd90e6bb7b..824d188457815 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) From b0b7516978ca542a12602d478aeb299ce92afdb9 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Wed, 13 Apr 2022 14:42:28 +0800 Subject: [PATCH 125/211] Reduce trt convert unit test problem size (#41701) --- .../inference/test_trt_convert_activation.py | 34 +++++----- .../inference/test_trt_convert_batch_norm.py | 20 +++--- .../ir/inference/test_trt_convert_clip.py | 34 +++++----- .../ir/inference/test_trt_convert_conv2d.py | 64 +++++-------------- .../test_trt_convert_conv2d_fusion.py | 47 +++++--------- .../inference/test_trt_convert_elementwise.py | 14 ++-- .../ir/inference/test_trt_convert_gelu.py | 32 +++++----- .../test_trt_convert_hard_sigmoid.py | 16 ++--- .../inference/test_trt_convert_hard_swish.py | 8 +-- .../ir/inference/test_trt_convert_prelu.py | 8 +-- .../ir/inference/test_trt_convert_scale.py | 6 +- .../ir/inference/test_trt_convert_stack.py | 2 +- .../ir/inference/test_trt_convert_yolo_box.py | 14 ++-- 13 files changed, 122 insertions(+), 177 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py index bc40d3b4c27d9..c7f724bdaae3f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py @@ -28,16 +28,16 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif 
dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([batch, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 3, 32, 32]).astype(np.float32) for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for op_type in ["relu", "sigmoid", "tanh", "relu6"]: self.dims = dims dics = [{}] @@ -70,27 +70,25 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py index 410cef798aa63..899cf0e263955 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py @@ -54,7 +54,7 @@ def generate_MomentumTensor(attrs: List[Dict[str, Any]], batch): for dims in [2, 3, 4]: for num_input in [0, 1]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for epsilon in [1e-6, 1e-5, 1e-4]: for data_layout in ["NCHW"]: for momentum in [0.9, 0.8]: @@ -134,33 +134,33 @@ def generate_dynamic_shape(attrs): if self.dims == 4: if attrs[0]['data_layout'] == "NCHW": self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 3, 24, 24] + "batch_norm_input": [1, 3, 12, 12] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 3, 48, 48] + "batch_norm_input": [4, 3, 24, 24] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 3, 24, 48] + "batch_norm_input": [1, 3, 24, 24] } elif attrs[0]['data_layout'] == "NHWC": self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 24, 24, 3] + "batch_norm_input": [1, 12, 12, 3] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 48, 48, 3] + "batch_norm_input": [4, 24, 24, 3] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 24, 48, 3] + "batch_norm_input": [1, 
24, 24, 3] } elif self.dims == 3: self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 3, 24] + "batch_norm_input": [1, 3, 12] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 3, 48] + "batch_norm_input": [4, 3, 24] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 3, 48] + "batch_norm_input": [1, 3, 24] } elif self.dims == 2: self.dynamic_shape.min_input_shape = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py index 5150622cf801d..1277cde011c17 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py @@ -28,13 +28,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([batch, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 3, 32, 32]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.array([np.random.uniform(1, 10)]).astype("float32") @@ -43,7 +43,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]): return np.array([np.random.uniform(10, 20)]).astype("float32") for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for op_inputs in [{ "X": ["input_data"] }, { @@ -89,27 +89,25 @@ def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py index 
5f85debf4b27c..84ef5b4da68ab 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py @@ -46,20 +46,16 @@ def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 def generate_input1(batch, attrs: List[Dict[str, Any]]): - if attrs[0]['groups'] == 1: - return np.ones([batch, 3, 64, 64]).astype(np.float32) - elif attrs[0]['groups'] == 2: - return np.ones([batch, 6, 64, 64]).astype(np.float32) - else: - return np.ones([batch, 9, 64, 64]).astype(np.float32) + return np.ones( + [batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for strides in [[1, 1], [2, 2], [1, 2]]: for paddings in [[0, 3], [1, 2, 3, 4]]: - for groups in [1, 2, 3]: + for groups in [1, 3]: for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: for dilations in [[1, 1], [2, 2], [1, 2]]: for data_format in ['NCHW']: @@ -116,45 +112,19 @@ def generate_weight1(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if attrs[0]['groups'] == 1: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64], - "output_data": [1, 24, 64, 64] - } - elif attrs[0]['groups'] == 2: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 6, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 6, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 6, 64, 64], - "output_data": [1, 24, 64, 64] - } - else: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 9, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 9, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 9, 64, 64], - "output_data": [1, 24, 64, 64] - } + input_groups = attrs[0]['groups'] * 3 + self.dynamic_shape.min_input_shape = { + "input_data": [1, input_groups, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, input_groups, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, input_groups, 64, 64], + "output_data": [1, 24, 64, 64] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py index b1b5626c10eb9..8a9a9909571a4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py @@ -49,10 +49,8 @@ def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 def generate_input1(batch, attrs: List[Dict[str, Any]]): - if attrs[0]['groups'] == 2: - return np.ones([batch, 6, 64, 64]).astype(np.float32) - else: - return np.ones([batch, 9, 64, 64]).astype(np.float32) + return np.ones( + 
[batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) @@ -60,7 +58,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): def generate_weight2(attrs: List[Dict[str, Any]]): return np.random.random([24, 1, 1]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for strides in [[1, 1], [2, 2], [1, 2]]: for paddings in [[0, 3], [1, 2, 3, 4]]: for groups in [2, 3]: @@ -126,32 +124,19 @@ def generate_weight2(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if attrs[0]['groups'] == 2: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 6, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 6, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 6, 64, 64], - "output_data": [1, 24, 64, 64] - } - else: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 9, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 9, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 9, 64, 64], - "output_data": [1, 24, 64, 64] - } + input_groups = attrs[0]['groups'] * 3 + self.dynamic_shape.min_input_shape = { + "input_data": [1, input_groups, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, input_groups, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, input_groups, 64, 64], + "output_data": [1, 24, 64, 64] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index e849496621a10..ec02a357a48b6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -32,7 +32,7 @@ def generate_input(shape): def generate_weight(): return np.random.randn(32).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: @@ -72,7 +72,7 @@ def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. 
if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [4]} - self.dynamic_shape.max_input_shape = {"input_data": [256]} + self.dynamic_shape.max_input_shape = {"input_data": [32]} self.dynamic_shape.opt_input_shape = {"input_data": [16]} elif self.dims == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} @@ -80,19 +80,17 @@ def generate_dynamic_shape(attrs): self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} elif self.dims == 3: self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 4]} - self.dynamic_shape.max_input_shape = { - "input_data": [4, 32, 256] - } - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 32]} elif self.dims == 4: self.dynamic_shape.min_input_shape = { "input_data": [1, 32, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 32, 128, 256] + "input_data": [4, 32, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 32, 32, 16] + "input_data": [4, 32, 16, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index e79b33d88d3f1..448e4e3e71b02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -28,13 +28,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([1, 3, 32, 32]).astype(np.float32) for dims in [1, 2, 3, 4]: for approximate in [True, False]: @@ -69,27 +69,25 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - 
"input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py index 969f0e8b148a2..b3f118e9fbf52 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -29,8 +29,8 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) - for batch in [1, 2, 4]: - for shape in [[batch, 64], [batch, 32, 64], [batch, 64, 32, 128]]: + for batch in [1, 4]: + for shape in [[batch, 32], [batch, 16, 32], [batch, 32, 16, 128]]: self.input_dim = len(shape) for slope in [0.1, 0.5]: for offset in [0.2, 0.7]: @@ -63,23 +63,21 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.input_dim == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [64, 128]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} elif self.input_dim == 3: self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} - self.dynamic_shape.max_input_shape = { - "input_data": [64, 128, 256] - } - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [4, 16, 32]} elif self.input_dim == 4: self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [64, 128, 256, 512] + "input_data": [4, 32, 16, 128] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 16, 64, 128] + "input_data": [4, 32, 16, 128] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py index 283a19ec00574..c092d6da86839 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py @@ -37,7 +37,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([1, 3, 32, 32]).astype(np.float32) for threshold in [6.0, 7.0, 100.0, 0.0, -1.0]: for scale in [5.0, 6.0, 7.0, -1.0, 0.0, 100.0]: @@ -74,9 +74,9 @@ def generate_input1(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [2, 3, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 32, 32]} def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py index 10109cdc73a2b..00e3f7feb6022 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py @@ -136,7 +136,7 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64], + "input_data": [4, 32], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 3], @@ -146,7 +146,7 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1, 1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64, 128, 128], + "input_data": [4, 3, 16, 32], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 3, 16, 32], @@ -156,10 +156,10 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64, 256], + "input_data": [4, 3, 32], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 3, 128], + "input_data": [2, 3, 16], } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index 62e7a10327747..d607a43739eb7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -94,14 +94,14 @@ def generate_dynamic_shape(attrs): "scale_input": [1, 3, 24, 24] } self.dynamic_shape.max_input_shape = { - "scale_input": [9, 3, 48, 48] + "scale_input": [4, 3, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, 3, 48, 24] + "scale_input": [1, 3, 24, 24] } elif self.dims == 3: self.dynamic_shape.min_input_shape = {"scale_input": [1, 3, 24]} - self.dynamic_shape.max_input_shape = {"scale_input": [9, 6, 48]} + self.dynamic_shape.max_input_shape = {"scale_input": [4, 3, 24]} self.dynamic_shape.opt_input_shape = {"scale_input": [1, 3, 24]} elif self.dims == 2: self.dynamic_shape.min_input_shape = {"scale_input": [1, 24]} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index 93ba5da9d66d9..062312b0fab4f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -69,7 +69,7 @@ def generate_input3(attrs: List[Dict[str, Any]], batch): return np.ones([24]).astype(np.float32) for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [-2, -1, 0, 1, 2, 3]: self.dims = dims dics = [{"axis": axis}, {}] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index 17955c6e007d9..269523661ee4d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -37,7 +37,7 @@ def generate_input1(attrs: List[Dict[str, Any]], batch, channel): def generate_input2(attrs: List[Dict[str, Any]], batch): return np.random.random([batch, 2]).astype(np.int32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for class_num in [80, 30]: for anchors in [[10, 13, 16, 30, 33, 23]]: for downsample_ratio in [32, 
16]: @@ -97,24 +97,24 @@ def generate_dynamic_shape(attrs): if attrs[0]['iou_aware'] == True: channel = 3 * (attrs[0]['class_num'] + 6) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 24, 24] + "scale_input": [1, channel, 12, 12] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 48, 48] + "scale_input": [4, channel, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 48] + "scale_input": [1, channel, 24, 24] } else: channel = 3 * (attrs[0]['class_num'] + 5) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 24, 24] + "scale_input": [1, channel, 12, 12] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 48, 48] + "scale_input": [4, channel, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 48] + "scale_input": [1, channel, 24, 24] } def clear_dynamic_shape(): From 6d1e03a2164b0dba7297d978e5b89c2cd8676e0f Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 13 Apr 2022 15:06:27 +0800 Subject: [PATCH 126/211] Add yaml and unittest for SGD (#41485) * add sgd yaml * change python api * open eager mode in sgd * fix bug --- paddle/fluid/operators/optimizers/sgd_op.cc | 2 +- paddle/phi/api/lib/api_custom_impl.cc | 170 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 8 + paddle/phi/api/lib/api_gen_utils.cc | 28 ++- paddle/phi/api/lib/api_gen_utils.h | 8 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/infermeta/multiary.h | 2 +- python/paddle/fluid/optimizer.py | 7 +- .../fluid/tests/unittests/test_sgd_op.py | 6 + python/paddle/optimizer/sgd.py | 7 +- python/paddle/utils/code_gen/api.yaml | 6 + 11 files changed, 234 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index f51d776d7195c..a2af131cb505e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -126,7 +126,7 @@ This operator implements one step of the stochastic gradient descent algorithm. 
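// A minimal illustrative sketch of that update rule in plain C++, assuming the
// dense FP32 case with a dense gradient and no multi_precision master weights;
// the helper name SgdStepSketch is made up for illustration only.
#include <cstdint>

void SgdStepSketch(const float* param, const float* grad,
                   const float* lr,  // learning rate, a 1-element tensor
                   int64_t n, float* param_out) {
  for (int64_t i = 0; i < n; ++i) {
    // param_out = param - lr * grad, applied element-wise
    param_out[i] = param[i] - lr[0] * grad[i];
  }
}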
namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(sgd, SGDInferShapeFunctor, - PD_INFER_META(phi::SGDInferMeta)); + PD_INFER_META(phi::SgdInferMeta)); REGISTER_OPERATOR( sgd, ops::SGDOp, ops::SGDOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index d7f148fff818b..2b80094a39e31 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -656,6 +656,176 @@ std::tuple momentum_impl( return api_output; } +std::tuple sgd_impl( + const Tensor& param, + const Tensor& learning_rate, + const Tensor& grad, + paddle::optional master_param, + bool multi_precision) { + DataType kernel_data_type = ParseDataType(param); + auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "sgd API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + const auto& param_tensor = param.impl(); + std::string kernel_name = "sgd"; + if (phi::DenseTensor::classof(param_tensor.get())) { + if (!phi::DenseTensor::classof(grad.impl().get())) { + kernel_name = "sgd_dense_param_sparse_grad"; + } + } else { + kernel_name = "sgd_sparse_param_sparse_grad"; + } + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto in_learning_rate = + PrepareData(learning_rate, kernel.InputAt(1), {false, true, true, true}); + + std::tuple out; + std::get<0>(out) = param; + if (master_param) { + std::get<1>(out) = *master_param; + } + phi::MetaTensor meta_out_0(std::get<0>(out).impl().get()); + phi::MetaTensor meta_out_1(master_param ? std::get<1>(out).impl().get() + : nullptr); + + if (phi::DenseTensor::classof(param_tensor.get())) { + auto in_param = PrepareData(param, kernel.InputAt(0), {}); + auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {}); + + paddle::optional in_master_param_opt = + master_param + ? paddle::make_optional(*in_master_param) + : paddle::none; + auto master_param_meta = MakeMetaTensor(in_master_param_opt); + paddle::optional master_param_meta_opt = + master_param + ? paddle::make_optional(*master_param_meta) + : paddle::none; + + phi::DenseTensor* kernel_out_0 = + SetKernelOutput(kernel_key.backend(), &std::get<0>(out)); + phi::DenseTensor* kernel_out_1 = + master_param + ? 
static_cast(std::get<1>(out).impl().get()) + : nullptr; + + if (phi::DenseTensor::classof(grad.impl().get())) { + auto in_grad = PrepareData(grad, kernel.InputAt(2), {}); + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } else { + auto in_grad = TensorToSelectedRows(grad); + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::SelectedRows&, + paddle::optional, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } + } else { + auto in_param = TensorToSelectedRows(param); + auto in_grad = TensorToSelectedRows(grad); + auto in_master_param = TensorToSelectedRows(master_param); + auto in_master_param_opt = + master_param + ? paddle::make_optional(*in_master_param) + : paddle::none; + auto master_param_meta = MakeMetaTensor(in_master_param_opt); + paddle::optional master_param_meta_opt = + master_param + ? paddle::make_optional(*master_param_meta) + : paddle::none; + + phi::SelectedRows* kernel_out_0 = + SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out)); + phi::SelectedRows* kernel_out_1 = + master_param + ? static_cast(std::get<1>(out).impl().get()) + : nullptr; + + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::SelectedRows&, + const phi::DenseTensor&, + const phi::SelectedRows&, + paddle::optional, + bool, + phi::SelectedRows*, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } + return out; +} + ////////////////// Backward(grad) api impls ////////////////////// // TODO(chenweihang): the original sum grad op can support higher-level diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 5d46ed691816b..4ddc3e5f4e0d2 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/phi/api/include/tensor.h" @@ -107,6 +108,13 @@ std::tuple momentum_impl( bool multi_precision, float rescale_grad); +std::tuple sgd_impl( + const Tensor& param, + const Tensor& learning_rate, + const Tensor& grad, + paddle::optional master_param, + bool multi_precision); + ////////////////// Backward(grad) api impls ////////////////////// std::vector add_n_grad_impl(const std::vector& x, diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index f9db152956923..e0c910ba3d66c 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -20,13 +20,13 @@ namespace experimental { /* ------------------ for input ----------------------- */ std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { - return std::dynamic_pointer_cast(tensor.impl()); + return std::static_pointer_cast(tensor.impl()); } std::shared_ptr TensorToDenseTensor( - const paddle::optional& tensor) { + const paddle::optional& tensor) { if (tensor) { - return std::dynamic_pointer_cast(tensor->impl()); + return std::static_pointer_cast(tensor->impl()); } return nullptr; } @@ -45,13 +45,13 @@ std::unique_ptr> TensorToDenseTensor( } std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { - return std::dynamic_pointer_cast(tensor.impl()); + return std::static_pointer_cast(tensor.impl()); } std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor) { + const paddle::optional& tensor) { if (tensor) { - return std::dynamic_pointer_cast(tensor->impl()); + return std::static_pointer_cast(tensor->impl()); } return nullptr; } @@ -66,6 +66,14 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } +paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; @@ -90,6 +98,14 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } +paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { return phi::MetaTensor(tensor); } diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 035dfc5204720..47b80bb3fc290 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -42,7 +42,7 @@ std::unique_ptr> TensorToDenseTensor( std::shared_ptr TensorToSelectedRows(const Tensor& tensor); std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor); + const paddle::optional& tensor); std::shared_ptr TensorToStringTensor(const Tensor& tensor); @@ -50,6 +50,9 @@ std::shared_ptr TensorToStringTensor(const Tensor& tensor); phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + std::vector MakeMetaTensor( const std::vector& tensors); @@ -58,6 +61,9 @@ std::vector MakeMetaTensor( phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor); /* ------------------ for output ----------------------- */ diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 
f2acfe5a9962b..5fecd3740e930 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1887,7 +1887,7 @@ void RnnInferMeta(const MetaTensor& x, } } -void SGDInferMeta(const MetaTensor& param, +void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, paddle::optional master_param, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c037641d082b7..9137b574ac09d 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -292,7 +292,7 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); -void SGDInferMeta(const MetaTensor& param, +void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, paddle::optional master_param, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8242d8e3392ec..95db9d39c1ec4 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -44,6 +44,7 @@ from .. import compat as cpt import warnings from paddle import _C_ops +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', @@ -1370,7 +1371,11 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) lr = self._create_param_lr(param_and_grad) - if framework._non_static_mode(): + if in_dygraph_mode(): + _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, find_master) + return None + if _in_legacy_dygraph(): _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, param_and_grad[0], master_weight) return None diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 817150a21f5e5..ad03fa30009e7 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -21,6 +21,7 @@ from paddle.fluid.op import Operator from op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -291,6 +292,11 @@ def test_sgd_group_dygraph(self): adam.step() adam.clear_gradients() + def test_eager(self): + with _test_eager_guard(): + self.test_sgd_dygraph() + self.test_sgd_group_dygraph() + class TestSGDMultiPrecision2_0(unittest.TestCase): def dygraph_sgd_mp(self, mp): diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index fdee57bb1253e..46dd0b73a5eb8 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -22,6 +22,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid import unique_name from ..fluid import layers +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [] @@ -144,7 +145,11 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) lr = self._create_param_lr(param_and_grad) - if framework._non_static_mode(): + if in_dygraph_mode(): + _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, find_master) + return None + if _in_legacy_dygraph(): _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, param_and_grad[0], master_weight) return None diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 329882317ee2b..b4abe5b303b8e 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1794,6 +1794,12 @@ 
func : selu backward : selu_grad +- api : sgd + args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) + output : Tensor(param_out), Tensor(master_param_out) + invoke : sgd_impl(param, learning_rate, grad, master_param, multi_precision) + optional : master_param + - api : shape args : (Tensor input) output : Tensor From 14c3c4500a5a6458d9d5a4aa7bf57c01a05d5678 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Wed, 13 Apr 2022 15:16:52 +0800 Subject: [PATCH 127/211] init roll convert (#41689) * init roll convert * add ut for roll convert * roll convert don't support trt6.0 * fix: change ut for trt 7.0.0.1 --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/roll_op.cc | 89 +++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 24 ++++ .../ir/inference/test_trt_convert_roll.py | 124 ++++++++++++++++++ 5 files changed, 239 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/roll_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d37e4a468cac0..99d3f790e253c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1755,6 +1755,7 @@ USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(preln_skip_layernorm) +USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f1800afcb1d26..ec8c1b2fcd75c 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -25,6 +25,7 @@ nv_library(tensorrt_converter preln_emb_eltwise_layernorm.cc strided_slice_op.cc preln_skip_layernorm.cc + roll_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/roll_op.cc b/paddle/fluid/inference/tensorrt/convert/roll_op.cc new file mode 100644 index 0000000000000..407f43d58678e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roll_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * Stack converter from fluid to tensorRT. 
+ */ +class RollOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Roll op to tensorrt Slice layer"; + + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_dims = input->getDimensions(); + + std::vector axis = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("axis")); + std::vector shifts = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shifts")); + + nvinfer1::Dims start; + start.nbDims = input_dims.nbDims; + for (int i = 0; i < start.nbDims; i++) { + start.d[i] = 0; + } + int axis_size = axis.size(); + for (int i = 0; i < axis_size; i++) { + start.d[axis[i]] = (-shifts[i]) % input_dims.d[axis[i]]; + } + + nvinfer1::Dims stride; + stride.nbDims = input_dims.nbDims; + for (int i = 0; i < stride.nbDims; i++) { + stride.d[i] = 1; + } + + nvinfer1::Dims size; + size.nbDims = input_dims.nbDims; + for (int i = 0; i < size.nbDims; i++) { + size.d[i] = 1; + } + + auto output_name = op_desc.Output("Out")[0]; + + auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); + layer->setInput(2, *shape_layer->getOutput(0)); +#if IS_TRT_VERSION_GE(7000) + layer->setMode(nvinfer1::SliceMode::kWRAP); +#endif + + RreplenishLayerAndOutput(layer, "roll", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roll, RollOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d9a874dd2b629..b44450e7a8212 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -119,6 +119,7 @@ struct SimpleOpTypeSetTeller : public Teller { "slice", "strided_slice", "fused_preln_embedding_eltwise_layernorm", + "roll", "preln_skip_layernorm"}; std::unordered_set teller_set{ "mul", @@ -182,6 +183,7 @@ struct SimpleOpTypeSetTeller : public Teller { "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", + "roll", "multiclass_nms3"}; }; @@ -928,6 +930,28 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "roll") { +#if !IS_TRT_VERSION_GE(7000) + VLOG(3) << "roll converter does not support trt versions below 7.0"; + return false; +#endif + if (!with_dynamic_shape) { + return false; + } + } + + if (op_type == "strided_slice") { + if (!with_dynamic_shape) { + return false; + } + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends") || !desc.HasAttr("strides")) { + VLOG(3) + << "The necessary attributes of the strided_slice operator miss "; + return false; + } + } + if (op_type == "slice") { if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py new file mode 100644 index 0000000000000..1b3d38036614f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertRollTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 56, 56, 192]).astype(np.float32) + + for axis in [[1, 2]]: + for shifts in [[-1, -1], [-3, -3]]: + dics = [{ + "axis": axis, + "shifts": shifts, + }] + + ops_config = [{ + "op_type": "roll", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["roll_output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["roll_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 56, 56, 192] + } + self.dynamic_shape.max_input_shape = { + "input_data": [8, 56, 56, 192] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 56, 56, 192] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + inputs = program_config.inputs + + if not dynamic_shape: + return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-4 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-4 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 1cdd88f6caa881ed4b4185a9d931999484cd4cab Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 13 Apr 2022 15:23:13 +0800 Subject: [PATCH 128/211] Update static-check CI Build (#41174) 
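A short note on the roll conversion introduced in the previous patch, before the CI change below: the converter expresses roll as a TensorRT Slice layer in kWRAP mode whose start along each rolled axis is (-shift) % dim, with stride 1 and an output size equal to the input shape. The NumPy sketch below is not Paddle or TensorRT code (all names are placeholders); it only checks that this wrapped-slice formulation matches np.roll:

import numpy as np

def roll_via_wrapped_slice(x, shifts, axes):
    # Mimic the converter: a wrapped slice whose start on each rolled axis
    # is (-shift) % dim, stride 1, output size equal to the input shape.
    out = x
    for shift, axis in zip(shifts, axes):
        dim = x.shape[axis]
        start = (-shift) % dim
        idx = (np.arange(dim) + start) % dim  # read positions with wrap-around
        out = np.take(out, idx, axis=axis)
    return out

x = np.arange(24).reshape(2, 3, 4)
assert np.array_equal(roll_via_wrapped_slice(x, [-1, 2], [1, 2]),
                      np.roll(x, shift=[-1, 2], axis=[1, 2]))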
--- paddle/scripts/paddle_build.sh | 49 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e8bde467e085d..2e2efa65d7007 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -923,12 +923,11 @@ function fetch_upstream_develop_if_not_exist() { } function check_whl_size() { - if [ ! "${pr_whl_size}" ];then - echo "pr whl size not found " - exit 1 - fi set +x + pr_whl_size=`du -m ${PADDLE_ROOT}/build/pr_whl/*.whl|awk '{print $1}'` + echo "pr_whl_size: ${pr_whl_size}" + dev_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "dev_whl_size: ${dev_whl_size}" @@ -949,11 +948,20 @@ function check_whl_size() { } function generate_upstream_develop_api_spec() { - fetch_upstream_develop_if_not_exist - cur_branch=`git branch | grep \* | cut -d ' ' -f2` + cp ${PADDLE_ROOT}/python/requirements.txt /tmp + pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` + mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ + echo "pr_whl_size: ${pr_whl_size}" + + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` + if [[ ${cmake_change} ]];then + rm -rf ${PADDLE_ROOT}/build/third_party + fi + + cd ${PADDLE_ROOT} git checkout . git checkout -b develop_base_pr upstream/$BRANCH - startTime_firstBuild=`date +%s` dev_commit=`git log -1|head -1|awk '{print $2}'` dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" @@ -964,21 +972,11 @@ function generate_upstream_develop_api_spec() { cmake_gen $1 build $2 fi - - cp ${PADDLE_ROOT}/python/requirements.txt /tmp - pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` - echo "pr_whl_size: ${pr_whl_size}" - - - git checkout $cur_branch generate_api_spec "$1" "DEV" - git branch -D develop_base_pr - ENABLE_MAKE_CLEAN="ON" - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt - cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` - if [[ ${cmake_change} ]];then - rm -rf ${PADDLE_ROOT}/build/third_party - fi + + endTime_s=`date +%s` + echo "Build Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function generate_api_spec() { @@ -2997,15 +2995,13 @@ function main() { example_code=$? 
summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals - check_whl_size ;; build_and_check_cpu) set +e - generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} - check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - check_whl_size + generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} + check_sequence_op_unittest ;; build_and_check_gpu) set +e @@ -3022,6 +3018,9 @@ function main() { summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals ;; + check_whl_size) + check_whl_size + ;; build) cmake_gen ${PYTHON_ABI:-""} build ${parallel_number} From 1e56ca8a392916de1a1b465d98dda54d38ae1e04 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Wed, 13 Apr 2022 15:29:45 +0800 Subject: [PATCH 129/211] Use densetensor instead of Tensor for ProcessGroup (#41403) --- paddle/fluid/distributed/collective/Common.cc | 18 +- paddle/fluid/distributed/collective/Common.h | 8 +- .../distributed/collective/ProcessGroup.cc | 3 +- .../distributed/collective/ProcessGroup.h | 37 ++- .../collective/ProcessGroupGloo.cc | 183 +++++++------- .../distributed/collective/ProcessGroupGloo.h | 26 +- .../collective/ProcessGroupHCCL.cc | 117 +++------ .../distributed/collective/ProcessGroupHCCL.h | 23 +- .../collective/ProcessGroupHeter.cc | 225 ++++++----------- .../collective/ProcessGroupHeter.h | 11 +- .../collective/ProcessGroupNCCL.cc | 238 ++++++++---------- .../distributed/collective/ProcessGroupNCCL.h | 48 ++-- .../fluid/distributed/collective/reducer.cc | 42 +++- .../operators/collective/c_allgather_op.cu.cc | 14 ++ .../operators/collective/c_broadcast_op.cu.cc | 7 +- paddle/fluid/pybind/distributed_py.cc | 50 ++-- .../tests/unittests/init_process_group.py | 5 + 17 files changed, 501 insertions(+), 554 deletions(-) diff --git a/paddle/fluid/distributed/collective/Common.cc b/paddle/fluid/distributed/collective/Common.cc index 4a883f8196389..3461efbf3aa9a 100644 --- a/paddle/fluid/distributed/collective/Common.cc +++ b/paddle/fluid/distributed/collective/Common.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -std::vector GetPlaceList(const std::vector& tensors) { +std::vector GetPlaceList(const std::vector& tensors) { std::vector places; places.reserve(tensors.size()); for (auto& tensor : tensors) { - places.push_back(tensor.inner_place()); + places.push_back(tensor.place()); } return places; } @@ -40,15 +40,11 @@ std::string GetKeyFromPlaces(const std::vector& places) { return placeList; } -static bool CheckTensorsInPlace(const std::vector& tensors, - phi::AllocationType type) { - return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { - return t.place().GetType() == type; - }); -} - -bool CheckTensorsInCudaPlace(const std::vector& tensors) { - return CheckTensorsInPlace(tensors, phi::AllocationType::GPU); +bool CheckTensorsInCudaPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), + [&](const phi::DenseTensor& t) { + return platform::is_gpu_place(t.place()); + }); } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h index 9569f4c61acef..c01bd23fe127b 100644 --- a/paddle/fluid/distributed/collective/Common.h +++ 
b/paddle/fluid/distributed/collective/Common.h @@ -16,18 +16,18 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace distributed { -using Tensor = paddle::experimental::Tensor; - using Place = paddle::platform::Place; // Get the list of devices from list of tensors -std::vector GetPlaceList(const std::vector& tensors); +std::vector GetPlaceList(const std::vector& tensors); // Get the deviceList String from the list of devices std::string GetKeyFromPlaces(const std::vector& places); -bool CheckTensorsInCudaPlace(const std::vector& tensors); +bool CheckTensorsInCudaPlace(const std::vector& tensors); } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index 6da83a888683b..6fec3a41e1047 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -17,7 +17,8 @@ namespace paddle { namespace distributed { -ProcessGroup::Task::Task(int rank, const std::vector& inputTensors, +ProcessGroup::Task::Task(int rank, + const std::vector& inputTensors, CommType comm_type) : rank_(rank), comm_type_(comm_type) {} diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 17d021852671e..fbc9c1f476202 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -54,7 +54,7 @@ class ProcessGroup { public: class Task { public: - Task(int rank, const std::vector& inputTensors, + Task(int rank, const std::vector& inputTensors, CommType opType = CommType::UNKNOWN); virtual ~Task(); @@ -79,25 +79,21 @@ class ProcessGroup { virtual const std::string GetBackendName() const = 0; virtual std::shared_ptr AllReduce( - std::vector& /* tensors */, + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT const AllreduceOptions& = AllreduceOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support allreduce", GetBackendName())); } virtual std::shared_ptr Broadcast( - std::vector& /* tensors */, + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support broadcast", GetBackendName())); } - virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { - PADDLE_THROW(platform::errors::Fatal( - "ProcessGroup%s does not support broadcast for static mode runtime", - GetBackendName())); - } - virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,42 +101,43 @@ class ProcessGroup { } virtual std::shared_ptr Send( - std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send", GetBackendName())); } virtual std::shared_ptr Recv( - std::vector& tensors /* tensors */, int src_rank) { // NOLINT + std::vector& tensors, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support receive", GetBackendName())); } virtual std::shared_ptr AllGather( - std::vector& in_tensors /* tensors */, // NOLINT - std::vector& out_tensors /* 
tensors */) { // NOLINT + std::vector&, // NOLINT + std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllGather", GetBackendName())); } virtual std::shared_ptr AllToAll( - std::vector& in /* tensors */, // NOLINT - std::vector& out /* tensors */) { // NOLINT + std::vector&, // NOLINT + std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllToAll", GetBackendName())); } virtual std::shared_ptr Reduce( - std::vector& tensors /* tensors */, // NOLINT - const ReduceOptions& opts) { // NOLINT + std::vector&, // NOLINT + std::vector&, // NOLINT + const ReduceOptions& opts) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support Reduce", GetBackendName())); } virtual std::shared_ptr Scatter( - std::vector& in_tensors /* tensors */, // NOLINT - std::vector& out_tensors /* tensors */, // NOLINT - const ScatterOptions&) { // NOLINT + std::vector&, // NOLINT + std::vector&, // NOLINT + const ScatterOptions&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support Scatter", GetBackendName())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 91c3bf93849e0..6ddea74d95db6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -27,6 +27,7 @@ #include #include #include +#include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/platform/enforce.h" @@ -105,107 +106,104 @@ reduce_func get_function(const ReduceOp& r) { exit(-1); } -bool CheckTensorsInCPUPlace(const std::vector& tensors) { - return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { - return t.place() == PlaceType::kCPU; - }); -} - template -T* get_data(const Tensor& tensor) { - auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); - return static_cast(raw_tensor->data()); +T* get_data(phi::DenseTensor& tensor) { // NOLINT + return reinterpret_cast(tensor.data()); } template -std::vector get_multi_data(const std::vector& tensors) { - std::vector ret(tensors.size()); +std::vector get_multi_data( + std::vector& tensors) { // NOLINT + std::vector ret; + ret.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - ret[i] = get_data(tensors[i]); + ret.push_back(get_data(tensors[i])); } return ret; } template -void set_output(P& opts, const Tensor& tensor) { // NOLINT +void set_output(P& opts, phi::DenseTensor& tensor) { // NOLINT opts.setOutput(get_data(tensor), tensor.numel()); } template -void set_input(P& opts, const Tensor& tensor) { // NOLINT +void set_input(P& opts, phi::DenseTensor& tensor) { // NOLINT opts.setInput(get_data(tensor), tensor.numel()); } template -void set_outputs(P& opts, const std::vector& tensors) { // NOLINT +void set_outputs(P& opts, // NOLINT + std::vector& tensors) { // NOLINT opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); } template -void set_inputs(P& opts, const std::vector& tensors) { // NOLINT +void set_inputs(P& opts, // NOLINT + std::vector& tensors) { // NOLINT opts.setInputs(get_multi_data(tensors), tensors[0].numel()); } template -void set_inputs_for_scatter(P& opts, // NOLINT - const std::vector& tensors, // NOLINT +void set_inputs_for_scatter(P& opts, // NOLINT + phi::DenseTensor& 
tensor, // NOLINT int nranks) { - std::vector ret(nranks); - auto raw_tensor = - std::dynamic_pointer_cast(tensors[0].impl()); - T* raw_pointer = reinterpret_cast(raw_tensor->data()); + std::vector ret; + ret.reserve(nranks); + T* raw_pointer = reinterpret_cast(tensor.data()); size_t offset = 0; for (int i = 0; i < nranks; i++) { - ret[i] = raw_pointer + offset; - offset += tensors[0].numel() / nranks; + ret.push_back(raw_pointer + offset); + offset += tensor.numel() / nranks; } - opts.setInputs(ret, tensors[0].numel() / nranks); + opts.setInputs(ret, tensor.numel() / nranks); } -ProcessGroupGloo::GlooTask::GlooTask(int rank, - const std::vector& inputs, - CommType comm_type) - : ProcessGroup::Task(rank, inputs, comm_type) { - PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, - platform::errors::Fatal( - "Only CPU place is supported for ProcessGroupGloo.")); -} +ProcessGroupGloo::GlooTask::GlooTask( + int rank, const std::vector& inputs, CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) {} ProcessGroupGloo::ProcessGroupGloo( - const std::shared_ptr& store, int rank, - int world_size, int gid, const std::shared_ptr options) + const std::shared_ptr& store, int rank, int world_size, + int gid, const std::shared_ptr options) : ProcessGroup(rank, world_size, gid), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = - ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + ::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store); _context->connectFullMesh(prefix_store, options->device); } class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { public: BroadcastGlooTask(const std::shared_ptr& context, - const std::vector& inputs, int rank, int root, - uint32_t tag) + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int rank, int root, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), _context(context), _root(root), _inputs(inputs), + _outputs(outputs), _tag(tag) {} - void Run() override { _do_broadcast(_inputs[0]); } + void Run() override { _do_broadcast(_inputs[0], _outputs[0]); } private: std::shared_ptr _context; const int _root; - std::vector _inputs{}; + std::vector _inputs{}; + std::vector _outputs{}; const uint32_t _tag; - void _do_broadcast(const Tensor& tensor) { + void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) { // NOLINT gloo::BroadcastOptions opts(_context); - const auto& dtype = tensor.type(); - GENERATE_FUNC(dtype, set_output, opts, tensor); + const auto& dtype = in.dtype(); + if (rank_ == _root) { + GENERATE_FUNC(dtype, set_input, opts, in); + } + GENERATE_FUNC(dtype, set_output, opts, out); opts.setRoot(_root); opts.setTag(_tag); gloo::broadcast(opts); @@ -213,12 +211,14 @@ class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Broadcast( - std::vector& inputs, const BroadcastOptions& opts) { + std::vector& inputs, + std::vector& outputs, const BroadcastOptions& opts) { auto root = opts.source_rank; std::unique_ptr task; auto tag = next_tag(); auto context = get_context(); - task = std::make_unique(context, inputs, rank_, root, tag); + task = std::make_unique(context, inputs, outputs, rank_, + root, tag); task->Run(); return task; } @@ -226,19 +226,22 @@ std::shared_ptr ProcessGroupGloo::Broadcast( class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { public: AllreduceGlooTask(int rank, const std::shared_ptr& context, - std::vector& inputs, ReduceOp reduce_op, // NOLINT - 
uint32_t tag) + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + ReduceOp reduce_op, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), _context(context), _inputs(inputs), + _outputs(outputs), _reduce_op(reduce_op), _tag(tag) {} - void Run() override { _do_allreduce(_inputs); } + void Run() override { _do_allreduce(_inputs, _outputs); } private: std::shared_ptr _context; - std::vector _inputs; + std::vector _inputs; + std::vector _outputs; const ReduceOp _reduce_op; uint32_t _tag; @@ -255,11 +258,12 @@ class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { fn = get_function(op); } - void _do_allreduce(std::vector& tensors) { // NOLINT - const auto& dtype = tensors[0].type(); + void _do_allreduce(std::vector& ins, // NOLINT + std::vector& outs) { // NOLINT + const auto& dtype = ins[0].dtype(); gloo::AllreduceOptions opts(_context); - GENERATE_FUNC(dtype, set_inputs, opts, tensors); - GENERATE_FUNC(dtype, set_outputs, opts, tensors); + GENERATE_FUNC(dtype, set_inputs, opts, ins); + GENERATE_FUNC(dtype, set_outputs, opts, outs); opts.setReduceFunction(_get_function(dtype, _reduce_op)); opts.setTag(_tag); gloo::allreduce(opts); @@ -267,11 +271,12 @@ class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::AllReduce( - std::vector& inputs, const AllreduceOptions& opts) { + std::vector& inputs, + std::vector& outputs, const AllreduceOptions& opts) { auto tag = next_tag(); std::shared_ptr task; auto context = get_context(); - task = std::make_shared(rank_, context, inputs, + task = std::make_shared(rank_, context, inputs, outputs, opts.reduce_op, tag); task->Run(); return task; @@ -280,7 +285,7 @@ std::shared_ptr ProcessGroupGloo::AllReduce( class BarrierGlooTask : public ProcessGroupGloo::GlooTask { public: BarrierGlooTask(int rank, const std::shared_ptr& context) - : ProcessGroupGloo::GlooTask(rank, std::vector{}, + : ProcessGroupGloo::GlooTask(rank, std::vector{}, CommType::BARRIER), _context(context) {} @@ -307,8 +312,8 @@ std::shared_ptr ProcessGroupGloo::Barrier( class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { public: AllgatherGlooTask(int rank, const std::shared_ptr& context, - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), _context(context), @@ -320,13 +325,13 @@ class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { private: std::shared_ptr _context; - std::vector _inputs; - std::vector _outputs; + std::vector _inputs; + std::vector _outputs; uint32_t _tag; - void _do_allgather(std::vector& in, // NOLINT - std::vector& out) { // NOLINT - const auto& dtype = in[0].type(); + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].dtype(); gloo::AllgatherOptions opts(_context); GENERATE_FUNC(dtype, set_input, opts, in[0]); GENERATE_FUNC(dtype, set_output, opts, out[0]); @@ -336,7 +341,8 @@ class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::AllGather( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); @@ -349,20 +355,23 @@ std::shared_ptr ProcessGroupGloo::AllGather( class ReduceGlooTask : public ProcessGroupGloo::GlooTask { public: ReduceGlooTask(int rank, const std::shared_ptr& 
context, - std::vector& in, ReduceOp reduce_op, // NOLINT - int dst, uint32_t tag) - : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + ReduceOp reduce_op, int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE), _context(context), - _inputs(in), + _inputs(inputs), + _outputs(outputs), _reduce_op(reduce_op), _dst(dst), _tag(tag) {} - void Run() override { _do_reduce(_inputs, _dst); } + void Run() override { _do_reduce(_inputs, _outputs, _dst); } private: std::shared_ptr _context; - std::vector _inputs; + std::vector _inputs; + std::vector _outputs; const ReduceOp _reduce_op; int _dst; uint32_t _tag; @@ -380,11 +389,13 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask { fn = get_function(op); } - void _do_reduce(std::vector& tensors, int dst) { // NOLINT - const auto& dtype = tensors[0].type(); + void _do_reduce(std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int dst) { + const auto& dtype = inputs[0].dtype(); gloo::ReduceOptions opts(_context); - GENERATE_FUNC(dtype, set_input, opts, tensors[0]); - GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + GENERATE_FUNC(dtype, set_input, opts, inputs[0]); + GENERATE_FUNC(dtype, set_output, opts, outputs[0]); opts.setReduceFunction(_get_function(dtype, _reduce_op)); opts.setTag(_tag); opts.setRoot(dst); @@ -393,11 +404,12 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Reduce( - std::vector& tensors, const ReduceOptions& opts) { + std::vector& inputs, + std::vector& outputs, const ReduceOptions& opts) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); - task = std::make_shared(rank_, context, tensors, + task = std::make_shared(rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag); task->Run(); return task; @@ -406,8 +418,8 @@ std::shared_ptr ProcessGroupGloo::Reduce( class ScatterGlooTask : public ProcessGroupGloo::GlooTask { public: ScatterGlooTask(int rank, const std::shared_ptr& context, - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT int src, int size, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), _context(context), @@ -421,18 +433,19 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask { private: std::shared_ptr _context; - std::vector _inputs; - std::vector _outputs; + std::vector _inputs; + std::vector _outputs; int _src; int _size; uint32_t _tag; - void _do_scatter(std::vector& in, std::vector& out, // NOLINT + void _do_scatter(std::vector& in, // NOLINT + std::vector& out, // NOLINT int src) { - const auto& dtype = in[0].type(); + const auto& dtype = in[0].dtype(); gloo::ScatterOptions opts(_context); if (rank_ == src) { - GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size); } GENERATE_FUNC(dtype, set_output, opts, out[0]); opts.setRoot(src); @@ -442,8 +455,8 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Scatter( - std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ScatterOptions& opts) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h 
b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index f0bf872cfc9e4..335ca1bd17f2c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -36,7 +36,8 @@ class ProcessGroupGloo : public ProcessGroup { class GlooTask : public ProcessGroup::Task, public std::enable_shared_from_this { public: - explicit GlooTask(int rank, const std::vector& input_tensors, + explicit GlooTask(int rank, + const std::vector& input_tensors, CommType comm_type); ~GlooTask() = default; @@ -106,26 +107,31 @@ class ProcessGroupGloo : public ProcessGroup { ~ProcessGroupGloo() = default; std::shared_ptr Broadcast( - std::vector& inputs, + std::vector& inputs, + std::vector& outputs, const BroadcastOptions& = BroadcastOptions()) override; std::shared_ptr AllReduce( - std::vector& inputs, + std::vector& inputs, + std::vector& outputs, const AllreduceOptions& opts = AllreduceOptions()) override; std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; + std::vector& in_tensors, + std::vector& out_tensors) override; std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts) override; + + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; std::shared_ptr<::gloo::Context> get_context() { return _context; } uint64_t next_tag() { return _tag++; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 55945b5e0e396..55ecdaaf6bfb7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -44,14 +44,14 @@ void SyncDefaultStream( std::shared_ptr ProcessGroupHCCL::CreateTask( std::vector places, int rank, CommType comm_type, - const std::vector& inputs) { + const std::vector& inputs) { return std::make_shared(places, rank, comm_type, inputs); } -ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, - CommType CommType, - const std::vector& inputs) +ProcessGroupHCCL::HCCLTask::HCCLTask( + const std::vector& places, int rank, CommType CommType, + const std::vector& inputs) : Task(rank, inputs, CommType), places_(places) { control_events_.resize(places.size()); hcclComms_.resize(places.size()); @@ -60,8 +60,8 @@ ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, ProcessGroupHCCL::HCCLTask::~HCCLTask() {} void ProcessGroupHCCL::HCCLTask::SetOutputs( - std::vector& outputs) { // NOLINT - outputs_ = std::make_shared>(outputs); + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); } void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { @@ -166,8 +166,8 @@ void ProcessGroupHCCL::CreateHCCLManagerCache( template std::shared_ptr ProcessGroupHCCL::Collective( - std::vector& inputs, std::vector& outputs, Fn fn, - CommType op_type) { + std::vector& inputs, + std::vector& outputs, Fn fn, CommType op_type) { const auto places = GetPlaceList(inputs); const auto key = GetKeyFromPlaces(places); @@ -208,91 +208,44 @@ std::shared_ptr ProcessGroupHCCL::Collective( return task; } -template -std::shared_ptr ProcessGroupHCCL::PointToPoint( - std::vector& 
tensors, Fn fn, int dst_rank, CommType op_type) { - const auto places = GetPlaceList(tensors); - const auto key = GetKeyFromPlaces(places); - - { - std::lock_guard lock(mutex_); - if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { - CreateHCCLManagerCache(key, places); - } - } - - auto& hccl_comms = places_to_hcclcomm_[key]; - - SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); - - auto task = CreateTask(places, rank_, op_type, tensors); - - // construct uninitialize guard for device - - // if (FLAGS_use_stream_safe_npu_allocator) { - // for (size_t i = 0; i < tensors.size(); ++i) { - // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - // auto dense_tensor = - // std::dynamic_pointer_cast(tensors[i].impl()); - // memory::RecordStream(dense_tensor->Holder(), - // places_to_ctx_[key][i]->stream()); - // } - // } - - for (size_t i = 0; i < tensors.size(); ++i) { - platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - const auto& hccl_stream = places_to_ctx_[key][i]->stream(); - fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); - } - - for (size_t i = 0; i < tensors.size(); ++i) { - platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - task->control_events_[i].Record(*places_to_ctx_[key][i]); - } - return task; -} - std::shared_ptr ProcessGroupHCCL::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { - // PADDLE_ENFORCE_EQ( - // CheckTensorsInNPUPlace(tensors), true, - // platform::errors::InvalidArgument("All inputs should be in - // NPUPlace.")); - return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, HcclComm comm, - const aclrtStream& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::HcclAllReduce( - input_tensor->data(), output_tensor->data(), input_tensor->numel(), - platform::ToHCCLDataType(input.type()), - ToHCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const AllreduceOptions& opts) { + return Collective(in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, + HcclComm comm, const aclrtStream& stream) { + return platform::dynload::HcclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupHCCL::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts) { // PADDLE_ENFORCE_EQ( // CheckTensorsInNPUPlace(tensors), true, // platform::errors::InvalidArgument("All inputs should be in // CudaPlace.")); return Collective( - tensors, tensors, - [&](Tensor& input, Tensor& output, HcclComm comm, + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm, const aclrtStream& stream) { - const auto root = opts.source_rank * tensors.size() + opts.source_root; - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::HcclBroadcast( - input_tensor->data(), input_tensor->numel(), - platform::ToHCCLDataType(input.type()), root, comm, stream); + int root = opts.source_rank * in_tensors.size() + opts.source_root; + if (rank_ == root) { + 
return platform::dynload::HcclBroadcast( + input.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), root, comm, stream); + } else { + return platform::dynload::HcclBroadcast( + output.data(), output.numel(), + platform::ToHCCLDataType(output.dtype()), root, comm, stream); + } }, CommType::BROADCAST); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index 932ae75fc6b9d..f3d3fa2f8a72a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -46,7 +46,7 @@ class ProcessGroupHCCL : public ProcessGroup { public std::enable_shared_from_this { public: HCCLTask(const std::vector& places, int rank, CommType CommType, - const std::vector& inputs); + const std::vector& inputs); bool IsCompleted(); @@ -56,7 +56,7 @@ class ProcessGroupHCCL : public ProcessGroup { void Synchronize(); - void SetOutputs(std::vector& outputs); // NOLINT + void SetOutputs(std::vector& outputs); // NOLINT virtual ~HCCLTask(); @@ -65,7 +65,7 @@ class ProcessGroupHCCL : public ProcessGroup { protected: std::vector places_; std::vector> hcclComms_; - std::shared_ptr> outputs_; + std::shared_ptr> outputs_; private: }; @@ -78,17 +78,19 @@ class ProcessGroupHCCL : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, - const std::vector& inputs); + const std::vector& inputs); std::shared_ptr store_; std::shared_ptr hccl_comm_; @@ -113,15 +115,10 @@ class ProcessGroupHCCL : public ProcessGroup { template std::shared_ptr Collective( - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT Fn fn, CommType op_type); - template - std::shared_ptr PointToPoint( - std::vector& tensors, // NOLINT - Fn fn, int dst_rank, CommType op_type); - void CreateHCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index b3c9ddde50116..a48bda06323be 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -26,13 +26,13 @@ namespace distributed { using Place = paddle::platform::Place; std::shared_ptr ProcessGroupHeter::CreateTask( - int rank, CommType comm_type, const std::vector& inputs) { + int rank, CommType comm_type, const std::vector& inputs) { return std::make_shared(rank, comm_type, inputs); } -ProcessGroupHeter::HeterTask::HeterTask(int rank, CommType CommType, - const std::vector& inputs) +ProcessGroupHeter::HeterTask::HeterTask( + int rank, CommType CommType, const std::vector& inputs) : Task(rank, inputs, CommType) {} ProcessGroupHeter::HeterTask::~HeterTask() {} @@ -86,248 +86,177 @@ static void _do_add(T* dst, T* src, size_t size) { } std::shared_ptr ProcessGroupHeter::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& opts) { #if defined(PADDLE_WITH_NCCL) PADDLE_ENFORCE_EQ( - 
CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); #endif // Step1: do allreduce in inner cluster - auto task = inner_pg_->AllReduce(tensors, opts); + auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts); task->Wait(); // Step2: copy tensors to CPU if (local_rank_ == 0) { - std::vector cpu_tensors; - cpu_tensors.reserve(tensors.size()); - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); - cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), - dense_cpu_tensor.get()); + std::vector cpu_tensors; + cpu_tensors.reserve(in_tensors.size()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = in_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + cpu_tensor.Resize(gpu_tensor.dims()); + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); } // Step3: do inter cluster allreduce if (with_switch_) { if (local_rank_ == 0) { HeterClient* client_ = HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[0].impl()); + auto dense_cpu_tensor = cpu_tensors[0]; std::vector send_size; - send_size.push_back(dense_cpu_tensor->numel()); + send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( - gid_, {dense_cpu_tensor->name()}, send_size, - dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Send to the switch module error.")); phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_cpu_tensor->dtype(), dense_cpu_tensor->dims()); + dense_cpu_tensor.dtype(), dense_cpu_tensor.dims()); std::shared_ptr dense_cpu_tensor2 = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor->dims()); - Tensor cpu_tensor_temp = - paddle::experimental::Tensor(dense_cpu_tensor2); + dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor.dims()); ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor2->data(), + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor2->data(), dense_cpu_tensor2->numel() * framework::DataTypeSize(dense_cpu_tensor2->dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Recv from the switch module error.")); - switch (dense_cpu_tensor->dtype()) { + switch (dense_cpu_tensor.dtype()) { case DataType::FLOAT32: - _do_add(reinterpret_cast(dense_cpu_tensor->data()), + _do_add(reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; case DataType::FLOAT64: _do_add( - 
reinterpret_cast(dense_cpu_tensor->data()), + reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; case DataType::INT32: - _do_add(reinterpret_cast(dense_cpu_tensor->data()), + _do_add(reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; default: PADDLE_THROW(platform::errors::PreconditionNotMet( "Unsupported data type (%s) to do add.", - framework::DataType2String(dense_cpu_tensor->dtype()))); + framework::DataType2String(dense_cpu_tensor.dtype()))); } } } else { - auto gloo_task = inter_pg_->AllReduce(cpu_tensors, opts); + auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts); gloo_task->Wait(); } // Step4: copy cpu tensors to gpu // copy cpu tensors to gpu - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), - dense_gpu_tensor.get()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = out_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + framework::TensorCopySync(cpu_tensor, cpu_tensor.place(), &gpu_tensor); } } // Step5: broadcast among inner cluster auto b_opts = BroadcastOptions(); - b_opts.source_root = 0; - auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + b_opts.source_rank = 0; + auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts); broadcast_task->Wait(); - return CreateTask(rank_, CommType::ALLREDUCE, tensors); + return CreateTask(rank_, CommType::ALLREDUCE, in_tensors); } std::shared_ptr ProcessGroupHeter::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& opts) { #if defined(PADDLE_WITH_NCCL) PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); #endif // Step1: do broadcast in inner cluster auto b_opts = BroadcastOptions(); - b_opts.source_root = 0; - inner_pg_->Broadcast(tensors, b_opts); + b_opts.source_rank = 0; + inner_pg_->Broadcast(in_tensors, out_tensors, b_opts); if (local_rank_ == 0) { - std::vector cpu_tensors; - cpu_tensors.reserve(tensors.size()); - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); - cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), - dense_cpu_tensor.get()); + std::vector cpu_tensors; + cpu_tensors.reserve(in_tensors.size()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = in_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + cpu_tensor.Resize(gpu_tensor.dims()); + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), 
&cpu_tensor); } if (with_switch_) { if (local_rank_ == 0) { HeterClient* client_ = HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[0].impl()); + auto dense_cpu_tensor = cpu_tensors[0]; if (gloo_rank_ == 0) { std::vector send_size; - send_size.push_back(dense_cpu_tensor->numel()); + send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( - gid_, {dense_cpu_tensor->name()}, send_size, - dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, send_size, + dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Send to the switch module error.")); } else { int ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Receive from the switch module error.")); ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Receive from the switch module error.")); } } } else { - auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts); + auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts); gloo_task->Wait(); } - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), - dense_gpu_tensor.get()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = out_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor); } } - auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts); broadcast_task->Wait(); - return CreateTask(rank_, CommType::BROADCAST, tensors); -} - -void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in, - phi::DenseTensor* out) { - // Step1: do broadcast in inner cluster - inner_pg_->Broadcast(in, out); - - if (local_rank_ == 0) { - phi::DenseTensorMeta meta = phi::DenseTensorMeta(in->dtype(), in->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(in->dims()); - Tensor cpu_tensor = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*in, platform::CPUPlace(), - dense_cpu_tensor.get()); - if (with_switch_) { - if (local_rank_ == 0) { - HeterClient* client_ = - HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - if (gloo_rank_ == 0) { - std::vector send_size; - send_size.push_back(in->numel()); - int ret = client_->Send( - gid_, {in->name()}, send_size, dense_cpu_tensor->data(), - in->numel() * 
framework::DataTypeSize(in->dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); - } else { - int ret = - client_->Recv(gid_, {in->name()}, dense_cpu_tensor->data(), - in->numel() * framework::DataTypeSize(in->dtype())); - PADDLE_ENFORCE_EQ(ret, 0, - platform::errors::PreconditionNotMet( - "Receive from the switch module error.")); - } - } - } else { - std::vector cpu_tensors = {cpu_tensor}; - auto gloo_task = inter_pg_->Broadcast(cpu_tensors); - gloo_task->Wait(); - } - framework::TensorCopySync(*dense_cpu_tensor, out->place(), out); - } - inner_pg_->Broadcast(out, out); + return CreateTask(rank_, CommType::BROADCAST, in_tensors); } -} // namespace distributed -} // namespace paddle +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 892dbb9369e8d..05bacd93d7815 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -66,7 +66,8 @@ class ProcessGroupHeter : public ProcessGroup { class HeterTask : public ProcessGroup::Task, public std::enable_shared_from_this { public: - HeterTask(int rank, CommType CommType, const std::vector& inputs); + HeterTask(int rank, CommType CommType, + const std::vector&); bool IsCompleted(); @@ -89,18 +90,16 @@ class ProcessGroupHeter : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector&, std::vector&, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector&, std::vector&, const BroadcastOptions& = BroadcastOptions()) override; - void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) override; - protected: virtual std::shared_ptr CreateTask( - int rank, CommType opType, const std::vector& inputs); + int rank, CommType opType, const std::vector& inputs); private: std::shared_ptr store_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index b1d892e2521a3..30813b904df53 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -41,14 +41,14 @@ void SyncDefaultStream( std::shared_ptr ProcessGroupNCCL::CreateTask( std::vector places, int rank, CommType comm_type, - const std::vector& inputs) { + const std::vector& inputs) { return std::make_shared(places, rank, comm_type, inputs); } -ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, - CommType CommType, - const std::vector& inputs) +ProcessGroupNCCL::NCCLTask::NCCLTask( + const std::vector& places, int rank, CommType CommType, + const std::vector& inputs) : Task(rank, inputs, CommType), places_(places) { control_events_.resize(places.size()); ncclComms_.resize(places.size()); @@ -57,8 +57,8 @@ ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, ProcessGroupNCCL::NCCLTask::~NCCLTask() {} void ProcessGroupNCCL::NCCLTask::SetOutputs( - std::vector& outputs) { // NOLINT - outputs_ = std::make_shared>(outputs); + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); } void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { @@ -180,8 +180,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( template std::shared_ptr ProcessGroupNCCL::Collective( - std::vector& inputs, std::vector& outputs, Fn fn, - CommType op_type) { + 
std::vector& inputs, + std::vector& outputs, Fn fn, CommType op_type) { const auto places = GetPlaceList(inputs); const auto key = GetKeyFromPlaces(places); @@ -205,9 +205,7 @@ std::shared_ptr ProcessGroupNCCL::Collective( if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < inputs.size(); ++i) { cuda_guard.SetDevice(places[i]); - auto dense_tensor = - std::dynamic_pointer_cast(inputs[i].impl()); - memory::RecordStream(dense_tensor->Holder(), + memory::RecordStream(inputs[i].Holder(), places_to_ctx_[key][i]->stream()); } } @@ -267,7 +265,8 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, template std::shared_ptr ProcessGroupNCCL::PointToPoint( - std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + std::vector& tensors, Fn fn, int dst_rank, + CommType op_type) { const auto places = GetPlaceList(tensors); const auto key = GetKeyFromPlaces(places); @@ -290,9 +289,7 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - auto dense_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - memory::RecordStream(dense_tensor->Holder(), + memory::RecordStream(tensors[i].Holder(), places_to_ctx_[key][i]->stream()); } } @@ -314,46 +311,40 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( } std::shared_ptr ProcessGroupNCCL::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclAllReduce( - input_tensor->data(), output_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), - ToNCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective(in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupNCCL::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - tensors, tensors, - [&](Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - const auto root = opts.source_rank * tensors.size() + opts.source_root; - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclBcast( - input_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), root, comm, stream); - }, - CommType::BROADCAST); + return Collective(in_tensors, out_tensors, + [&](phi::DenseTensor& input, 
phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + const auto root = opts.source_rank * in_tensors.size() + + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), root, comm, + stream); + }, + CommType::BROADCAST); } std::shared_ptr ProcessGroupNCCL::Barrier( @@ -374,23 +365,24 @@ std::shared_ptr ProcessGroupNCCL::Barrier( places.emplace_back(place_id); } - std::vector barrierTensors; + std::vector barrierTensors; barrierTensors.reserve(places.size()); platform::CUDADeviceGuard gpuGuard; for (auto& place : places) { gpuGuard.SetDeviceIndex(place.GetDeviceId()); auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace()); - barrierTensors.push_back(dt); + barrierTensors.push_back( + *std::dynamic_pointer_cast(dt.impl())); } - auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors); auto nccl_task = dynamic_cast(task.get()); nccl_task->barrierTensors_ = std::move(barrierTensors); return task; } -void CheckTensorsInDifferentDevices(const std::vector& tensors, - const size_t num_devices) { +void CheckTensorsInDifferentDevices( + const std::vector& tensors, const size_t num_devices) { PADDLE_ENFORCE_EQ( tensors.size() == 0, false, platform::errors::InvalidArgument("Tensor list must be nonempty.")); @@ -402,11 +394,11 @@ void CheckTensorsInDifferentDevices(const std::vector& tensors, std::set used_devices; for (const auto& t : tensors) { - PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()), true, platform::errors::InvalidArgument( "Tensors must be CUDA and dense tensor.")); - const auto inserted = used_devices.insert(t.inner_place()).second; + const auto inserted = used_devices.insert(t.place()).second; PADDLE_ENFORCE_EQ(inserted, true, platform::errors::InvalidArgument( "Tensors must be on distinct GPU devices.")); @@ -414,62 +406,55 @@ void CheckTensorsInDifferentDevices(const std::vector& tensors, } std::shared_ptr ProcessGroupNCCL::Send( - std::vector& tensors, int dst_rank) { + std::vector& tensors, int dst_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint( - tensors, - [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, - int dst_rank) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - return platform::dynload::ncclSend( - input_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint(tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, + const gpuStream_t& stream, int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } std::shared_ptr ProcessGroupNCCL::Recv( - std::vector& tensors, int src_rank) { + std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint( - tensors, - [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, - int src_rank) { - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclRecv( - output_tensor->data(), output_tensor->numel(), - platform::ToNCCLDataType(output.type()), src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = 
PointToPoint(tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream, int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } std::shared_ptr ProcessGroupNCCL::AllGather( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(out_tensors), true, platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective( - in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclAllGather( - input_tensor->data(), output_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), comm, stream); - }, - CommType::ALLGATHER); + return Collective(in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), comm, + stream); + }, + CommType::ALLGATHER); } void* GetPointerByOffset(void* raw_pointer, size_t offset, @@ -493,10 +478,12 @@ void* GetPointerByOffset(void* raw_pointer, size_t offset, PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } + return nullptr; } std::shared_ptr ProcessGroupNCCL::AllToAll( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); @@ -505,24 +492,20 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); size_t offset = 0; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, + comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - GetPointerByOffset(output_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); - offset += input_tensor->numel() / size_; + GetPointerByOffset(output.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, + comm, stream)); + offset += input.numel() / size_; } 
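+          // Each rank has queued one ncclSend and one ncclRecv per peer;
+          // batching them between ncclGroupStart()/ncclGroupEnd() (below)
+          // launches the whole exchange at once and avoids pairwise deadlock.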
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); }, @@ -530,29 +513,26 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( } std::shared_ptr ProcessGroupNCCL::Reduce( - std::vector& tensors, const ReduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ReduceOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - input_tensor->data(), output_tensor->data(), input.numel(), - platform::ToNCCLDataType(input.type()), + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); }, CommType::REDUCE); } std::shared_ptr ProcessGroupNCCL::Scatter( - std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ScatterOptions& opts) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); @@ -561,31 +541,27 @@ std::shared_ptr ProcessGroupNCCL::Scatter( platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); size_t offset = 0; if (rank_ == opts.root_rank) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); - offset += input_tensor->numel() / size_; + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), + i, comm, stream)); + offset += input.numel() / size_; } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output_tensor->data(), input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + output.data(), input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output_tensor->data(), input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + output.data(), input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm, stream)); } }, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index fa73ed195b0c1..cca84285ef4de 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ 
b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -51,7 +51,7 @@ class ProcessGroupNCCL : public ProcessGroup { public std::enable_shared_from_this { public: NCCLTask(const std::vector& places, int rank, CommType CommType, - const std::vector& inputs); + const std::vector& inputs); bool IsCompleted(); @@ -61,17 +61,17 @@ class ProcessGroupNCCL : public ProcessGroup { void Synchronize(); - void SetOutputs(std::vector& outputs); // NOLINT + void SetOutputs(std::vector& outputs); // NOLINT virtual ~NCCLTask(); std::vector control_events_; - std::vector barrierTensors_; + std::vector barrierTensors_; protected: std::vector places_; std::vector> ncclComms_; - std::shared_ptr> outputs_; + std::shared_ptr> outputs_; private: }; @@ -84,40 +84,46 @@ class ProcessGroupNCCL : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; + std::shared_ptr Send( + std::vector& tensors, int dst_rank) override; - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; + std::shared_ptr Recv( + std::vector& tensors, int src_rank) override; std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; + std::vector& in_tensors, + std::vector& out_tensors) override; std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; + std::vector& in, + std::vector& out) override; std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; + std::vector& tensors, + std::vector& out_tensors, + const ReduceOptions& opts) override; - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, - const std::vector& inputs); + const std::vector& inputs); protected: std::shared_ptr store_; @@ -142,8 +148,8 @@ class ProcessGroupNCCL : public ProcessGroup { template std::shared_ptr Collective( - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT Fn fn, CommType op_type); template @@ -152,7 +158,7 @@ class ProcessGroupNCCL : public ProcessGroup { template std::shared_ptr PointToPoint( - std::vector& tensors, // NOLINT + std::vector& tensors, // NOLINT Fn fn, int dst_rank, CommType op_type); void CreateNCCLManagerCache(const std::string& places_key, diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 02f7f25636410..63e92444b32cb 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -734,7 +734,11 @@ void EagerReducer::ProcessUnusedDenseVars() { distributed::AllreduceOptions opts; opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {global_used_vars_}; - process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + 
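+  // in_out now holds the unwrapped phi::DenseTensors; the reworked
+  // ProcessGroup::AllReduce takes DenseTensor vectors for both inputs and
+  // outputs, and passing the same vector twice reduces in place.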
process_group_->AllReduce(in_out, in_out, opts)->Synchronize(); framework::TensorToVector(*global_used_tensor, *dev_ctx, &local_used_vars_); @@ -820,7 +824,11 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, // all_reduce std::vector reduce_tensors = {group->dense_contents_}; - group->task = process_group_->AllReduce(reduce_tensors, opts); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + group->task = process_group_->AllReduce(in_out, in_out, opts); // split in FinalizeBackward() } @@ -871,7 +879,11 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, distributed::AllreduceOptions opts; opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {rows_num_tensor}; - process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllReduce(in_out, in_out, opts)->Synchronize(); framework::TensorToVector(*rows_num_dense_tensor, *dev_ctx, &rows_num_vector); @@ -908,8 +920,15 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector src_rows_tensors = {src_rows_tensor}; std::vector dst_rows_tensors = {dst_rows_tensor}; - process_group_->AllGather(src_rows_tensors, dst_rows_tensors) - ->Synchronize(); + std::vector in; + std::vector out; + for (auto &t : src_rows_tensors) { + in.push_back(*std::dynamic_pointer_cast(t.impl())); + } + for (auto &t : dst_rows_tensors) { + out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllGather(in, out)->Synchronize(); framework::Vector dst_rows_vector(rows_num, 0); auto *dst_rows_dense_tensor = @@ -934,8 +953,17 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector src_value_tensors = {src_value_tensor}; std::vector dst_value_tensors = {dst_value_tensor}; - process_group_->AllGather(src_value_tensors, dst_value_tensors) - ->Synchronize(); + std::vector src_dense; + std::vector dst_dense; + for (auto &t : src_value_tensors) { + src_dense.push_back( + *std::dynamic_pointer_cast(t.impl())); + } + for (auto &t : dst_value_tensors) { + dst_dense.push_back( + *std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllGather(src_dense, dst_dense)->Synchronize(); src->set_rows(dst_rows_vector); *(src->mutable_value()) = diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 89854999c16fc..0d97ffa96dc5c 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -35,6 +37,18 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { int nranks = ctx.Attr("nranks"); int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*in); + out_tensor.push_back(*out); + auto task = pg->AllGather(in_tensor, out_tensor); + task->Wait(); + return; + } auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 7bdf5f0c46ca6..4bed282ace8d1 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -41,7 +41,12 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { if (map->has(rid)) { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); - pg->Broadcast(x, out); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*x); + out_tensor.push_back(*out); + auto task = pg->Broadcast(in_tensor, out_tensor); + task->Wait(); return; } diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 38ed1d4f2bb5d..716cd35f0a614 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -115,8 +115,10 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); distributed::AllreduceOptions opts; opts.reduce_op = op; - std::vector tensors = {tensor}; - return self.AllReduce(tensors, opts); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.AllReduce(tensors, tensors, opts); }, py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) @@ -127,8 +129,10 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); distributed::BroadcastOptions opts; opts.source_rank = source_rank; - std::vector tensors = {tensor}; - return self.Broadcast(tensors, opts); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts); }, py::arg("tensor"), py::arg("source_rank"), py::call_guard()) @@ -146,7 +150,9 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroup &self, py::handle py_tensor, int dst) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - std::vector tensors = {tensor}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; return self.Send(tensors, dst); }, py::arg("tensor"), py::arg("dst"), @@ -156,7 +162,9 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroup &self, py::handle py_tensor, int src) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - std::vector tensors = {tensor}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; return self.Recv(tensors, src); }, py::arg("tensor"), py::arg("src"), @@ -167,8 +175,12 @@ void BindDistributed(py::module *m) { 
py::handle py_out_tensor) { auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.AllGather(in_tensors, out_tensors); }, py::arg("in"), py::arg("out"), @@ -179,8 +191,12 @@ void BindDistributed(py::module *m) { py::handle py_out_tensor) { auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.AllToAll(in_tensors, out_tensors); }, py::arg("in"), py::arg("out"), @@ -193,8 +209,10 @@ void BindDistributed(py::module *m) { distributed::ReduceOptions opts; opts.reduce_op = op; opts.root_rank = dst; - std::vector tensors = {in_tensor}; - return self.Reduce(tensors, opts); + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts); }, py::arg("tensor"), py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, @@ -207,8 +225,12 @@ void BindDistributed(py::module *m) { auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); distributed::ScatterOptions opts; opts.root_rank = src; - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.Scatter(in_tensors, out_tensors, opts); }, py::arg("in"), py::arg("out"), py::arg("src"), diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/init_process_group.py index c9c957572c515..17887a9d767c1 100644 --- a/python/paddle/fluid/tests/unittests/init_process_group.py +++ b/python/paddle/fluid/tests/unittests/init_process_group.py @@ -46,6 +46,11 @@ def test_init_process_group(self): group = paddle.distributed.collective.Group(-1, 2, 0, [-1, -2]) ret = paddle.distributed.barrier(group) assert ret == None + paddle.enable_static() + in_tensor = paddle.empty((1, 2)) + in_tensor2 = paddle.empty((1, 2)) + paddle.distributed.broadcast(in_tensor, src=0) + paddle.distributed.all_gather([in_tensor, in_tensor2], in_tensor) print("test ok\n") From c239f15a8779c52840462f586b5f9f392f2bfd10 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Wed, 13 Apr 2022 18:20:54 +0800 Subject: [PATCH 130/211] tensor fluid code transfer part2 (#41096) --- python/paddle/fft.py | 3 +- .../tests/unittests/test_crop_tensor_op.py | 18 +- .../fluid/tests/unittests/test_slice_op.py | 8 +- .../tests/unittests/test_strided_slice_op.py | 8 +- python/paddle/tensor/attribute.py | 129 +- python/paddle/tensor/creation.py | 303 +++- python/paddle/tensor/linalg.py | 54 +- python/paddle/tensor/manipulation.py | 1338 ++++++++++++++++- python/paddle/tensor/random.py | 2 +- python/paddle/tensor/search.py | 2 +- python/paddle/tensor/stat.py | 2 +- 11 
files changed, 1756 insertions(+), 111 deletions(-) diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 975e632558feb..10d637ff8b9ba 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -15,7 +15,8 @@ from typing import Sequence import numpy as np import paddle -from .tensor.attribute import is_complex, is_floating_point, is_integer, _real_to_complex_dtype, _complex_to_real_dtype +from .tensor.attribute import is_complex, is_floating_point, is_integer +from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype from .fluid.framework import _non_static_mode from . import _C_ops from .fluid.data_feeder import check_variable_and_dtype diff --git a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py index a4552c8f5ddbb..04e47bd30ce24 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid @@ -225,31 +226,30 @@ def test_exception(self): offset = fluid.data(name='offset', shape=[1], dtype='int32') def attr_shape_type(): - out = fluid.layers.crop_tensor(input1, shape=3) + out = paddle.crop(input1, shape=3) def attr_shape_dtype(): - out = fluid.layers.crop_tensor(input1, shape=[2, 2.0, 3, 3]) + out = paddle.crop(input1, shape=[2, 2.0, 3, 3]) def attr_shape_value1(): - out = fluid.layers.crop_tensor(input1, shape=[2, -2, dim, 3]) + out = paddle.crop(input1, shape=[2, -2, dim, 3]) def attr_shape_value2(): - out = fluid.layers.crop_tensor(input1, shape=[2, 0, dim, 3]) + out = paddle.crop(input1, shape=[2, 0, dim, 3]) def attr_offsets_type(): - out = fluid.layers.crop_tensor( - input1, shape=[2, 2, 3, 3], offsets=0) + out = paddle.crop(input1, shape=[2, 2, 3, 3], offsets=0) def attr_offsets_dtype(): - out = fluid.layers.crop_tensor( + out = paddle.crop( input1, shape=[2, 2, 3, 3], offsets=[0, 1.0, 0, 0]) def attr_offsets_value(): - out = fluid.layers.crop_tensor( + out = paddle.crop( input1, shape=[2, 2, 3, 3], offsets=[0, -1, offset, 0]) def input_dtype(): - out = fluid.layers.crop_tensor(input2, shape=[2, 2, 3, 3]) + out = paddle.crop(input2, shape=[2, 2, 3, 3]) self.assertRaises(TypeError, attr_shape_type) self.assertRaises(TypeError, attr_shape_dtype) diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index a565bba304184..34f296c4b6354 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -534,13 +534,13 @@ def test_1(self): # value_int64 is greater than 2147483647 which is the max of int32 value_int64 = fluid.layers.fill_constant([1], "int64", 2147483648) - out_1 = fluid.layers.slice( + out_1 = paddle.slice( x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[value_int64, 100, -1]) - out_2 = fluid.layers.slice( + out_2 = paddle.slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1]) - out_3 = fluid.layers.slice( + out_3 = paddle.slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1]) - out_4 = fluid.layers.slice(x, axes=[0, 1, 2], starts=starts, ends=ends) + out_4 = paddle.slice(x, axes=[0, 1, 2], starts=starts, ends=ends) out_5 = x[-3:3, 0:100, 2:-1] out_6 = x[minus_3:3, 0:100, :, 2:-1] diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 
ae17cb9b1b57c..4954cfc97e4e2 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -534,25 +534,25 @@ def test_1(self): shape=[3, 4, 5, 6], append_batch_size=False, dtype="float64") - out_1 = fluid.layers.strided_slice( + out_1 = paddle.strided_slice( x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[3, 100, -1], strides=[1, 1, 1]) - out_2 = fluid.layers.strided_slice( + out_2 = paddle.strided_slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1], strides=[1, 1, 1]) - out_3 = fluid.layers.strided_slice( + out_3 = paddle.strided_slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1], strides=[1, 1, 1]) - out_4 = fluid.layers.strided_slice( + out_4 = paddle.strided_slice( x, axes=[0, 1, 2], starts=starts, ends=ends, strides=strides) out_5 = x[-3:3, 0:100:2, -1:2:-1] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 07db7794b6d98..757b93dd88078 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -14,37 +14,128 @@ from __future__ import print_function -from ..framework import core -from ..fluid.layer_helper import LayerHelper +from ..framework import core, _non_static_mode +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.data_feeder import check_type + +from .creation import assign +from .creation import _complex_to_real_dtype # TODO: define functions to get tensor attributes -from ..fluid.layers import rank # noqa: F401 -from ..fluid.layers import shape # noqa: F401 import paddle from paddle import _C_ops -from paddle.static import Variable +from ..static import Variable from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +import numpy as np + __all__ = [] -def _complex_to_real_dtype(dtype): - if dtype == core.VarDesc.VarType.COMPLEX64: - return core.VarDesc.VarType.FP32 - elif dtype == core.VarDesc.VarType.COMPLEX128: - return core.VarDesc.VarType.FP64 - else: - return dtype +def rank(input): + """ + + The OP returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. + + Args: + input (Tensor): The input N-D tensor with shape of :math:`[N_1, N_2, ..., N_k]`, the data type is arbitrary. + + Returns: + Tensor, the output data type is int32.: The 0-D tensor with the dimensions of the input Tensor. + + Examples: + .. code-block:: python + + import paddle + + input = paddle.rand((3, 100, 100)) + rank = paddle.rank(input) + print(rank) + # 3 + """ + check_type(input, 'input', (Variable), 'input') + ndims = len(input.shape) + out = assign(np.array(ndims, 'int32')) + + return out + + +def shape(input): + """ + :alias_main: paddle.shape + :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape + :old_api: paddle.fluid.layers.shape + + **Shape Layer** + + Get the shape of the input. + + .. code-block:: text + + Case1: + Given N-D Tensor: + input = [ [1, 2, 3, 4], [5, 6, 7, 8] ] + Then: + input.shape = [2, 4] + + Case2: + Given SelectedRows: + input.rows = [0, 4, 19] + input.height = 20 + input.value = [ [1, 2], [3, 4], [5, 6] ] # inner tensor + Then: + input.shape = [3, 2] + + Args: + input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64. + If input variable is type of SelectedRows, returns the shape of it's inner tensor. + + Returns: + Variable (Tensor): The shape of the input variable. + + Examples: + .. 
code-block:: python -def _real_to_complex_dtype(dtype): - if dtype == core.VarDesc.VarType.FP32: - return core.VarDesc.VarType.COMPLEX64 - elif dtype == core.VarDesc.VarType.FP64: - return core.VarDesc.VarType.COMPLEX128 - else: - return dtype + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() + + inputs = fluid.data(name="x", shape=[3, 100, 100], dtype="float32") + output = fluid.layers.shape(inputs) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + img = np.ones((3, 100, 100)).astype(np.float32) + + res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) + print(res) # [array([ 3, 100, 100], dtype=int32)] + """ + if in_dygraph_mode(): + out = _C_ops.final_state_shape(input) + out.stop_gradient = True + return out + if _in_legacy_dygraph(): + out = _C_ops.shape(input) + out.stop_gradient = True + return out + + check_variable_and_dtype(input, 'input', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'shape') + helper = LayerHelper('shape', **locals()) + out = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type='shape', + inputs={'Input': input}, + outputs={'Out': out}, + stop_gradient=True) + + return out def is_complex(x): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 95f145cf447b5..f4f1e7a3d5067 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -14,27 +14,138 @@ from __future__ import print_function import numpy as np +import math from paddle.common_ops_import import fill_constant from ..fluid.layers import utils - -from ..fluid.layers import tensor from ..static import Variable, device_guard from ..framework import _current_expected_place, _get_paddle_place from ..framework import dygraph_only from ..framework import core -from ..fluid.layer_helper import LayerHelper +from ..framework import in_dygraph_mode, _non_static_mode +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder -from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype # TODO: define functions to get create a tensor -from ..fluid.layers import linspace # noqa: F401 import paddle from paddle import _C_ops -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check +from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check +import warnings __all__ = [] +def _complex_to_real_dtype(dtype): + if dtype == core.VarDesc.VarType.COMPLEX64: + return core.VarDesc.VarType.FP32 + elif dtype == core.VarDesc.VarType.COMPLEX128: + return core.VarDesc.VarType.FP64 + else: + return dtype + + +def _real_to_complex_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32: + return core.VarDesc.VarType.COMPLEX64 + elif dtype == core.VarDesc.VarType.FP64: + return core.VarDesc.VarType.COMPLEX128 + else: + return dtype + + +def linspace(start, stop, num, dtype=None, name=None): + r""" + This OP return fixed number of evenly spaced values within a given interval. + + Args: + start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. + stop(int|float|Tensor): The input :attr:`stop` is start variable of range. 
It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. + num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \ + or a Tensor of shape [1] with data type int32. + dtype(np.dtype|str, optional): The data type of output tensor, it could be + int32, int64, float32 and float64. Default: if None, the data type is float32. + name(str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`.Default: None. + + Returns: + Tensor: the output data type will be float32, float64. The 1-D tensor with fixed number of evenly spaced values, \ + the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ + the value with input :attr:`start`. + + Examples: + .. code-block:: python + + import paddle + data = paddle.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] + data = paddle.linspace(0, 10, 1, 'float32') # [0.0] + + """ + if dtype is None: + dtype = 'float32' + tensor_num = num + tensor_start = start + tensor_stop = stop + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'linspace') + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if not isinstance(start, Variable): + with device_guard("cpu"): + tensor_start = fill_constant([1], dtype, start) + if not isinstance(stop, Variable): + with device_guard("cpu"): + tensor_stop = fill_constant([1], dtype, stop) + if not isinstance(num, Variable): + with device_guard("cpu"): + tensor_num = fill_constant([1], 'int32', num) + if _non_static_mode(): + return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', + dtype) + + helper = LayerHelper("linspace", **locals()) + + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + out_dtype = convert_dtype(dtype) + if isinstance(start, Variable): + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'linspace') + else: + check_type(start, 'start', (int, float), 'linspace') + + if isinstance(stop, Variable): + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'linspace') + else: + check_type(stop, 'stop', (int, float), 'linspace') + if isinstance(num, Variable): + check_dtype(num.dtype, 'num', ['int32'], 'linspace') + check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], + 'linspace') + if ((stop_dtype == "float64" or start_dtype == "float64") and + out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or + start_dtype == "int64") and + out_dtype == "int32"): + raise ValueError( + "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of linspace." + .format(start_dtype, stop_dtype, dtype)) + + out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='linspace', + inputs={'Start': tensor_start, + 'Stop': tensor_stop, + 'Num': tensor_num}, + attrs={'dtype': dtype}, + outputs={'Out': [out]}) + if isinstance(num, int): + out.desc.set_shape((num, )) + return out + + @dygraph_only def to_tensor(data, dtype=None, place=None, stop_gradient=True): r""" @@ -60,7 +171,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Tensor: A Tensor constructed from ``data`` . 
Raises: - TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor + TypeError: If the data type of ``data`` is not scalar, list, tuple, np.ndarray, paddle.Tensor ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]] TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. @@ -152,7 +263,7 @@ def _handle_dtype(data, dtype): return data else: raise TypeError( - "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". + "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor". format(type(data))) if not dtype: if data.dtype in [ @@ -439,11 +550,39 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): dtype = 'float32' if num_columns is None: num_columns = num_rows - return paddle.fluid.layers.eye(num_rows=num_rows, - num_columns=num_columns, - batch_shape=None, - dtype=dtype, - name=name) + + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if num_columns is not None: + if not isinstance(num_columns, int) or num_columns < 0: + raise TypeError("num_columns should be a non-negative int") + else: + num_columns = num_rows + + if _non_static_mode(): + out = _C_ops.eye('dtype', dtype, 'num_rows', num_rows, 'num_columns', + num_columns) + + else: + helper = LayerHelper("eye", **locals()) + check_dtype(dtype, 'dtype', + ['float16', 'float32', 'float64', 'int32', 'int64'], 'eye') + if not isinstance(num_rows, int) or num_rows < 0: + raise TypeError("num_rows should be a non-negative int") + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type='eye', + inputs={}, + outputs={'Out': [out]}, + attrs={ + 'num_rows': num_rows, + 'num_columns': num_columns, + 'dtype': dtype + }, + stop_gradient=True) + + out.stop_gradient = True + return out def full(shape, fill_value, dtype=None, name=None): @@ -564,7 +703,53 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): end = start start = 0 - return paddle.fluid.layers.range(start, end, step, dtype, name) + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + if not isinstance(start, Variable): + with device_guard("cpu"): + start = fill_constant([1], dtype, start, force_cpu=True) + elif start.dtype != dtype: + start = paddle.cast(start, dtype) + + if not isinstance(end, Variable): + with device_guard("cpu"): + end = fill_constant([1], dtype, end, force_cpu=True) + elif end.dtype != dtype: + end = paddle.cast(end, dtype) + + if not isinstance(step, Variable): + with device_guard("cpu"): + step = fill_constant([1], dtype, step, force_cpu=True) + elif step.dtype != dtype: + step = paddle.cast(step, dtype) + + if in_dygraph_mode(): + return _C_ops.final_state_arange(start, end, step, dtype, + _current_expected_place()) + + if _in_legacy_dygraph(): + out = _C_ops.range(start, end, step) + out.stop_gradient = True + return out + + out_shape = None + if not isinstance(start, Variable) and not isinstance( + end, Variable) and not isinstance(step, Variable): + out_shape = [int(math.ceil((end - start) / step))] + + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], + 'range/arange') + helper = 
LayerHelper('range', **locals()) + out = helper.create_variable_for_type_inference(dtype, shape=out_shape) + helper.append_op( + type='range', + inputs={'Start': start, + 'End': end, + 'Step': step}, + outputs={'Out': out}) + out.stop_gradient = True + return out def _tril_triu_op(helper): @@ -1187,7 +1372,7 @@ def assign(x, output=None): The OP copies the :attr:`x` to the :attr:`output`. Parameters: - x (Tensor|numpy.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, + x (Tensor|np.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, or scalar. Its data type supports float16, float32, float64, int32, int64, and bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. @@ -1211,9 +1396,91 @@ def assign(x, output=None): result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ - check_type(x, 'x', (Variable, np.ndarray, list, tuple, float, int, bool), - 'assign') - return tensor.assign(x, output) + input = x + helper = LayerHelper('assign', **locals()) + check_type(input, 'input', (Variable, np.ndarray, list, tuple, float, int, + bool), 'assign') + is_inplace = True if output is not None else False + + if np.isscalar(input) and not isinstance(input, str): + input = np.array([input]) + elif isinstance(input, (list, tuple)): + input = np.array(input) + # NOTE(Aurelius84): Why we judge core.VarBase? + # In case of @to_static, a VarBase can be as input of `assign`, + # but _non_static_mode()==False under @to_static, which means + # isinstance(VarBase, Variable) == False. It will cause return None + # after this api. + if isinstance(input, (Variable, core.VarBase)): + if _non_static_mode(): + if output is None: + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + _C_ops.assign(input, output) + else: + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' + ], 'assign', '(When the type of input in assign is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) + helper.append_op( + type='assign', inputs={'X': [input]}, + outputs={'Out': [output]}) + elif isinstance(input, np.ndarray): + # Not support [var, var, ...] currently. + if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input): + raise TypeError( + "Required type(input) numpy.ndarray, but found `list(Variable)` in input." 
+ ) + dtype = convert_np_dtype_to_dtype_(input.dtype) + if dtype == core.VarDesc.VarType.FP64: + # Setting FP64 numpy data is not supported in Paddle, so we + # use FP32 here + warnings.warn( + "paddle.assign doesn't support float64 input now due " + "to current platform protobuf data limitation, we convert " + "it to float32") + dtype = core.VarDesc.VarType.FP32 + if dtype == core.VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [int(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in input.flat] + else: + raise TypeError( + "When the type of 'input' in assign is numpy.ndarray, " + "the data type of 'input' must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + if input.size > 1024 * 1024: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + if output is None: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + + if is_inplace and _non_static_mode(): + output._bump_inplace_version() + + return output def clone(x, name=None): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a00ae8046ed68..4af4ac52209ef 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -13,14 +13,16 @@ # limitations under the License. 
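A minimal usage sketch of the reworked ``paddle.assign`` defined above, showing the two input
paths it dispatches on (Tensor input vs. numpy.ndarray input); the concrete values here are
illustrative assumptions:

    import numpy as np
    import paddle

    t = paddle.to_tensor([[2.5, 2.5], [2.5, 2.5]])
    a = paddle.assign(t)                                   # Tensor input: handled by the 'assign' kernel
    b = paddle.assign(np.array([1, 2, 3], dtype='int32'))  # ndarray input: 'assign_value' op,
                                                           # values stored as 'int32_values'
    c = paddle.assign(np.zeros([2], dtype='float64'))      # warns and is stored as float32
    # ndarrays with more than 1024 * 1024 elements raise a ValueError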
import numpy as np -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..framework import _varbase_creator, _dygraph_tracer, in_dygraph_mode, _non_static_mode from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable from ..fluid.framework import _in_legacy_dygraph from .manipulation import cast +from .math import multiply, add +from .logic import logical_not +from .creation import full -from ..fluid import layers import paddle from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc @@ -2532,11 +2534,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): y = paddle.to_tensor(y, dtype=x.dtype) condition = s > cutoff - cond_int = layers.cast(condition, s.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s.dtype) + cond_not_int = cast(logical_not(condition), s.dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) dims = list(range(len(vt.shape))) @@ -2559,11 +2561,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): y = paddle.to_tensor(y, dtype=s.dtype) condition = s_abs > cutoff - cond_int = layers.cast(condition, s.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s.dtype) + cond_not_int = cast(logical_not(condition), s.dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) out_1 = u * st @@ -2597,17 +2599,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): 'keep_dim': True, 'reduce_all': False}) - rcond = layers.fill_constant(shape=[1], value=rcond, dtype=dtype) + rcond = full(shape=[1], fill_value=rcond, dtype=dtype) cutoff = rcond * max_singular_val y = float('inf') - y = layers.fill_constant(shape=[1], value=y, dtype=dtype) + y = full(shape=[1], fill_value=y, dtype=dtype) condition = s > cutoff - cond_int = layers.cast(condition, dtype) - cond_not_int = layers.cast(layers.logical_not(condition), dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, dtype) + cond_not_int = cast(logical_not(condition), dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st = helper.create_variable_for_type_inference(dtype=dtype) st_shape = helper.create_variable_for_type_inference(dtype=dtype) @@ -2682,17 +2684,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): 'keep_dim': True, 'reduce_all': False}) - rcond = layers.fill_constant(shape=[1], value=rcond, dtype=s_type) + rcond = full(shape=[1], fill_value=rcond, dtype=s_type) cutoff = rcond * max_singular_val y = float('inf') - y = layers.fill_constant(shape=[1], value=y, dtype=s_type) + y = full(shape=[1], fill_value=y, dtype=s_type) condition = s_abs > cutoff - cond_int = layers.cast(condition, s_type) - cond_not_int = layers.cast(layers.logical_not(condition), s_type) - out1 = layers.elementwise_mul(1 / s, 
cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s_type) + cond_not_int = cast(logical_not(condition), s_type) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st = helper.create_variable_for_type_inference(dtype=s_type) st_shape = helper.create_variable_for_type_inference(dtype=s_type) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 3a79abd2dc06e..b0e0082c6d9c4 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -16,32 +16,723 @@ from collections import Counter from ..static import Variable, device_guard -from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check, _non_static_mode -from ..fluid.layer_helper import LayerHelper +from ..framework import core, in_dygraph_mode +from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _non_static_mode +from ..framework import LayerHelper from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..fluid.layers import utils import numpy as np # TODO: define functions to manipulate a tensor -from ..fluid.layers import cast # noqa: F401 -from ..fluid.layers import slice # noqa: F401 -from ..fluid.layers import transpose # noqa: F401 -from ..fluid.layers import unstack # noqa: F401 - -from ..fluid.layers import scatter_nd # noqa: F401 -from ..fluid.layers import shard_index # noqa: F401 -from ..fluid.layers import crop_tensor as crop # noqa: F401 from ..fluid.layers.nn import _elementwise_op_in_dygraph -from ..fluid import layers from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only import paddle from paddle import _C_ops -from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype +from ..common_ops_import import dygraph_utils, fill_constant, _varbase_creator +import warnings +from .creation import zeros +from .creation import _complex_to_real_dtype +from .creation import _real_to_complex_dtype __all__ = [] +def cast(x, dtype): + """ + + This OP takes in the Tensor :attr:`x` with :attr:`x.dtype` and casts it + to the output with :attr:`dtype`. It's meaningless if the output dtype + equals the input dtype, but it's fine if you do so. + + Args: + x(Tensor): An input N-D Tensor with data type bool, float16, + float32, float64, int32, int64, uint8. + dtype(np.dtype|str): Data type of the output: + bool, float16, float32, float64, int8, int32, int64, uint8. + + Returns: + Tensor: A Tensor with the same shape as input's. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.cast(x, 'uint8') + """ + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + return _C_ops.final_state_cast(x, dtype) + + if _non_static_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + out = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return out + + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', + 'uint8', 'uint16' + ], 'cast') + check_dtype(dtype, 'dtype', [ + 'bool', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', + 'int64', 'uint8', 'uint16' + ], 'cast') + + helper = LayerHelper('cast', **locals()) + out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=x.stop_gradient) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_dtype': x.dtype, + 'out_dtype': out.dtype}) + return out + + +def slice(input, axes, starts, ends): + """ + This operator produces a slice of ``input`` along multiple axes. Similar to numpy: + https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html + Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and + end dimension for each axis in the list of axes and Slice uses this information + to slice the input data tensor. If a negative value is passed to + ``starts`` or ``ends`` such as :math:`-i`, it represents the reverse position of the + axis :math:`i-1` (here 0 is the initial position). + If the value passed to ``starts`` or ``ends`` is greater than n + (the number of elements in this dimension), it represents n. + For slicing to the end of a dimension with unknown size, it is recommended + to pass in INT_MAX. The size of ``axes`` must be equal to ``starts`` and ``ends``. + Following examples will explain how slice works: + + .. code-block:: text + + Case1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + Then: + result = [ [5, 6, 7], ] + + Case2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [0, 1] + ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. + Then: + result = [ [2, 3, 4], ] # result = data[0:1, 1:4] + + Args: + input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. + axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . + starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. + It represents starting indices of corresponding axis in ``axes``. + ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor . + It represents ending indices of corresponding axis in ``axes``. + + Returns: + Tensor: A ``Tensor``. The data type is same as ``input``. + + Raises: + TypeError: The type of ``starts`` must be list, tuple or Tensor. + TypeError: The type of ``ends`` must be list, tuple or Tensor. + + Examples: + .. code-block:: python + + import paddle + + input = paddle.rand(shape=[4, 5, 6], dtype='float32') + # example 1: + # attr starts is a list which doesn't contain tensor. 
+ axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + sliced_1 = paddle.slice(input, axes=axes, starts=starts, ends=ends) + # sliced_1 is input[0:3, 0:2, 2:4]. + + # example 2: + # attr starts is a list which contain tensor. + minus_3 = paddle.full([1], -3, "int32") + sliced_2 = paddle.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) + # sliced_2 is input[0:3, 0:2, 2:4]. + """ + if in_dygraph_mode(): + attrs = () + starts_tensor = None + ends_tensor = None + + if isinstance(axes, (list, tuple)): + axes = list(axes) + if len(axes) == 0: + raise ValueError( + "Input axes should not be an empty list/tuple.") + for i in range(len(axes)): + if axes[i] < 0: + axes[i] = max(0, axes[i] + len(input.shape)) + else: + axes[i] = min(len(input.shape) - 1, axes[i]) + + else: + raise ValueError( + "Input axes must be a python list or tuple, but reveived {}". + format(type(axes))) + + infer_flags = list(1 for i in range(len(axes))) + + tmp_tensor_type = core.eager.Tensor + + if isinstance(starts, (list, tuple)): + starts = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in starts + ] + attrs += ('starts', starts) + elif isinstance(starts, tmp_tensor_type): + starts_tensor = starts + starts.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + if isinstance(ends, (list, tuple)): + ends = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item for item in ends + ] + attrs += ('ends', ends) + elif isinstance(ends, tmp_tensor_type): + ends_tensor = ends + ends_tensor.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, + 'axes', axes, 'infer_flags', infer_flags, *attrs) + else: + if _in_legacy_dygraph(): + attrs = () + starts_tensor = None + ends_tensor = None + + if isinstance(axes, (list, tuple)): + axes = list(axes) + if len(axes) == 0: + raise ValueError( + "Input axes should not be an empty list/tuple.") + for i in range(len(axes)): + if axes[i] < 0: + axes[i] = max(0, axes[i] + len(input.shape)) + else: + axes[i] = min(len(input.shape) - 1, axes[i]) + + else: + raise ValueError( + "Input axes must be a python list or tuple, but reveived {}". 
+ format(type(axes))) + + infer_flags = list(1 for i in range(len(axes))) + + tmp_tensor_type = Variable + + if isinstance(starts, (list, tuple)): + starts = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in starts + ] + attrs += ('starts', starts) + elif isinstance(starts, tmp_tensor_type): + starts_tensor = starts + starts.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + if isinstance(ends, (list, tuple)): + ends = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in ends + ] + attrs += ('ends', ends) + elif isinstance(ends, tmp_tensor_type): + ends_tensor = ends + ends_tensor.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, + 'axes', axes, 'infer_flags', infer_flags, + *attrs) + + if not isinstance(starts, (list, tuple, Variable)): + raise ValueError( + "Input starts must be an Variable, python list or tuple.") + if not isinstance(ends, (list, tuple, Variable)): + raise ValueError( + "Input ends must be an Variable, python list or tuple.") + + helper = LayerHelper('slice', **locals()) + + inputs = {'Input': input} + attrs = {'axes': axes} + infer_flags = list(1 for i in range(len(axes))) + + # starts + if isinstance(starts, Variable): + starts.stop_gradient = True + inputs['StartsTensor'] = starts + infer_flags = list(-1 for i in range(len(axes))) + elif isinstance(starts, (list, tuple)): + attrs['starts'] = [] + if utils._contain_var(starts): + inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) + for i, dim in enumerate(starts): + if isinstance(dim, Variable): + attrs['starts'].append(-1) + infer_flags[i] = -1 + else: + attrs['starts'].append(dim) + else: + attrs['starts'] = starts + + # ends + if isinstance(ends, Variable): + ends.stop_gradient = True + inputs['EndsTensor'] = ends + infer_flags = list(-1 for i in range(len(axes))) + elif isinstance(ends, (list, tuple)): + attrs['ends'] = [] + if utils._contain_var(ends): + inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) + for i, dim in enumerate(ends): + if isinstance(dim, Variable): + attrs['ends'].append(-1) + infer_flags[i] = -1 + else: + attrs['ends'].append(dim) + else: + attrs['ends'] = ends + + # infer_flags + attrs['infer_flags'] = infer_flags + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out}) + + return out + + +def transpose(x, perm, name=None): + """ + Permute the data dimensions of `input` according to `perm`. + + The `i`-th dimension of the returned tensor will correspond to the + perm[i]-th dimension of `input`. + + Args: + x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. + perm (list|tuple): Permute the input according to the data of perm. + name (str): The name of this layer. It is optional. + + Returns: + Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. + + For Example: + + .. 
code-block:: text + + x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] + [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] + shape(x) = [2,3,4] + + # Example 1 + perm0 = [1,0,2] + y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] + [[ 5 6 7 8] [17 18 19 20]] + [[ 9 10 11 12] [21 22 23 24]]] + shape(y_perm0) = [3,2,4] + + # Example 2 + perm1 = [2,1,0] + y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] + [[ 2 14] [ 6 18] [10 22]] + [[ 3 15] [ 7 19] [11 23]] + [[ 4 16] [ 8 20] [12 24]]] + shape(y_perm1) = [4,3,2] + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.randn([2, 3, 4]) + x_transposed = paddle.transpose(x, perm=[1, 0, 2]) + print(x_transposed.shape) + # [3L, 2L, 4L] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_transpose(x, perm) + else: + if _in_legacy_dygraph(): + out, _ = _C_ops.transpose2(x, 'axis', perm) + return out + + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'transpose') + check_type(perm, 'perm', (list, tuple), 'transpose') + if isinstance(perm, tuple): + perm = list(perm) + if len(perm) != len(x.shape): + raise ValueError( + "Input(perm) is the permutation of dimensions of Input(x), " + "its length should be equal to dimensions of Input(x), " + "but received dimension of Input(x) is %s, " + "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + for idx, dim in enumerate(perm): + if dim >= len(x.shape): + raise ValueError( + "Each element in Input(perm) should be less than Input(x)'s dimension, " + "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " + "dimension %d." % (idx, perm[idx], len(x.shape))) + + helper = LayerHelper('transpose', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + x_shape = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], + 'XShape': [x_shape]}, + attrs={'axis': perm}) + return out + + +def unstack(x, axis=0, num=None): + """ + :alias_main: paddle.unstack + :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack + :old_api: paddle.fluid.layers.unstack + + **UnStack Layer** + + This layer unstacks input Tensor :code:`x` into several Tensors along :code:`axis`. + + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. + If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, + and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is + raised. + + Args: + x (Tensor): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64. + axis (int): The axis along which the input is unstacked. + num (int|None): The number of output variables. + + Returns: + list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64. + + Raises: + ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D). + + Examples: + .. 
code-block:: python + + import paddle + x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] + y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] + + """ + if _non_static_mode(): + if num == None: + num = x.shape[axis] + if num == 0: + return [] + return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) + + helper = LayerHelper('unstack', **locals()) + if num is None: + if axis is None or x.shape[axis] <= 0: + raise ValueError('unknown unstack number') + else: + num = x.shape[axis] + + outs = [] + for _ in range(num): + outs.append(helper.create_variable_for_type_inference(x.dtype)) + + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, + 'num': num}) + return outs + + +def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): + """ + Reset the values of `input` according to the shard it beloning to. + Every value in `input` must be a non-negative integer, and + the parameter `index_num` represents the integer above the maximum + value of `input`. Thus, all values in `input` must be in the range + [0, index_num) and each value can be regarded as the offset to the beginning + of the range. The range is further split into multiple shards. Specifically, + we first compute the `shard_size` according to the following formula, + which represents the number of integers each shard can hold. So for the + i'th shard, it can hold values in the range [i*shard_size, (i+1)*shard_size). + :: + + shard_size = (index_num + nshards - 1) // nshards + + For each value `v` in `input`, we reset it to a new value according to the + following formula: + :: + + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value + + That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` + if it in the range. Otherwise, we reset it to be `ignore_value`. + + Args: + input (Tensor): Input tensor with data type int64 or int32. It's last dimension must be 1. + index_num (int): An integer represents the integer above the maximum value of `input`. + nshards (int): The number of shards. + shard_id (int): The index of the current shard. + ignore_value (int): An integer value out of sharded index range. + + Returns: + Tensor. + + Examples: + .. code-block:: python + + import paddle + label = paddle.to_tensor([[16], [1]], "int64") + shard_label = paddle.shard_index(input=label, + index_num=20, + nshards=2, + shard_id=0) + print(shard_label) + # [[-1], [1]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_shard_index(input, index_num, nshards, + shard_id, ignore_value) + + check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') + op_type = 'shard_index' + helper = LayerHelper(op_type, **locals()) + if shard_id < 0 or shard_id >= nshards: + raise ValueError('The shard_id(%d) should be in [0, %d)' % + (shard_id, nshards)) + + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type=op_type, + inputs={'X': [input]}, + outputs={'Out': out}, + attrs={ + 'index_num': index_num, + 'nshards': nshards, + 'shard_id': shard_id, + 'ignore_value': ignore_value + }, + stop_gradient=True) + return out + + +def crop(x, shape=None, offsets=None, name=None): + """ + Crop input into output, as specified by offsets and shape. + + .. 
code-block:: text + + * Case 1 (input is a 2-D Tensor): + Input: + X.shape = [3, 5] + X.data = [[0, 1, 2, 0, 0], + [0, 3, 4, 0, 0], + [0, 0, 0, 0, 0]] + Parameters: + shape = [2, 2] + offsets = [0, 1] + Output: + Out.shape = [2, 2] + Out.data = [[1, 2], + [3, 4]] + * Case 2 (input is a 3-D Tensor): + Input: + X.shape = [2, 3, 4] + X.data = [[[0, 1, 2, 3], + [0, 5, 6, 7], + [0, 0, 0, 0]], + [[0, 3, 4, 5], + [0, 6, 7, 8], + [0, 0, 0, 0]]] + Parameters: + shape = [2, 2, -1] + offsets = [0, 0, 1] + Output: + Out.shape = [2, 2, 3] + Out.data = [[[1, 2, 3], + [5, 6, 7]], + [[3, 4, 5], + [6, 7, 8]]] + + Parameters: + x (Tensor): 1-D to 6-D Tensor, the data type is float32, float64, int32 or int64. + shape (list|tuple|Tensor): The output shape is specified + by `shape`. Its data type is int32. If a list/tuple, it's length must be + the same as the dimension size of `x`. If a Tensor, it should be a 1-D Tensor. + When it is a list, each element can be an integer or a Tensor of shape: [1]. + If Variable contained, it is suitable for the case that the shape may + be changed each iteration. + offsets (list|tuple|Variable, optional): Specifies the cropping + offsets at each dimension. Its data type is int32. If a list/tuple, it's length + must be the same as the dimension size of `x`. If a Tensor, it should be a 1-D + Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. + If Variable contained, it is suitable for the case that the offsets may be changed + each iteration. Default: None, the offsets are 0 at each dimension. + name(str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: The cropped Tensor has same data type with `x`. + + Examples: + + .. code-block:: python + :name: code-example1 + + import paddle + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + # x.shape = [3, 3] + # x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + + # shape can be a 1-D Tensor or list or tuple. + shape = paddle.to_tensor([2, 2], dtype='int32') + # shape = [2, 2] + # shape = (2, 2) + out = paddle.crop(x, shape) + # out.shape = [2, 2] + # out = [[1,2], [4,5]] + + # offsets can be a 1-D Tensor or list or tuple. + offsets = paddle.to_tensor([0, 1], dtype='int32') + # offsets = [1, 0] + # offsets = (1, 1) + out = paddle.crop(x, shape, offsets) + # out.shape = [2, 2] + # if offsets = [0, 0], out = [[1,2], [4,5]] + # if offsets = [0, 1], out = [[2,3], [5,6]] + # if offsets = [1, 0], out = [[4,5], [7,8]] + # if offsets = [1, 1], out = [[5,6], [8,9]] + + """ + helper = LayerHelper('crop_tensor', **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'crop_tensor') + check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor') + check_type(offsets, 'offsets', (list, tuple, Variable, type(None)), + 'crop_tensor') + + if offsets is None: + offsets = [0] * len(x.shape) + + out = helper.create_variable_for_type_inference(x.dtype) + ipts = {'X': x} + attrs = {} + + def _attr_shape_check(shape_val): + if not isinstance(shape_val, int): + raise TypeError( + "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s." + % type(shape_val)) + if shape_val == 0: + raise ValueError( + "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s." + % str(shape_val)) + if shape_val < -1: + raise ValueError( + "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s." 
+ % str(shape_val)) + + def _attr_offsets_check(offset_val): + if not isinstance(offset_val, int): + raise TypeError( + "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s." + % type(offset_val)) + if offset_val < 0: + raise ValueError( + "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s." + % str(offset_val)) + + if isinstance(offsets, Variable): + offsets.stop_gradient = True + ipts['Offsets'] = offsets + attrs['offsets'] = [-1] * len(x.shape) + elif utils._contain_var(offsets): + new_offsets_tensor = [] + offsets_attr = [] + for dim in offsets: + if isinstance(dim, Variable): + dim.stop_gradient = True + new_offsets_tensor.append(dim) + offsets_attr.append(-1) + else: + _attr_offsets_check(dim) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) + new_offsets_tensor.append(temp_out) + offsets_attr.append(dim) + ipts['OffsetsTensor'] = new_offsets_tensor + attrs['offsets'] = offsets_attr + else: + for offset in offsets: + _attr_offsets_check(offset) + attrs['offsets'] = offsets + + if isinstance(shape, Variable): + shape.stop_gradient = True + ipts['Shape'] = shape + elif utils._contain_var(shape): + new_shape_tensor = [] + shape_attr = [] + for dim_size in shape: + if isinstance(dim_size, Variable): + dim_size.stop_gradient = True + new_shape_tensor.append(dim_size) + shape_attr.append(0) + else: + _attr_shape_check(dim_size) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out) + new_shape_tensor.append(temp_out) + shape_attr.append(dim_size) + ipts['ShapeTensor'] = new_shape_tensor + attrs['shape'] = shape_attr + else: + for dim_size in shape: + _attr_shape_check(dim_size) + attrs['shape'] = shape + + helper.append_op( + type='crop_tensor', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs) + return out + + @dygraph_only def fill_(x, value): """ @@ -328,7 +1019,74 @@ def concat(x, axis=0, name=None): # [11 12 13] # [14 15 16]] """ - return paddle.fluid.layers.concat(input=x, axis=axis, name=name) + input = x + if in_dygraph_mode(): + if isinstance(axis, Variable): + axis = axis.numpy() + axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] + return _C_ops.final_state_concat(input, axis) + + if _in_legacy_dygraph(): + if isinstance(axis, Variable): + axis = axis.numpy() + axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] + out = _varbase_creator() + _C_ops.concat(input, out, 'axis', axis) + return out + + check_type(input, 'input', (list, tuple, Variable), 'concat') + if not isinstance(input, Variable): + for id, x in enumerate(input): + check_variable_and_dtype( + x, 'input[' + str(id) + ']', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'concat') + if x.dtype != input[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + else: + input = [input] + check_type(axis, 'axis', (int, Variable), 'concat') + + if isinstance(axis, Variable): + check_dtype( + axis.dtype, 'axis', ['int32', 'int64'], 'concat', + "The data type of axis must be int32 or int64 when axis is a Tensor") + + helper = LayerHelper('concat', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + + if input[0].desc.type() == 
core.VarDesc.VarType.LOD_TENSOR_ARRAY: + # NOTE(liym27): Don't remove this if branch! + # This feature is supported for Dynamic-to-Static, because after transformed, the type of inputs[0] + # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static mode. + + assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \ + "number of the elements must be 1, but received %s." % len(input) + out_index = helper.create_variable_for_type_inference(dtype="int32") + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': input[0]}, + outputs={'Out': [out], + 'OutIndex': [out_index]}, + attrs={'axis': axis, + 'use_stack': False}) + else: + inputs = {'X': input} + attrs = {} + if isinstance(axis, Variable): + axis.stop_gradient = True + inputs['AxisTensor'] = axis + else: + attrs['axis'] = axis + + helper.append_op( + type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out def broadcast_tensors(input, name=None): @@ -900,7 +1658,53 @@ def stack(x, axis=0, name=None): # [3., 4.], # [5., 6.]]] """ - return layers.stack(x, axis, name) + axis = 0 if axis is None else axis + + if in_dygraph_mode(): + return _C_ops.final_state_stack(x, axis) + + if _in_legacy_dygraph(): + return _C_ops.stack(x, 'axis', axis) + + if not isinstance(x, list) and not isinstance(x, tuple): + # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. + # In that case, Variable is array of tensors indeed. + if isinstance(x, Variable) and x.desc.type( + ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + x = [x] + else: + raise TypeError("The type of '%s' in %s must be %s, but received %s" + % ('x', 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x))) + + helper = LayerHelper('stack', **locals()) + + out = helper.create_variable_for_type_inference(x[0].dtype) + if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ + "number of the elements must be 1, but received %s." 
% len(x) + out_index = helper.create_variable_for_type_inference(dtype="int32") + + for i in x: + check_variable_and_dtype(i, 'x', \ + ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack') + + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': x[0]}, + outputs={'Out': [out], + 'OutIndex': [out_index]}, + attrs={'axis': axis, + 'use_stack': True}) + else: + helper.append_op( + type='stack', + inputs={'X': x}, + outputs={'Y': out}, + attrs={'axis': axis}) + + return out def split(x, num_or_sections, axis=0, name=None): @@ -951,8 +1755,110 @@ def split(x, num_or_sections, axis=0, name=None): print(out1.shape) # [3, 3, 5] print(out2.shape) # [3, 3, 5] """ - return paddle.fluid.layers.split( - input=x, num_or_sections=num_or_sections, dim=axis, name=name) + input = x + dim = axis + if _non_static_mode(): + num = None + attrs = () + + if isinstance(dim, Variable): + dim = dim.numpy() + dim = dim.item(0) + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" + dim = (len(input.shape) + dim) if dim < 0 else dim + attrs += ('axis', dim) + + if isinstance(num_or_sections, int): + num = num_or_sections + attrs += ('num', num_or_sections) + elif isinstance(num_or_sections, (list, tuple)): + num = len(num_or_sections) + if utils._contain_var(num_or_sections): + for index, item in enumerate(num_or_sections): + if isinstance(item, Variable): + num_or_sections[index] = num_or_sections[index].numpy()[ + 0] + attrs += ('sections', list(num_or_sections)) + else: + attrs += ('sections', list(num_or_sections)) + else: + raise TypeError( + "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " + "received %s." % (type(num_or_sections))) + out = [_varbase_creator() for n in range(num)] + _C_ops.split(input, out, *attrs) + return out + + check_variable_and_dtype( + input, 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split') + check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split') + check_type(dim, 'dim', (int, Variable), 'split') + if isinstance(dim, Variable): + check_dtype(dim.dtype, 'dim', ['int32', 'int64'], 'split') + + helper = LayerHelper('split', **locals()) + + input_shape = input.shape + inputs = {'X': input} + attrs = {'num': num_or_sections if isinstance(num_or_sections, int) else 0} + + def _get_SectionsTensorList(one_list): + tensor_list = [] + unk_dim_idx = -1 + for idx, dim_size in enumerate(one_list): + if isinstance(dim_size, Variable): + dim_size.stop_gradient = True + tensor_list.append(dim_size) + else: + assert (isinstance(dim_size, int)) + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one value of 'num_or_section' in split can " + "be -1. But received num_or_section[%d] is also -1." % + idx) + unk_dim_idx = idx + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out) + tensor_list.append(temp_out) + return tensor_list + + if isinstance(dim, Variable): + dim.stop_gradient = True + inputs['AxisTensor'] = dim + else: + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" + dim = (len(input_shape) + dim) if dim < 0 else dim + attrs['axis'] = dim + + if isinstance(num_or_sections, int): + assert num_or_sections > 1, 'num_or_sections must be more than 1.' + if isinstance(dim, int) and input_shape[dim] > 0: + assert input_shape[dim] % num_or_sections ==0, \ + "The input's size along the split dimension " \ + "must be evenly divisible by Attr(num_or_sections). 
" \ + "But %d is not evenly divisible by %d. " % (num_or_sections,input_shape[dim]) + num = num_or_sections + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert len(num_or_sections) <= input_shape[ + dim], 'len(num_or_sections) must not be more than input.shape[dim].' + num = len(num_or_sections) + attrs['sections'] = list( + map(lambda ele: -1 if isinstance(ele, Variable) else ele, + num_or_sections)) + if utils._contain_var(num_or_sections): + inputs['SectionsTensorList'] = _get_SectionsTensorList( + num_or_sections) + + outs = [ + helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + for i in range(num) + ] + helper.append_op( + type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs) + return outs def squeeze(x, axis=None, name=None): @@ -1035,7 +1941,30 @@ def squeeze(x, axis=None, name=None): elif isinstance(axis, tuple): axis = list(axis) - return layers.squeeze(x, axis, name) + input = x + axes = axis + if in_dygraph_mode(): + return _C_ops.final_state_squeeze(input, axes)[1] + if _in_legacy_dygraph(): + out, _ = _C_ops.squeeze2(input, 'axes', axes) + return out + + helper = LayerHelper("squeeze", **locals()) + check_variable_and_dtype(input, 'input', [ + 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', + 'complex64', 'complex128' + ], 'squeeze') + check_type(axes, 'axis/axes', (list, tuple), 'squeeze') + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out, + "XShape": x_shape}) + + return out @inplace_apis_in_dygraph_only @@ -1335,8 +2264,61 @@ def unsqueeze(x, axis, name=None): print(out3[0, 0, 0, 0, 0]) # [10.] 
""" + input = x + axes = axis + if _non_static_mode(): + if isinstance(axes, int): + axes = [axes] + elif isinstance(axes, Variable): + axes = axes.numpy().tolist() + elif isinstance(axes, (list, tuple)): + axes = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in axes + ] + if _in_legacy_dygraph(): + out, _ = _C_ops.unsqueeze2(input, 'axes', axes) + return out + return _C_ops.final_state_unsqueeze(input, axes)[1] + + check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') + check_variable_and_dtype(input, 'input', [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], 'unsqueeze') + helper = LayerHelper("unsqueeze2", **locals()) + inputs = {"X": input} + attrs = {} + + if isinstance(axes, int): + axes = [axes] + if isinstance(axes, Variable): + axes.stop_gradient = True + inputs["AxesTensor"] = axes + elif isinstance(axes, (list, tuple)): + if utils._contain_var(axes): + inputs["AxesTensorList"] = utils._convert_to_tensor_list(axes) + else: + attrs["axes"] = axes + + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type="unsqueeze2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, + "XShape": x_shape}) - return layers.unsqueeze(x, axis, name) + return out @inplace_apis_in_dygraph_only @@ -1680,7 +2662,70 @@ def scatter_nd_add(x, index, updates, name=None): index = paddle.to_tensor(index_data) output = paddle.scatter_nd_add(x, index, updates) """ - return layers.scatter_nd_add(x, index, updates, name=None) + if in_dygraph_mode(): + op = getattr(_C_ops, 'scatter_nd_add') + return op(x, index, updates) + else: + if _in_legacy_dygraph(): + op = getattr(_C_ops, 'scatter_nd_add') + return op(x, index, updates) + else: + if x.dtype != updates.dtype: + raise ValueError("x and updates must have same data type.") + + helper = LayerHelper('scatter_nd_add', **locals()) + dtype = helper.input_dtype(input_param_name='x') + output = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="scatter_nd_add", + inputs={"X": x, + "Index": index, + "Updates": updates}, + outputs={"Out": output}) + return output + + +def scatter_nd(index, updates, shape, name=None): + """ + **Scatter_nd Layer** + + Output is obtained by scattering the :attr:`updates` in a new tensor according + to :attr:`index` . This op is similar to :code:`scatter_nd_add`, except the + tensor of :attr:`shape` is zero-initialized. Correspondingly, :code:`scatter_nd(index, updates, shape)` + is equal to :code:`scatter_nd_add(paddle.zeros(shape, updates.dtype), index, updates)` . + If :attr:`index` has repeated elements, then the corresponding updates are accumulated. + Because of the numerical approximation issues, the different order of repeated elements + in :attr:`index` may cause different results. The specific calculation method can be + seen :code:`scatter_nd_add` . This op is the inverse of the :code:`gather_nd` op. + + Args: + index (Tensor): The index input with ndim > 1 and index.shape[-1] <= len(shape). + Its dtype should be int32 or int64 as it is used as indexes. + updates (Tensor): The updated value of scatter_nd op. Its dtype should be float32, float64. + It must have the shape index.shape[:-1] + shape[index.shape[-1]:] + shape(tuple|list): Shape of output tensor. + name (str|None): The output Tensor name. If set None, the layer will be named automatically. 
+ + Returns: + output (Tensor): The output is a tensor with the same type as :attr:`updates` . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + index_data = np.array([[1, 1], + [0, 1], + [1, 3]]).astype(np.int64) + index = paddle.to_tensor(index_data) + updates = paddle.rand(shape=[3, 9, 10], dtype='float32') + shape = [3, 5, 9, 10] + + output = paddle.scatter_nd(index, updates, shape) + """ + return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name) def chunk(x, chunks, axis=0, name=None): @@ -1722,8 +2767,7 @@ def chunk(x, chunks, axis=0, name=None): # out2.shape [3, 3, 5] """ check_type(chunks, 'chunks', (int), 'chunk') - return paddle.fluid.layers.split( - input=x, num_or_sections=chunks, dim=axis, name=name) + return split(x, num_or_sections=chunks, axis=axis, name=name) def tile(x, repeat_times, name=None): @@ -2136,7 +3180,124 @@ def reshape(x, shape, name=None): # the value is [10.] """ - return paddle.fluid.layers.reshape(x=x, shape=shape, name=name) + actual_shape = None + act = None + inplace = False + + if in_dygraph_mode(): + tmp_tensor_type = core.eager.Tensor + #TODO(zhiqiu): enable inplace in dygraph mode. + if inplace: + warnings.warn( + "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." + ) + if isinstance(shape, (list, tuple)): + shape = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in shape + ] + out, _ = _C_ops.reshape2(x, None, 'shape', shape) + elif isinstance(shape, tmp_tensor_type): + shape.stop_gradient = True + out, _ = _C_ops.reshape2(x, shape) + else: + raise ValueError( + "shape must be an instance of `list`, `tuple` or `Variable`," + " got '{}.'".format(type(shape))) + + return dygraph_utils._append_activation_in_dygraph(out, act) + else: + if _in_legacy_dygraph(): + tmp_tensor_type = Variable + if inplace: + warnings.warn( + "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." + ) + if isinstance(shape, (list, tuple)): + shape = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in shape + ] + out, _ = _C_ops.reshape2(x, None, 'shape', shape) + elif isinstance(shape, tmp_tensor_type): + shape.stop_gradient = True + out, _ = _C_ops.reshape2(x, shape) + else: + raise ValueError( + "shape must be an instance of `list`, `tuple` or `Variable`," + " got '{}.'".format(type(shape))) + + return dygraph_utils._append_activation_in_dygraph(out, act) + + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool', + 'uint16' + ], 'reshape') + check_type(shape, 'shape', (list, tuple, Variable), 'reshape') + check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') + + helper = LayerHelper("reshape2", **locals()) + + def get_attr_shape(list_shape): + unk_dim_idx = -1 + attrs_shape = [] + for dim_idx, dim_size in enumerate(list_shape): + if isinstance(dim_size, Variable): + attrs_shape.append(-1) + else: + attrs_shape.append(dim_size) + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension value of 'shape' in reshape can " + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." 
+ % dim_idx) + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape[%d] = 0, X's dimensions = %d." % + (dim_idx, len(x.shape))) + else: + assert dim_size > 0, ( + "Each dimension value of 'shape' in reshape must not " + "be negative except one unknown dimension. " + "But received shape[%d] = %s." % + (dim_idx, str(dim_size))) + return attrs_shape + + inputs = {"X": x} + attrs = {} + if isinstance(shape, Variable): + shape.stop_gradient = True + inputs["Shape"] = shape + elif isinstance(shape, (list, tuple)): + assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, " + "but received %s." % len(shape)) + attrs["shape"] = get_attr_shape(shape) + if utils._contain_var(shape): + inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) + elif isinstance(actual_shape, Variable): + actual_shape.stop_gradient = True + inputs["Shape"] = actual_shape + + out = x if inplace else helper.create_variable_for_type_inference( + dtype=x.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="reshape2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, + "XShape": x_shape}) + + return helper.append_activation(out) @inplace_apis_in_dygraph_only @@ -2231,8 +3392,24 @@ def gather_nd(x, index, name=None): output = paddle.gather_nd(x, index) #[[3, 4]] """ - - return paddle.fluid.layers.gather_nd(input=x, index=index, name=name) + if in_dygraph_mode(): + return _C_ops.final_state_gather_nd(x, index) + else: + if _in_legacy_dygraph(): + return _C_ops.gather_nd(x, index) + check_variable_and_dtype( + x, 'x', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], + 'gather_np') + check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') + helper = LayerHelper('gather_nd', **locals()) + dtype = helper.input_dtype() + output = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="gather_nd", + inputs={"X": x, + "Index": index}, + outputs={"Out": output}) + return output def strided_slice(x, axes, starts, ends, strides, name=None): @@ -2318,8 +3495,115 @@ def strided_slice(x, axes, starts, ends, strides, name=None): # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. 
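            # example 3 (an additional illustrative case, reusing the same x):
            # a stride greater than 1 given as a plain Python int
            sliced_3 = paddle.strided_slice(x, axes=[0], starts=[0], ends=[3], strides=[2])
            # sliced_3 is x[0:3:2], i.e. every other index in [0, 3) along axis 0.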
""" - return paddle.fluid.layers.strided_slice( - input=x, axes=axes, starts=starts, ends=ends, strides=strides) + helper = LayerHelper('strided_slice', **locals()) + + check_variable_and_dtype(x, 'x', + ['bool', 'float32', 'float64', 'int32', 'int64'], + 'strided_slice') + check_type(axes, 'axes', (list, tuple), 'strided_slice') + check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') + check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice') + check_type(strides, 'strides', (list, tuple, Variable), 'strided_slice') + + def check_list_elements_dtype(list_input, input_name): + if isinstance(list_input, Variable): + check_dtype(list_input.dtype, input_name, ['int32'], + 'strided_slice') + else: + for i, var in enumerate(list_input): + var_name = input_name + '[' + str(i) + ']' + if isinstance(var, Variable): + check_dtype(var.dtype, var_name, ['int32'], 'strided_slice') + + check_list_elements_dtype(axes, 'axes') + check_list_elements_dtype(starts, 'starts') + check_list_elements_dtype(ends, 'ends') + check_list_elements_dtype(strides, 'strides') + + def get_new_list_tensor(old_list): + new_list_tensor = [] + for dim in old_list: + if isinstance(dim, Variable): + dim.stop_gradient = True + new_list_tensor.append(dim) + else: + assert (isinstance(dim, int)) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) + new_list_tensor.append(temp_out) + return new_list_tensor + + inputs = {'Input': x} + attrs = {'axes': axes} + infer_flags = list(1 for i in range(len(axes))) + + if _non_static_mode(): + inputs = {'Input': x} + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'strides': strides, + 'infer_flags': infer_flags + } + else: + # starts + if isinstance(starts, Variable): + starts.stop_gradient = True + inputs['StartsTensor'] = starts + elif isinstance(starts, (list, tuple)): + attrs['starts'] = [] + if utils._contain_var(starts): + inputs['StartsTensorList'] = get_new_list_tensor(starts) + for i, dim in enumerate(starts): + if isinstance(dim, Variable): + attrs['starts'].append(-1) + infer_flags[i] = -1 + else: + attrs['starts'].append(dim) + else: + attrs['starts'] = starts + + # ends + if isinstance(ends, Variable): + ends.stop_gradient = True + inputs['EndsTensor'] = ends + elif isinstance(ends, (list, tuple)): + attrs['ends'] = [] + if utils._contain_var(ends): + inputs['EndsTensorList'] = get_new_list_tensor(ends) + for i, dim in enumerate(ends): + if isinstance(dim, Variable): + attrs['ends'].append(-1) + infer_flags[i] = -1 + else: + attrs['ends'].append(dim) + else: + attrs['ends'] = ends + + # strides + if isinstance(strides, Variable): + strides.stop_gradient = True + inputs['StridesTensor'] = strides + elif isinstance(strides, (list, tuple)): + attrs['strides'] = [] + if utils._contain_var(strides): + inputs['StridesTensorList'] = get_new_list_tensor(strides) + for i, dim in enumerate(strides): + if isinstance(dim, Variable): + attrs['strides'].append(-1) + infer_flags[i] = -1 + else: + attrs['strides'].append(dim) + else: + attrs['strides'] = strides + attrs['infer_flags'] = infer_flags + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('x')) + helper.append_op( + type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out}) + + return out def tensordot(x, y, axes=2, name=None): diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 3d0617e40d6b6..b82f58ea3d087 100644 --- 
a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -16,7 +16,7 @@ from ..framework import core from ..framework import convert_np_dtype_to_dtype_, dygraph_only -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils import paddle diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index b2fb9d6c37ff2..6855b8f0f7061 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -14,7 +14,7 @@ from __future__ import print_function import numpy as np import paddle -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import layers from ..framework import core, in_dygraph_mode, _non_static_mode diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 89462e2a8721f..9863abe1becbb 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -16,7 +16,7 @@ import numpy as np from ..static import Variable -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..framework import core from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from .search import where From ca4aea2c5cc79b593d3d8ec2d6a585f7a48ce208 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 13 Apr 2022 18:46:34 +0800 Subject: [PATCH 131/211] fix new dygraph record event (#41715) * fix new dygraph record event * refine name * fix * fix * fix according to review --- paddle/fluid/eager/auto_code_generator/eager_generator.cc | 2 +- .../final_state_generator/codegen_utils.py | 2 +- .../auto_code_generator/final_state_generator/eager_gen.py | 4 ++-- paddle/fluid/eager/backward.cc | 2 +- python/paddle/utils/code_gen/api_base.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3ed17b67b842a..726e049e61150 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2480,7 +2480,7 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " SetIsTensorWrappersCleared(true);\n" " }\n" - " std::string name() override { return \" GradNode%s \"; } \n " + " std::string name() override { return \"GradNode%sMid\"; } \n " "\n" "std::shared_ptr Copy() const override {{\n " " auto copied_node = std::shared_ptr(new " diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index ea7b4a21a2c54..6219ecee17f30 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -137,7 +137,7 @@ def RemoveConstAndReference(string): def GetGradNodeName(string): - return f"FinalGradNode{string}" + return f"GradNode{string}Final" def GetDygraphForwardFunctionName(string): diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d6505ebaa1e68..bd31de520750d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -120,7 +120,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( std::vector>& grads, bool create_graph = false) override; - std::string name() override {{ return \" {} \"; }} + std::string name() override {{ return \"{}\"; }} void ClearTensorWrappers() override {{ {} @@ -804,7 +804,7 @@ def GenerateNodeCreationCodes(self): set_retain_grad_str = "\n".join(set_retain_grad_list) node_event_name = forward_api_name + " node_creation" - node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::Operator, 1);\n" + node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n" self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 974acb8646ca5..3b555eda8fff7 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -643,7 +643,7 @@ std::vector RunBackward( VLOG(6) << "Running GradNode:" << node->name(); paddle::platform::RecordEvent node_record_event( - std::string(typeid(*node).name()) + " grad_node", + std::string((*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 4325807746e7c..9aa3fc9eafe33 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -744,7 +744,7 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False): {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} {{ -{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::OperatorInner, 1); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} @@ -771,7 +771,7 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} {{ -{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::OperatorInner, 1); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} From c9c03e7b41254f3fe267b2140b21da62739e713f Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 13 Apr 2022 19:53:40 +0800 Subject: [PATCH 132/211] [XPUPS]add support for kunlun2 (#40985) [XPUPS]add support for kunlun2 Co-authored-by: WorgenZhang --- paddle/fluid/framework/fleet/heter_context.h | 2 +- .../framework/fleet/heter_ps/CMakeLists.txt | 4 +- .../framework/fleet/heter_ps/feature_value.h | 24 +- .../framework/fleet/heter_ps/hashtable.h | 98 ++- .../{hashtable_inl.h => hashtable_kernel.cu} | 82 +- .../fleet/heter_ps/hashtable_kernel.kps | 346 ++++++++ 
.../framework/fleet/heter_ps/heter_comm.h | 100 ++- .../framework/fleet/heter_ps/heter_comm_inl.h | 764 +++++++++++------- .../fleet/heter_ps/heter_comm_kernel.cu | 269 ++++++ .../fleet/heter_ps/heter_comm_kernel.h | 86 ++ .../fleet/heter_ps/heter_comm_kernel.kps | 351 ++++++++ .../framework/fleet/heter_ps/heter_ps.cu | 8 + .../fluid/framework/fleet/heter_ps/heter_ps.h | 8 + .../framework/fleet/heter_ps/heter_ps_base.h | 2 + .../fleet/heter_ps/heter_resource.cc | 65 +- .../framework/fleet/heter_ps/heter_resource.h | 62 +- .../fluid/framework/fleet/heter_ps/mem_pool.h | 2 + .../framework/fleet/heter_ps/optimizer.cuh.h | 20 +- .../framework/fleet/heter_ps/optimizer_conf.h | 30 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 6 +- 20 files changed, 1938 insertions(+), 391 deletions(-) mode change 100755 => 100644 paddle/fluid/framework/fleet/heter_ps/hashtable.h rename paddle/fluid/framework/fleet/heter_ps/{hashtable_inl.h => hashtable_kernel.cu} (75%) create mode 100644 paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 6d3a4c5d9c0b9..3fdcf2379cb54 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -23,7 +23,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_PSLIB -#include "common_value.h" // NOLINT +#include "common/common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 983208c0608ae..cac366d6b22a1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,7 +7,9 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) + nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index db11fca109bc3..b633394e7a811 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -52,18 +52,18 @@ struct FeaturePushValue { float lr_g; float mf_g[MF_DIM]; - __device__ __forceinline__ FeaturePushValue - operator+(const FeaturePushValue& a) const { - FeaturePushValue out; - out.slot = a.slot; - out.show = a.show + show; - out.clk = a.clk + clk; - out.lr_g = a.lr_g + lr_g; - for (int i = 0; i < MF_DIM; ++i) { - out.mf_g[i] = a.mf_g[i] + mf_g[i]; - } - return out; - } + // __device__ __forceinline__ FeaturePushValue + // operator+(const FeaturePushValue& a) const { + // 
FeaturePushValue out; + // out.slot = a.slot; + // out.show = a.show + show; + // out.clk = a.clk + clk; + // out.lr_g = a.lr_g + lr_g; + // for (int i = 0; i < MF_DIM; ++i) { + // out.mf_g[i] = a.mf_g[i] + mf_g[i]; + // } + // return out; + // } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h old mode 100755 new mode 100644 index e8eb91f6f6b14..6a51713d74c19 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -13,28 +13,38 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include #include #include #include + #ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT #endif -#ifdef PADDLE_WITH_PSCORE + +#if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/phi/core/utils/rw_lock.h" -#include "thrust/pair.h" -// #include "cudf/concurrent_unordered_map.cuh.h" + +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" -#ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "thrust/pair.h" +#elif defined(__xpu__) +#include +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/math.h" +#include "xpu/kernel/simd.h" +#endif namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) template class TableContainer : public concurrent_unordered_map::max()>( capacity, ValType()) {} }; +#elif defined(PADDLE_WITH_XPU_KP) + +template +class XPUCacheArray { + public: + explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) { + xpu_malloc(reinterpret_cast(&keys), capacity_ * sizeof(KeyType)); + xpu_malloc(reinterpret_cast(&vals), capacity_ * sizeof(ValType)); + } + + virtual ~XPUCacheArray() { + xpu_free(keys); + xpu_free(vals); + } + + void print() {} + // ValType* find(const KeyType& key) { return NULL; } + // bool insert(const KeyType& key, const ValType& val) { return true; } + + int prefetch(const int dev_id, XPUStream stream = NULL) {} + size_t size() { return size_; } + + private: + long long capacity_; + long long size_; + KeyType* keys; + ValType* vals; +}; +#endif template class HashTable { public: - HashTable(size_t capacity); + explicit HashTable(size_t capacity); virtual ~HashTable(); HashTable(const HashTable&) = delete; HashTable& operator=(const HashTable&) = delete; + + template void insert(const KeyType* d_keys, const ValType* d_vals, size_t len, - gpuStream_t stream); + StreamType stream); + + template void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - gpuStream_t stream); + StreamType stream); + + template void get(const KeyType* d_keys, ValType* d_vals, size_t len, - gpuStream_t stream); - void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream); + StreamType stream); + + template + void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + void show(); - void dump_to_cpu(int devid, cudaStream_t stream); - template + template + void dump_to_cpu(int devid, StreamType stream); + +#if defined(PADDLE_WITH_CUDA) + + template void update(const KeyType* d_keys, const 
GradType* d_grads, size_t len, - Sgd sgd, gpuStream_t stream); + Sgd sgd, StreamType stream); - template + template void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, - gpuStream_t stream); + StreamType stream); + +#elif defined(PADDLE_WITH_XPU_KP) + template + void update(const KeyType* d_keys, const GradType* d_grads, size_t len, + StreamType stream); + + template + void update(const KeyType* d_keys, const char* d_grads, size_t len, + StreamType stream); + +#endif int size() { return container_->size(); } @@ -84,7 +147,11 @@ class HashTable { std::unique_ptr rwlock_{nullptr}; private: +#if defined(PADDLE_WITH_CUDA) TableContainer* container_; +#elif defined(PADDLE_WITH_XPU_KP) + XPUCacheArray* container_; +#endif int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; @@ -94,5 +161,4 @@ class HashTable { }; } // end namespace framework } // end namespace paddle -#include "hashtable_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu similarity index 75% rename from paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h rename to paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 0297e71c35e27..cac1b9c17e077 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS +#include +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) + template struct ReplaceOp { __host__ __device__ value_type operator()(value_type new_value, @@ -87,6 +92,7 @@ __global__ void dy_mf_search_kernel(Table* table, } } } + template __global__ void update_kernel(Table* table, const typename Table::key_type* const keys, @@ -135,8 +141,9 @@ void HashTable::show() { } template +template void HashTable::get(const KeyType* d_keys, ValType* d_vals, - size_t len, gpuStream_t stream) { + size_t len, StreamType stream) { if (len == 0) { return; } @@ -146,8 +153,9 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, } template +template void HashTable::get(const KeyType* d_keys, char* d_vals, - size_t len, gpuStream_t stream) { + size_t len, StreamType stream) { if (len == 0) { return; } @@ -157,9 +165,10 @@ void HashTable::get(const KeyType* d_keys, char* d_vals, } template +template void HashTable::insert(const KeyType* d_keys, const ValType* d_vals, size_t len, - gpuStream_t stream) { + StreamType stream) { if (len == 0) { return; } @@ -169,22 +178,24 @@ void HashTable::insert(const KeyType* d_keys, } template +template void HashTable::insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - gpuStream_t stream) { + StreamType stream) { if (len == 0) { return; } - const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; if (pool == NULL) { return; } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; insert_kernel<<>>(container_, d_keys, len, pool, start_index); } template -void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { +template +void HashTable::dump_to_cpu(int devid, StreamType stream) { container_->prefetch(cudaCpuDeviceId, stream); std::vector threads; size_t num = container_->size(); @@ -260,10 +271,10 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { } template -template +template void HashTable::update(const KeyType* d_keys, const GradType* d_grads, size_t len, - Sgd sgd, gpuStream_t stream) { + Sgd sgd, StreamType stream) { if (len == 0) { return; } @@ -273,19 +284,66 @@ void HashTable::update(const KeyType* d_keys, } template -template +template void HashTable::update(const KeyType* d_keys, const char* d_grads, size_t len, - Sgd sgd, gpuStream_t stream) { + Sgd sgd, StreamType stream) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - dy_mf_update_kernel<<>>( container_, d_keys, d_grads, len, sgd, push_grad_value_size_); } +template class HashTable; + +template void HashTable::get< + cudaStream_t>(const unsigned long* d_keys, + paddle::framework::FeatureValue* d_vals, size_t len, + cudaStream_t stream); + +// template void +// HashTable::get( +// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t +// stream); + +template void HashTable::insert< + cudaStream_t>(const unsigned long* d_keys, + const paddle::framework::FeatureValue* d_vals, size_t len, + cudaStream_t stream); + +// template void HashTable::insert< +// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, +// size_t start_index, cudaStream_t stream); + +template void HashTable:: + dump_to_cpu(int devid, cudaStream_t stream); + +template void HashTable::update< + paddle::framework::FeaturePushValue, + Optimizer, + cudaStream_t>(const unsigned long* d_keys, + const paddle::framework::FeaturePushValue* d_grads, + size_t len, Optimizer 
+                                                      sgd,
+                  cudaStream_t stream);
+
+// template void HashTable::update<
+//     Optimizer,
+//     cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t
+//     len,
+//     Optimizer
+//     sgd,
+//     cudaStream_t stream);
+
+#endif
 }  // end namespace framework
 }  // end namespace paddle
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps
new file mode 100644
index 0000000000000..9d2a20a361e31
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps
@@ -0,0 +1,346 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
+
+namespace optimizer_config {
+extern _global_ptr_ float* nonclk_coeff;
+extern _global_ptr_ float* clk_coeff;
+
+extern _global_ptr_ float* min_bound;
+extern _global_ptr_ float* max_bound;
+extern _global_ptr_ float* learning_rate;
+extern _global_ptr_ float* initial_g2sum;
+extern _global_ptr_ float* initial_range;
+
+extern _global_ptr_ float* mf_create_thresholds;
+extern _global_ptr_ float* mf_learning_rate;
+extern _global_ptr_ float* mf_initial_g2sum;
+extern _global_ptr_ float* mf_initial_range;
+extern _global_ptr_ float* mf_min_bound;
+extern _global_ptr_ float* mf_max_bound;
+}
+
+namespace paddle {
+namespace framework {
+
+#if defined(PADDLE_WITH_XPU_KP)
+
+__device__ void update_lr(float* w, float* g2sum, float g,  // NOLINT
+                          float scale) {
+  __local__ float local_learning_rate;
+  __local__ float local_initial_g2sum;
+  __local__ float local_min_bound;
+  __local__ float local_max_bound;
+
+  GM2LM(optimizer_config::learning_rate, &local_learning_rate, sizeof(float));
+  GM2LM(optimizer_config::initial_g2sum, &local_initial_g2sum, sizeof(float));
+  GM2LM(optimizer_config::min_bound, &local_min_bound, sizeof(float));
+  GM2LM(optimizer_config::max_bound, &local_max_bound, sizeof(float));
+
+  double add_g2sum = 0;
+  double ratio = local_learning_rate *
+                 sqrt(local_initial_g2sum / (local_initial_g2sum + (*g2sum)));
+  double scaled_grad = g / scale;
+
+  (*w) += scaled_grad * ratio;
+
+  if (*w < local_min_bound) *w = local_min_bound;
+  if (*w > local_max_bound) *w = local_max_bound;
+
+  add_g2sum += scaled_grad * scaled_grad;
+
+  (*g2sum) += add_g2sum;
+}
+
+__device__ void update_mf(int n, float* w, float* g2sum, const float* g,
+                          float scale) {
+  __local__ float local_mf_learning_rate;
+  __local__ float local_mf_initial_g2sum;
+  __local__ float local_mf_min_bound;
+  __local__ float local_mf_max_bound;
+
+  GM2LM(optimizer_config::mf_learning_rate, &local_mf_learning_rate,
+        sizeof(float));
+  GM2LM(optimizer_config::mf_initial_g2sum, &local_mf_initial_g2sum,
+        sizeof(float));
+  GM2LM(optimizer_config::mf_min_bound, &local_mf_min_bound, sizeof(float));
+  GM2LM(optimizer_config::mf_max_bound, &local_mf_max_bound, sizeof(float));
+
+  double add_g2sum = 0;
+  double ratio =
+      local_mf_learning_rate *
+      sqrt(local_mf_initial_g2sum / (local_mf_initial_g2sum + (*g2sum)));
+  for (int i = 0; i < n; ++i) {
+    double scaled_grad = g[i] / scale;
+    w[i] += scaled_grad * ratio;
+
+    if (w[i] < local_mf_min_bound) w[i] = local_mf_min_bound;
+    if (w[i] > local_mf_max_bound) w[i] = local_mf_max_bound;
+    add_g2sum += scaled_grad * scaled_grad;
+  }
+
+  (*g2sum) += add_g2sum / n;
+}
+
+__device__ float xpu_rand_uniform() { return 0.1; }
+
+template <typename ValType, typename GradType>
+__device__ void update_value(ValType* val, const GradType* grad) {  // NOLINT
+  (*val).slot = (*grad).slot;
+  (*val).show += (*grad).show;
+  (*val).clk += (*grad).clk;
+
+  __local__ float local_nonclk_coeff;
+  __local__ float local_clk_coeff;
+
+  __local__ float local_mf_create_thresholds;
+  __local__ float local_mf_initial_range;
+
+  GM2LM(optimizer_config::nonclk_coeff, &local_nonclk_coeff, sizeof(float));
+  GM2LM(optimizer_config::clk_coeff, &local_clk_coeff, sizeof(float));
+  GM2LM(optimizer_config::mf_create_thresholds, &local_mf_create_thresholds,
+        sizeof(float));
+  GM2LM(optimizer_config::mf_initial_range, &local_mf_initial_range,
+        sizeof(float));
+
+  (*val).delta_score += local_nonclk_coeff * ((*grad).show - (*grad).clk) +
+                        local_clk_coeff * (*grad).clk;
+
+  update_lr(&(*val).lr, &(*val).lr_g2sum, (*grad).lr_g, (*grad).show);
+
+  if ((*val).mf_size == 0) {
+    if (local_mf_create_thresholds <=
+        local_nonclk_coeff * ((*val).show - (*val).clk) +
+            local_clk_coeff * (*val).clk) {
+      (*val).mf_size = MF_DIM + 1;
+      (*val).mf[0] = 0;
+
+      for (int i = 0; i < MF_DIM; ++i) {
+        (*val).mf[i + 1] = (xpu_rand_uniform()) * local_mf_initial_range;
+      }
+    }
+  } else {
+    update_mf(MF_DIM, &(*val).mf[1], &(*val).mf[0], (*grad).mf_g,
+              (*grad).show);
+  }
+}
+
+template <typename KeyType, typename ValType, typename Table>
+__global__ void insert_kernel(Table* table, const KeyType* const keys,
+                              const ValType* const vals, size_t len) {
+  int cid = core_id();
+  int ncores = core_num();
+  if (cid >= ncores) {
+    return;
+  }
+  int thread_id = ncores * cluster_id() + cid;
+  int nthreads = ncores * cluster_num();
+
+  const int buf_size = 150;
+  __local__ KeyType local_keys[buf_size];
+  __local__ ValType local_vals[buf_size];
+  int len_per_loop = min(buf_size, roundup_div(len, nthreads));
+
+  for (int i = thread_id * len_per_loop; i < len;
+       i += nthreads * len_per_loop) {
+    int read_len = min(len_per_loop, len - i);
+    GM2LM(keys, local_keys, read_len * sizeof(KeyType));
+    GM2LM(vals, local_vals, read_len * sizeof(ValType));
+    for (int k = 0; k < read_len; k++) {
+      // auto status = table->insert(local_keys[k], local_vals[k]);
+      // assert(status != false && "error: insert fails: table is full");
+    }
+  }
+}
+
+template <typename KeyType, typename ValType, typename Table>
+__global__ void search_kernel(Table* table, const KeyType* const keys,
+                              ValType* const vals, size_t len) {
+  int cid = core_id();
+  int ncores = core_num();
+  if (cid >= ncores) {
+    return;
+  }
+  int thread_id = ncores * cluster_id() + cid;
+  int nthreads = ncores * cluster_num();
+
+  const int buf_size = 150;
+  __local__ KeyType local_keys[buf_size];
+  __local__ ValType local_vals[buf_size];
+
+  int len_per_loop = min(buf_size, roundup_div(len, nthreads));
+  for (int i = thread_id * len_per_loop; i < len;
+       i += nthreads * len_per_loop) {
+    int read_len = min(len_per_loop, len - i);
+    GM2LM(keys, local_keys, read_len * sizeof(KeyType));
+    for (int k = 0; k < read_len; k++) {
+      // ValType* val = table->find(local_keys[k]);
+      // if (val != NULL) {
+      //   local_vals[k] = *val;
+      // }
+    }
+    LM2GM(local_vals, vals + i, read_len * sizeof(ValType));
+  }
+}
+
+template <typename KeyType, typename GradType, typename Table>
+__global__ void update_kernel(Table* table, const KeyType* const keys,
+                              const GradType* const grads, size_t len) {
+  int cid = core_id();
+  int ncores = core_num();
+  if
(cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 250; + __local__ KeyType local_keys[buf_size]; + __local__ GradType local_grads[buf_size]; + + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + + GM2LM(keys, local_keys, read_len * sizeof(KeyType)); + GM2LM(grads, local_grads, read_len * sizeof(GradType)); + + for (int k = 0; k < read_len; k++) { + // ValType* val = table->find(local_keys[k]); + // if (val != NULL) { + // update_value(*val, grads[i]); + //} + } + } +} + +template +HashTable::HashTable(size_t capacity) { + auto tmp_container = XPUCacheArray(capacity); + xpu_malloc(reinterpret_cast(&container_), + sizeof(XPUCacheArray)); + xpu_memcpy(container_, &tmp_container, + sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); + rwlock_.reset(new phi::RWLock); +} + +template +HashTable::~HashTable() { + xpu_free((void*)container_); +} + +template +void HashTable::show() { + container_->print(); +} + +template +template +void HashTable::get(const KeyType* d_keys, ValType* d_vals, + size_t len, StreamType stream) { + if (len == 0) { + return; + } + search_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); +} + +template +template +void HashTable::get(const KeyType* d_keys, char* d_vals, + size_t len, StreamType stream) { + if (len == 0) { + return; + } + // TODO(zhangminxu): to be implemented +} + +template +template +void HashTable::insert(const KeyType* d_keys, + const ValType* d_vals, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + insert_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); +} + +template +template +void HashTable::dump_to_cpu(int devid, StreamType stream) { + // TODO(zhangminxu): to be implemented +} + +template +template +void HashTable::update(const KeyType* d_keys, + const GradType* d_grads, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + update_kernel<<<4, 64, stream>>>(container_, d_keys, d_grads, len); +} + +template +template +void HashTable::update(const KeyType* d_keys, + const char* d_grads, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + // TODO(zhangminxu): to be implemented +} + +template class HashTable; + +template void HashTable::get< + XPUStream>(const unsigned long* d_keys, + paddle::framework::FeatureValue* d_vals, size_t len, + XPUStream stream); + +// template void +// HashTable::get( +// const unsigned long* d_keys, char* d_vals, size_t len, XPUStream stream); + +template void HashTable::insert< + XPUStream>(const unsigned long* d_keys, + const paddle::framework::FeatureValue* d_vals, size_t len, + XPUStream stream); + +// template void HashTable::insert< +// XPUStream>(const unsigned long* d_keys, size_t len, char* pool, +// size_t start_index, XPUStream stream); + +template void HashTable:: + dump_to_cpu(int devid, XPUStream stream); + +template void HashTable::update< + paddle::framework::FeaturePushValue, XPUStream>( + const unsigned long* d_keys, + const paddle::framework::FeaturePushValue* d_grads, size_t len, + XPUStream stream); + +// template void HashTable::update< +// XPUStream>(const unsigned long* d_keys, const char* d_grads, +// size_t len, XPUStream stream); + +#endif +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 1fca8cdf8bb80..817fd8d38ee06 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -15,39 +15,28 @@ limitations under the License. */ #pragma once #include #include -#include "cub/cub.cuh" -#include "cub/util_allocator.cuh" -#include "hashtable.h" // NOLINT -#include "heter_resource.h" // NOLINT +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/place.h" #include "thrust/pair.h" +#elif defined(PADDLE_WITH_XPU_KP) +#include +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif + +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct CustomGradMerger { - template - CUB_RUNTIME_FUNCTION __forceinline__ __device__ T - operator()(const T& a, const T& b) const { - T out; - out.slot = a.slot; - out.show = a.show + b.show; - out.clk = a.clk + b.clk; - out.lr_g = a.lr_g + b.lr_g; - for (int i = 0; i < MF_DIM; ++i) { - out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; - } - return out; - } -}; - template class HeterComm { public: @@ -67,10 +56,21 @@ class HeterComm { void show_one_table(int gpu_num); int get_index_by_devid(int devid); +#if defined(PADDLE_WITH_CUDA) template void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); // NOLINT +#elif defined(PADDLE_WITH_XPU_KP) + void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); +#endif + + int log2i(int x); + template + void memory_copy(DstPlace dst_place, void* dst, SrcPlace src_place, + const void* src, size_t count, StreamType stream = 0); + +#if defined(PADDLE_WITH_CUDA) template void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); // NOLINT @@ -85,8 +85,6 @@ class HeterComm { int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); - int log2i(int x); - void set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) { @@ -101,19 +99,21 @@ class HeterComm { // void dump_to_cpu(int index); - void end_pass(); - int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } +#endif + + void end_pass(); + struct Node { - cudaStream_t in_stream; - cudaStream_t out_stream; + ppStream in_stream; + ppStream out_stream; char* key_storage; char* val_storage; int sync; int key_bytes_len; int val_bytes_len; - int gpu_num; + int dev_num; }; struct Path { @@ -133,7 +133,7 @@ class HeterComm { alloc(size, true); } - void alloc(int size, bool force = false) { + void alloc(size_t size, bool force = false) { if (force || size > all_keys_mem->size()) { all_keys_mem.reset(); all_grads_mem.reset(); @@ -152,7 +152,11 @@ class HeterComm { } } +#if defined(PADDLE_WITH_CUDA) platform::CUDAPlace place_; +#elif defined(PADDLE_WITH_XPU_KP) + platform::XPUPlace place_; +#endif std::shared_ptr all_keys_mem; std::shared_ptr all_grads_mem; KeyType* all_keys; @@ -166,6 
+170,33 @@ class HeterComm { void init_path(); + template + void sync_stream(const StreamType& stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(stream)); +#endif + } + + template + void create_stream(StreamType* stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(stream)); +#endif + } + + template + void destroy_stream(StreamType stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(stream)); +#endif + } + void create_storage(int start_index, int end_index, int keylen, int vallen); void destroy_storage(int start_index, int end_index); void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, @@ -182,15 +213,18 @@ class HeterComm { int block_size_{256}; private: + std::unique_ptr heter_comm_kernel_; std::vector storage_; - CustomGradMerger merger_; int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; + int node_size_; + +#if defined(PADDLE_WITH_CUDA) std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; - int node_size_; std::vector> allocators_; +#endif }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index f85ed330dc8ea..3ced33b490d59 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,115 +13,46 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_HETERPS -//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif namespace paddle { namespace framework { -template -__global__ void fill_idx(T* idx, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - idx[i] = i; - } -} - -template -void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) { - T tmp[len]; // NOLINT - cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - std::cout << name; - for (int i = 0; i < len; ++i) { - std::cout << ":" << tmp[i]; - } - std::cout << std::endl; -} - -template -__global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len - 1) { - if (idx[i] != idx[i + 1]) { - right[idx[i]] = i; - left[idx[i + 1]] = i + 1; - } - } - if (i == 0) { - left[idx[i]] = i; - } - if (i == (len - 1)) { - right[idx[i]] = i; - } -} - -template -__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, - int total_gpu) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - shard_index[i] = d_keys[i] % total_gpu; - } -} - -template -__global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, - size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_shard_keys[i] = d_keys[idx[i]]; - } -} - -template -__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, GradType* d_grads, - T* idx, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_shard_keys[i] = d_keys[idx[i]]; - d_shard_grads[i] = d_grads[idx[i]]; - } -} - -template -__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, - size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_vals[idx[i]] = d_shard_vals[i]; - } -} - template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { resource_ = resource; - storage_.resize(resource_->total_gpu()); - for (int i = 0; i < resource_->total_gpu(); ++i) { + storage_.resize(resource_->total_device()); + for (int i = 0; i < resource_->total_device(); ++i) { +#if defined(PADDLE_WITH_CUDA) platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT +#endif auto table = new Table(capacity / load_factor_); tables_.push_back(table); if (multi_node_) { storage_[i].init(feanum_, resource_->dev_id(i)); } } + heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } template void HeterComm::init_path() { - int total_gpu = resource_->total_gpu(); - path_.resize(total_gpu); - + int total_device = resource_->total_device(); + path_.resize(total_device); if (!topo_aware_) { VLOG(0) << "init path without topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { + for (int i = 0; i < total_device; ++i) { + path_[i].resize(total_device); + for (int j = 0; j < total_device; ++j) { auto& nodes = path_[i][j].nodes_; nodes.resize(1); nodes[0].in_stream = resource_->comm_stream(i, j); @@ -129,17 +60,18 @@ void HeterComm::init_path() { 
nodes[0].key_storage = NULL; nodes[0].val_storage = NULL; nodes[0].sync = 0; - nodes[0].gpu_num = j; + nodes[0].dev_num = j; } } } else { VLOG(0) << "init path with topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { + for (int i = 0; i < total_device; ++i) { + path_[i].resize(total_device); + for (int j = 0; j < total_device; ++j) { auto& nodes = path_[i][j].nodes_; int from = resource_->dev_id(i); int to = resource_->dev_id(j); + int transfer_id = i; if (need_transfer(from, to)) { transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); @@ -150,7 +82,7 @@ void HeterComm::init_path() { node.key_storage = NULL; node.val_storage = NULL; node.sync = 1; - node.gpu_num = transfer_id; + node.dev_num = transfer_id; } nodes.push_back(Node()); Node& node = nodes.back(); @@ -159,148 +91,222 @@ void HeterComm::init_path() { node.key_storage = NULL; node.val_storage = NULL; node.sync = 0; - node.gpu_num = j; + node.dev_num = j; } } } } +template +template +void HeterComm::memory_copy( + DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, + size_t count, StreamType stream) { +#if defined(PADDLE_WITH_CUDA) + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); + if (stream == 0) { + cudaStreamSynchronize(0); + } +#elif defined(PADDLE_WITH_XPU_KP) + memory::Copy(dst_place, dst, src_place, src, count); +#endif +} + template void HeterComm::create_storage(int start_index, int end_index, int keylen, int vallen) { +#if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), + resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].gpu_num, start_index)); + keylen, resource_->remote_stream(nodes[i].dev_num, start_index)); allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), + resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].gpu_num, start_index)); - + vallen, resource_->remote_stream(nodes[i].dev_num, start_index)); + nodes[i].key_bytes_len = keylen; + nodes[i].val_bytes_len = vallen; + } +#elif defined(PADDLE_WITH_XPU_KP) + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::XPUDeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); + auto place = DevPlace(resource_->dev_id(nodes[i].dev_num)); + auto node_keys_mem = memory::Alloc(place, keylen); + nodes[i].key_storage = reinterpret_cast(node_keys_mem->ptr()); + auto node_vals_mem = memory::Alloc(place, vallen); + nodes[i].val_storage = reinterpret_cast(node_vals_mem->ptr()); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } +#endif } template void HeterComm::destroy_storage(int start_index, int end_index) { +#if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + 
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage); - allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage); +#endif } } template -void HeterComm::walk_to_dest( - int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, - GradType* src_val) { +void HeterComm::walk_to_dest(int start_index, + int num, int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; } std::queue que; - for (int i = 0; i < gpu_num; i++) { + for (int i = 0; i < num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - int size = path_[start_index][i].nodes_.size(); + // int size = path_[start_index][i].nodes_.size(); auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); que.push(t); - cudaMemcpyAsync(node.key_storage, - reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + auto src_dev_id = resource_->dev_id(start_index); + auto dst_dev_id = resource_->dev_id(i); + auto src_place = DevPlace(src_dev_id); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, node.key_storage, src_place, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, node.in_stream); if (need_copy_val) { - cudaMemcpyAsync(node.val_storage, - reinterpret_cast(src_val + h_left[i]), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream); + memory_copy(dst_place, node.val_storage, src_place, + reinterpret_cast(src_val + h_left[i]), + node.val_bytes_len, node.in_stream); } } while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); if (cur_task.path->nodes_[cur_task.step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); + sync_stream(cur_task.path->nodes_[cur_task.step].in_stream); } - if (cur_task.step != cur_task.path->nodes_.size() - 1) { + if (static_cast(cur_task.step) != + cur_task.path->nodes_.size() - 1) { int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 1].key_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream); + + auto src_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); + auto dst_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step + 1].dev_num); + auto src_place = DevPlace(src_dev_id); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].key_storage, + src_place, cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cur_task.path->nodes_[cur_step + 1].in_stream); if (need_copy_val) { - cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream); + memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].val_storage, + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cur_task.path->nodes_[cur_step + 1].in_stream); } } } } template -void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val) { +void HeterComm::walk_to_src(int start_index, + int num, int* h_left, + int* 
h_right, + ValType* src_val) { std::queue que; - for (int i = 0; i < gpu_num; i++) { + + for (int i = 0; i < num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; + + auto src_dev_id = resource_->dev_id(i); + auto src_place = DevPlace(src_dev_id); + if (cur_step == 0) { - cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), - node.val_storage, node.val_bytes_len, cudaMemcpyDefault, - node.out_stream); + auto dst_dev_id = resource_->dev_id(start_index); + auto dst_place = DevPlace(dst_dev_id); + memory_copy(dst_place, reinterpret_cast(src_val + h_left[i]), + src_place, node.val_storage, node.val_bytes_len, + node.out_stream); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, - node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - path_[start_index][i].nodes_[cur_step - 1].out_stream); + + auto dst_dev_id = + resource_->dev_id(path_[start_index][i].nodes_[cur_step - 1].dev_num); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, + path_[start_index][i].nodes_[cur_step - 1].val_storage, + src_place, node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + path_[start_index][i].nodes_[cur_step - 1].out_stream); } } + while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); int cur_step = cur_task.step; if (cur_task.path->nodes_[cur_step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); + sync_stream(cur_task.path->nodes_[cur_step].out_stream); } + + auto src_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); + auto src_place = DevPlace(src_dev_id); + if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream); + + auto dst_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step - 1].dev_num); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, cur_task.path->nodes_[cur_step - 1].val_storage, + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cur_task.path->nodes_[cur_step - 1].out_stream); + } else if (cur_step == 0) { - int end_index = cur_task.path->nodes_.back().gpu_num; - cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); + int end_index = cur_task.path->nodes_.back().dev_num; + + auto dst_dev_id = resource_->dev_id(end_index); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, + reinterpret_cast(src_val + h_left[end_index]), + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cur_task.path->nodes_[cur_step].out_stream); } } } @@ -314,8 +320,8 @@ HeterComm::~HeterComm() { } template -void HeterComm::show_one_table(int gpu_num) { - tables_[gpu_num]->show(); +void HeterComm::show_one_table(int num) { + tables_[num]->show(); } template @@ -333,24 +339,22 @@ int HeterComm::get_index_by_devid(int devid) { } template -void HeterComm::build_ps(int num, KeyType* 
h_keys, - ValType* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { +void HeterComm::build_ps( + int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, + size_t chunk_size, int stream_num) { if (len <= 0) { return; } - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); + int dev_id = resource_->dev_id(dev_num); std::vector d_key_bufs; std::vector d_val_bufs; - gpuStream_t streams[stream_num]; // NOLINT + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + ppStream streams[stream_num]; // NOLINT for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + create_stream(&(streams[i])); auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); auto d_v_buf = memory::Alloc(place, chunk_size * sizeof(ValType)); d_key_bufs.push_back(std::move(d_k_buf)); @@ -360,39 +364,48 @@ void HeterComm::build_ps(int num, KeyType* h_keys, int cur_len = 0; int cur_stream = 0; - while (cur_len < len) { + while (static_cast(cur_len) < len) { cur_stream = cur_stream % stream_num; + auto cur_use_stream = streams[cur_stream]; +#if defined(PADDLE_WITH_XPU_KP) + cur_use_stream = 0; +#endif + int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, - sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, - sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - tables_[num]->insert( + + auto dst_place = place; + auto src_place = platform::CPUPlace(); + + memory_copy( + dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + memory_copy( + dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); + + tables_[dev_num]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, - streams[cur_stream]); + cur_use_stream); + cur_stream += 1; cur_len += tmp_len; } - for (int i = 0; i < stream_num; ++i) { - cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + sync_stream(streams[i]); + destroy_stream(streams[i]); } } template void HeterComm::merge_grad( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + + int dev_id = resource_->dev_id(dev_num); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); size_t temp_storage_bytes; @@ -403,48 +416,50 @@ void HeterComm::merge_grad( GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, - d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, + d_merge_keys_ptr, d_grads, d_merge_grads_ptr, + len, 0, 8 * sizeof(KeyType), stream, false); - 
void* d_buff = NULL; auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); temp_storage_bytes = 0; auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, - d_grads, d_num_runs_out, merger_, len, stream, false)); + heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, + d_keys, d_merge_grads_ptr, d_grads, + d_num_runs_out, len, stream, false); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + heter_comm_kernel_->reduce_by_key( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); + d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), - cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), + stream); + + sync_stream(stream); } template void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, - int gpu_num) { - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + int dev_num) { + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); @@ -455,24 +470,28 @@ void HeterComm::split_input_to_shard( auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); - int grid_size = (len - 1) / block_size_ + 1; - fill_idx<<>>(d_idx_tmp_ptr, len); - calc_shard_index<<>>( - d_keys, len, d_shard_index_tmp_ptr, total_gpu); + // int grid_size = (len - 1) / block_size_ + 1; + + heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); + heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, + total_device, stream); size_t temp_storage_bytes; - const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + const int num_bits = 1 + log2i(total_device); + + heter_comm_kernel_->sort_pairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + + heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), 
temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - calc_shard_offset<<>>(d_shard_index_ptr, - left, right, len); - cudaStreamSynchronize(stream); + d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); + + heter_comm_kernel_->calc_shard_offset(d_shard_index_ptr, left, right, len, + total_device, stream); + sync_stream(stream); } template @@ -484,25 +503,43 @@ void HeterComm::pull_sparse(int num, return; } - int total_gpu = resource_->total_gpu(); + int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(num, 0); - int grid_size = (len - 1) / block_size_ + 1; + // int grid_size = (len - 1) / block_size_ + 1; - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right = memory::Alloc(place, total_device * sizeof(int)); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); @@ -513,17 +550,20 @@ void HeterComm::pull_sparse(int num, split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - fill_shard_key<<>>(d_shard_keys_ptr, - d_keys, d_idx_ptr, len); + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_keys, d_idx_ptr, len, + stream); - cudaStreamSynchronize(stream); + sync_stream(stream); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); + auto dst_place = platform::CPUPlace(); + auto src_place = place; - for (int i = 0; i < total_gpu; ++i) { + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int), stream); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int), stream); + + for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (shard_len == 0) { continue; @@ -532,47 +572,53 @@ void HeterComm::pull_sparse(int num, shard_len * sizeof(ValType)); } - walk_to_dest(num, 
total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1) { continue; } auto& node = path_[num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + sync_stream(node.in_stream); + + AnyDeviceGuard guard(resource_->dev_id(i)); + tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, num)); + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, num)); if (h_left[i] == -1) { continue; } tables_[i]->rwlock_->UNLock(); } - walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + walk_to_src(num, total_device, h_left, h_right, d_shard_vals_ptr); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); + sync_stream(node.out_stream); } - fill_dvals<<>>(d_shard_vals_ptr, d_vals, - d_idx_ptr, len); - cudaStreamSynchronize(stream); - for (int i = 0; i < total_gpu; ++i) { + heter_comm_kernel_->fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, + stream); + + sync_stream(stream); + + for (int i = 0; i < total_device; ++i) { destroy_storage(num, i); } } +#if defined(PADDLE_WITH_CUDA) template template -void HeterComm::push_sparse(int gpu_num, +void HeterComm::push_sparse(int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, @@ -581,23 +627,42 @@ void HeterComm::push_sparse(int gpu_num, return; } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT + + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right = memory::Alloc(place, total_device * sizeof(int)); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, 
d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); @@ -608,61 +673,183 @@ void HeterComm::push_sparse(int gpu_num, reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); + merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - int grid_size = (uniq_len - 1) / block_size_ + 1; + // int grid_size = (uniq_len - 1) / block_size_ + 1; split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - gpu_num); + dev_num); - fill_shard_grads<<>>( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len); + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); - cudaStreamSynchronize(stream); + sync_stream(stream); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int), stream); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int), stream); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(gpu_num, i, shard_len * sizeof(KeyType), + create_storage(dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType)); } - walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, d_shard_grads_ptr); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - auto& node = path_[gpu_num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); + auto& node = path_[dev_num][i].nodes_.back(); + sync_stream(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + AnyDeviceGuard guard(resource_->dev_id(i)); tables_[i]->rwlock_->WRLock(); tables_[i]->update(reinterpret_cast(node.key_storage), reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, gpu_num)); + resource_->remote_stream(i, dev_num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, dev_num)); if (h_left[i] != -1) { tables_[i]->rwlock_->UNLock(); } } - for (int i = 0; i < total_gpu; ++i) { - destroy_storage(gpu_num, i); + + for (int i = 0; i < total_device; ++i) { + destroy_storage(dev_num, i); } } +#elif defined(PADDLE_WITH_XPU_KP) +template +void HeterComm::push_sparse(int dev_num, + KeyType* d_keys, + GradType* d_grads, + size_t len) { + if (len == 0) { + return; + } + + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); + + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); + + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT + + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right 
= memory::Alloc(place, total_device * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + GradType* d_shard_grads_ptr = + reinterpret_cast(d_shard_grads->ptr()); + + int uniq_len = len; + merge_grad(dev_num, d_keys, d_grads, len, uniq_len); + + // int grid_size = (uniq_len - 1) / block_size_ + 1; + + split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, + dev_num); + + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + (long long)uniq_len, stream); + + sync_stream(stream); + + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int)); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int)); + + for (int i = 0; i < total_device; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * sizeof(GradType)); + } + + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + + for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[dev_num][i].nodes_.back(); + sync_stream(node.in_stream); + + AnyDeviceGuard guard(resource_->dev_id(i)); + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, dev_num)); + } + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, dev_num)); + if (h_left[i] != -1) { + tables_[i]->rwlock_->UNLock(); + } + } + + for (int i = 0; i < total_device; ++i) { + destroy_storage(dev_num, i); + } +} + +#endif + +#if defined(PADDLE_WITH_CUDA) template template void HeterComm::update_one_table( @@ -705,7 +892,7 @@ void HeterComm::push_sparse_multi_node( template int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { - int total_gpu = resource_->total_gpu(); + int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; platform::CUDAPlace place = platform::CUDAPlace(dev_id); @@ -725,10 +912,10 @@ int 
HeterComm::gather_one_node_grad( // allgather grad len PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num), - (void*)d_node_len, 1, ncclInt, // NOLINT - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( + (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, // NOLINT + ncclInt, // NOLINT + nccl_inner_comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, @@ -775,11 +962,12 @@ int HeterComm::gather_one_node_grad( cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); - int grid_size = (h_node_len[i] - 1) / block_size_ + 1; - fill_shard_grads<<>>( + // int grid_size = (h_node_len[i] - 1) / block_size_ + 1; + heter_comm_kernel_->fill_shard_grads( storage.local_keys + merge_num, storage.all_keys + index, storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1); + d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, + stream); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -848,19 +1036,21 @@ int HeterComm::gather_multi_node_grad( return ret; } +#endif + template void HeterComm::end_pass() { - int total_gpu = resource_->total_gpu(); + int total_device = resource_->total_device(); std::vector threads; auto dump_to_cpu_func = [this](int index) { auto stream = resource_->local_stream(index, 0); int dev_id = resource_->dev_id(index); - platform::CUDADeviceGuard guard(dev_id); + AnyDeviceGuard guard(dev_id); tables_[index]->dump_to_cpu(dev_id, stream); }; - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { threads.push_back(std::thread(dump_to_cpu_func, i)); } for (auto& t : threads) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu new file mode 100644 index 0000000000000..694bdb8d563f5 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -0,0 +1,269 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
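
The end_pass() routine above fans the table flush out to one worker thread per device and then joins them all. A minimal standalone sketch of that fan-out/join pattern follows; it is an illustration only, not Paddle code, and the device count and the body of the per-device callback are stand-ins for the real dev_id/dump_to_cpu logic.

#include <iostream>
#include <thread>
#include <vector>

int main() {
  const int total_device = 4;  // assumed device count for illustration
  std::vector<std::thread> threads;
  auto dump_to_cpu_func = [](int index) {
    // The real code would set a device guard for dev_id(index) and call
    // tables_[index]->dump_to_cpu(dev_id, stream) here.
    std::cout << "flushing table on device " << index << std::endl;
  };
  for (int i = 0; i < total_device; ++i) {
    threads.emplace_back(dump_to_cpu_func, i);  // one worker per device
  }
  for (auto& t : threads) {
    t.join();  // barrier: wait until every device has flushed
  }
  return 0;
}
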
*/ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" + +namespace paddle { +namespace framework { + +#ifdef PADDLE_WITH_CUDA + +struct GPUCustomGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; + } + return out; + } +} gpu_merger; + +template +__global__ void fill_idx_kernel(T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + idx[i] = i; + } +} + +// template +// void show_tensor(T* input, size_t len, gpuStream_t stream, std::string +// name) +// { +// T tmp[len]; // NOLINT +// cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, +// stream); +// cudaStreamSynchronize(stream); +// std::cout << name; +// for (int i = 0; i < len; ++i) { +// std::cout << ":" << tmp[i]; +// } +// std::cout << std::endl; +//} + +template +__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len - 1) { + if (idx[i] != idx[i + 1]) { + right[idx[i]] = i; + left[idx[i + 1]] = i + 1; + } + } + if (i == 0) { + left[idx[i]] = i; + } + if (i == (len - 1)) { + right[idx[i]] = i; + } +} + +template +__global__ void calc_shard_index_kernel(KeyType* d_keys, size_t len, + T* shard_index, int total_gpu) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + shard_index[i] = d_keys[i] % total_gpu; + } +} + +template +__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + } +} + +template +__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + d_shard_grads[i] = d_grads[idx[i]]; + } +} + +template +__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_vals[idx[i]] = d_shard_vals[i]; + } +} + +// cuda implemention of heter_comm_kernel.h +template +void HeterCommKernel::fill_idx(T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_idx_kernel<<>>(idx, c_len); +} + +template +void HeterCommKernel::calc_shard_offset(T* idx, T* left, T* right, + long long len, int total_devs, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + calc_shard_offset_kernel<<>>(idx, left, + right, c_len); +} + +template +void HeterCommKernel::calc_shard_index(KeyType* d_keys, long long len, + T* shard_index, int total_gpu, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + calc_shard_index_kernel<<>>( + d_keys, c_len, shard_index, total_gpu); +} + +template +void HeterCommKernel::fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + 
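
Every CUDA wrapper above sizes its launch with the same ceil-division formula, grid_size = (len - 1) / block_size_ + 1, and relies on an `i < len` guard inside the kernel because the last block may be only partially filled. The standalone sketch below shows that launch pattern with a simple index-gather kernel of the same shape as fill_shard_key; the buffer contents and block size are illustrative, not the Paddle kernel itself.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void gather_kernel(const int* src, const int* idx, int* dst,
                              size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {           // guard: the grid may overshoot len
    dst[i] = src[idx[i]];  // gather by index, like fill_shard_key
  }
}

int main() {
  const size_t len = 1000;
  const int block_size = 256;                        // assumed block size
  const int grid_size = (len - 1) / block_size + 1;  // ceil(len / block_size)

  int *d_src, *d_idx, *d_dst;
  cudaMalloc(&d_src, len * sizeof(int));
  cudaMalloc(&d_idx, len * sizeof(int));
  cudaMalloc(&d_dst, len * sizeof(int));
  cudaMemset(d_idx, 0, len * sizeof(int));  // keep the gather indices in range

  gather_kernel<<<grid_size, block_size>>>(d_src, d_idx, d_dst, len);
  cudaDeviceSynchronize();
  printf("launched %d blocks of %d threads for %zu elements\n", grid_size,
         block_size, len);

  cudaFree(d_src);
  cudaFree(d_idx);
  cudaFree(d_dst);
  return 0;
}
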
fill_shard_key_kernel<<>>( + d_shard_keys, d_keys, idx, c_len); +} + +template +void HeterCommKernel::fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_shard_grads_kernel<<>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len); +} + +template +void HeterCommKernel::fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_dvals_kernel<<>>(d_shard_vals, d_vals, + idx, c_len); +} + +template +void HeterCommKernel::sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, // NOLINT + KeyT* d_keys_out, const ValueT* d_values_in, + ValueT* d_values_out, int num_items, + int begin_bit, int end_bit, StreamType stream, + bool debug_synchronous) { + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, end_bit, stream, debug_synchronous)); +} + +template +void HeterCommKernel::reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + int num_items, StreamType stream, + bool debug_synchronous) { + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, + d_aggregates_out, d_num_runs_out, gpu_merger, num_items, stream, + debug_synchronous)); +} + +template void HeterCommKernel::fill_idx( + int* idx, long long len, const cudaStream_t& stream); + +template void HeterCommKernel::calc_shard_offset( + int* idx, int* left, int* right, long long len, int total_devs, + const cudaStream_t& stream); +template void HeterCommKernel::calc_shard_index< + unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len, + int* shard_index, int total_devs, + const cudaStream_t& stream); + +template void HeterCommKernel::fill_shard_key( + unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, + const cudaStream_t& stream); + +template void HeterCommKernel::fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + const cudaStream_t& stream); + +template void +HeterCommKernel::fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + const cudaStream_t& stream); + +template void HeterCommKernel::sort_pairs< + unsigned long, paddle::framework::FeaturePushValue, cudaStream_t>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const unsigned long* d_keys_in, // NOLINT + unsigned long* d_keys_out, + const paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_values_out, int num_items, + int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous); + +template void HeterCommKernel::sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const int* d_keys_in, 
// NOLINT + int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items, + int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous); + +template void HeterCommKernel::reduce_by_key< + unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*, + paddle::framework::FeaturePushValue*, int*, cudaStream_t>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + unsigned long* d_keys_in, unsigned long* d_unique_out, + paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, + int num_items, cudaStream_t stream, bool debug_synchronous); + +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h new file mode 100644 index 0000000000000..1be3687a7dbee --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + +#if defined(PADDLE_WITH_CUDA) +#include "cub/cub.cuh" +#include "cub/util_allocator.cuh" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/enforce.h" +#endif + +namespace paddle { +namespace framework { + +class HeterCommKernel { + public: + HeterCommKernel() {} + explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} + + template + void fill_idx(T* idx, long long len, const StreamType& stream); + + template + void calc_shard_offset(T* idx, T* left, T* right, long long len, + int total_devs, const StreamType& stream); + + template + void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream); + + template + void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, + long long len, const StreamType& stream); + + template + void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, T* idx, + long long len, const StreamType& stream); + + template + void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len, + const StreamType& stream); + + template + void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, KeyT* d_keys_out, + const ValueT* d_values_in, ValueT* d_values_out, + int num_items, int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, StreamType stream = NULL, + bool debug_synchronous = false); + + template + void reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, int num_items, + StreamType stream = NULL, bool debug_synchronous = false); + + 
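
The sort_pairs and reduce_by_key wrappers declared above forward to CUB, which uses a two-phase temp-storage protocol: a first call with a null workspace pointer only reports the required byte count, and a second call with the allocated workspace does the real work. The sketch below shows that protocol for cub::DeviceRadixSort::SortPairs; it is an illustration of the CUB calling convention, not the Paddle wrapper, and the key/value types are chosen only to mirror the instantiations in this file.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

void sort_pairs_example(const unsigned long* d_keys_in,
                        unsigned long* d_keys_out, const int* d_vals_in,
                        int* d_vals_out, int num_items, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;

  // Pass 1: null workspace pointer, CUB only fills in temp_storage_bytes.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                  num_items, 0, sizeof(unsigned long) * 8,
                                  stream);

  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Pass 2: same arguments plus the allocated workspace; the sort runs now.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                                  num_items, 0, sizeof(unsigned long) * 8,
                                  stream);

  cudaFree(d_temp_storage);
}
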
private: + int block_size_{256}; +}; + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps new file mode 100644 index 0000000000000..c3e37d9eba34d --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -0,0 +1,351 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" + +#if defined(PADDLE_WITH_XPU_KP) +#include +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/math.h" +#include "xpu/kernel/simd.h" +#endif + +namespace paddle { +namespace framework { + +#if defined(PADDLE_WITH_XPU_KP) + +struct XPUCustomGradMerger { + template + __device__ T operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; + } + return out; + } +} xpu_merger; + +template +__global__ void fill_idx_kernel(T* idx, long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 1024; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + for (int k = 0; k < read_len; k++) { + int real_idx = i + k; + local_idx[k] = real_idx; + } + LM2GM(local_idx, idx + i, read_len * sizeof(T)); + } +} + +template +__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, + long long len, const int total_xpu) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 1024; + __local__ T local_idx[buf_size]; + __local__ T local_left[total_xpu]; + __local__ T local_right[total_xpu]; + + for (int i = 0; i < total_xpu; i++) { + local_left[i] = -1; + local_right[i] = -1; + } + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + if (local_idx[k] != local_idx[k + 1]) { + int real_idx = i + k; + local_right[local_idx[k]] = real_idx; + local_left[local_idx[k + 1]] = real_idx + 1; + } + } + if (i == 0) { + local_left[local_idx[i]] = i; + } + if (i + read_len == len) { + local_right[local_idx[len - 1]] = len - 1; + } + } + // to be optimized: call LM2GM too frequently + // all_reduce between threads to get 
global left & global right && LM2GM + for (int i = 0; i < total_xpu; i++) { + if (local_left[i] != -1) LM2GM(local_left + i, left + i, sizeof(T)); + if (local_right[i] != -1) LM2GM(local_right + i, right + i, sizeof(T)); + } +} + +template +__global__ void calc_shard_index_kernel(KeyType* d_keys, long long len, + T* shard_index, int total_xpu) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 512; + __local__ KeyType local_keys[buf_size]; + __local__ T local_shard_index[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + for (int k = 0; k < read_len; k++) { + local_shard_index[k] = local_keys[k] % total_xpu; + } + LM2GM(local_shard_index, shard_index + i, read_len * sizeof(T)); + } +} + +template +__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 400; + __local__ KeyType local_keys[buf_size]; + __local__ KeyType local_shard_keys[buf_size]; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + local_shard_keys[k] = local_keys[local_idx[k]]; + } + LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); + } +} + +// local mem too large, cause compile error +template +__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, + long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 100; + __local__ KeyType local_keys[buf_size]; + __local__ GradType local_grads[buf_size]; + __local__ KeyType local_shard_keys[buf_size]; + __local__ GradType local_shard_grads[buf_size]; + __local__ T local_idx[buf_size]; + + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + local_shard_keys[k] = local_keys[local_idx[k]]; + local_shard_grads[k] = local_grads[local_idx[k]]; + } + LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); + LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType)); + } +} + +template +__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, long long len) { + int 
cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 50; + __local__ ValType local_vals[buf_size]; + __local__ ValType local_shard_vals[buf_size]; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType)); + for (int k = 0; k < read_len; k++) { + local_vals[local_idx[k]] = local_shard_vals[k]; + } + LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); + } +} + +// xpu implementation of heter_comm_kernel.h + +template +void fill_idx(T* idx, long long len, const StreamType& stream) { + fill_idx_kernel<<<4, 64, stream>>>(idx, len); +} + +template +void calc_shard_offset(T* idx, T* left, T* right, long long len, int total_devs, + const StreamType& stream) { + calc_shard_offset_kernel<<<4, 64, stream>>>(idx, left, right, len, + total_devs); +} + +template +void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream) { + calc_shard_index_kernel<<<4, 64, stream>>>( + d_keys, len, shard_index, total_devs); +} + +template +void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, + long long len, const StreamType& stream) { + fill_shard_key_kernel<<<4, 64, stream>>>(d_shard_keys, d_keys, + idx, len); +} + +template +void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, T* idx, + long long len, const StreamType& stream) { + fill_shard_grads_kernel<<<4, 64, stream>>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, len); +} + +template +void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len, + const StreamType& stream) { + fill_dvals_kernel<<<4, 64, stream>>>(d_shard_vals, d_vals, idx, + len); +} + +template +void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, // NOLINT + KeyT* d_keys_out, const ValueT* d_values_in, + ValueT* d_values_out, int num_items, int begin_bit, int end_bit, + StreamType stream, bool debug_synchronous) {} + +template (int* idx, long long len, + const XPUStream& stream); +template void calc_shard_offset(int* idx, int* left, int* right, + long long len, int total_devs, + const XPUStream& stream); +template void calc_shard_index( + unsigned long* d_keys, long long len, int* shard_index, int total_devs, + const XPUStream& stream); + +template void fill_shard_key( + unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, + const XPUStream& stream); +template void +fill_shard_grads(unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, + int* idx, long long len, const XPUStream& stream); +template void fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + const XPUStream& stream); + +template void +sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const unsigned long* d_keys_in, // NOLINT + unsigned long* d_keys_out, + const paddle::framework::FeaturePushValue* d_values_in, + 
paddle::framework::FeaturePushValue* d_values_out, int num_items, + int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); + +template void sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const int* d_keys_in, // NOLINT + int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items, + int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); + +template void reduce_by_key< + unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*, + paddle::framework::FeaturePushValue*, int*, XPUStream>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + unsigned long* d_keys_in, unsigned long* d_unique_out, + paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_aggregates_out, + int* d_num_runs_out int num_items, XPUStream stream, + bool debug_synchronous); + +#endif + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c23e..583eb926a26a5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -29,7 +29,9 @@ HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { comm_ = std::make_shared>( capacity, resource); +#if defined(PADDLE_WITH_CUDA) opt_ = Optimizer(); +#endif } HeterPs::~HeterPs() {} @@ -54,15 +56,21 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { +#if defined(PADDLE_WITH_CUDA) comm_->push_sparse(num, d_keys, d_grads, len, opt_); +#elif defined(PADDLE_WITH_XPU_KP) + comm_->push_sparse(num, d_keys, d_grads, len); +#endif // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } +#if defined(PADDLE_WITH_CUDA) void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } +#endif } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index d78b6b492074d..7fb50f4da1fce 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -16,7 +16,9 @@ limitations under the License. 
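
The HeterPs changes above pick a backend at compile time: the CUDA build calls push_sparse with the optimizer functor and NCCL setup, while the XPU KP build takes the overload without them. A minimal sketch of that dispatch style is shown below for a sync_stream-like helper; it only illustrates the pattern, the real sync_stream/memory_copy helpers live elsewhere in this patch series, and the XPU header path and xpu_wait call are assumptions about the XPU runtime rather than confirmed API.

#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using ppStream = cudaStream_t;
#elif defined(PADDLE_WITH_XPU_KP)
#include <xpu/runtime.h>  // assumed header exposing XPUStream and xpu_wait
using ppStream = XPUStream;
#endif

inline void sync_stream(const ppStream& stream) {
#if defined(PADDLE_WITH_CUDA)
  cudaStreamSynchronize(stream);  // CUDA path
#elif defined(PADDLE_WITH_XPU_KP)
  xpu_wait(stream);  // assumed XPU equivalent of cudaStreamSynchronize
#endif
}
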
*/ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#endif #ifdef PADDLE_WITH_HETERPS @@ -35,9 +37,13 @@ class HeterPs : public HeterPsBase { size_t len) override; virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) override; + +#if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) override; +#endif + virtual void end_pass() override; virtual int get_index_by_devid(int devid) override; virtual void show_one_table(int gpu_num) override; @@ -46,7 +52,9 @@ class HeterPs : public HeterPsBase { private: std::shared_ptr> comm_; +#if defined(PADDLE_WITH_CUDA) Optimizer opt_; +#endif }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 05b3ecf9c3c12..ddbf02df6c578 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -35,9 +35,11 @@ class HeterPsBase { virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; +#if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; +#endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index cad7559af5742..7074cfb521bdf 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -13,12 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS -#include "heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" + +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) GPUResource::GPUResource(std::vector& dev_ids, int index) { index_ = index; dev_ids_ = dev_ids; @@ -52,7 +61,41 @@ GPUResource::~GPUResource() { } } +#elif defined(PADDLE_WITH_XPU_KP) +XPUResource::XPUResource(std::vector& dev_ids, int index) { + index_ = index; + dev_ids_ = dev_ids; + dev_id_ = dev_ids_[index]; + + platform::XPUDeviceGuard guard(dev_id_); + local_streams_.resize(dev_ids_.size()); + comm_streams_.resize(dev_ids_.size(), NULL); + remote_streams_.resize(dev_ids_.size()); + + for (size_t i = 0; i < dev_ids_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&local_streams_[i])); + // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&comm_streams_[i])); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&remote_streams_[i])); + } +} + +XPUResource::~XPUResource() { + platform::XPUDeviceGuard guard(dev_id_); + for (size_t i = 0; i < local_streams_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(local_streams_[i])); + } + // for (size_t i = 0; i < comm_streams_.size(); ++i) { + // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(comm_streams_[i])); + // } + for (size_t i = 0; i < remote_streams_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(remote_streams_[i])); + } +} + +#endif + void HeterPsResource::enable_p2p() { +#if defined(PADDLE_WITH_CUDA) for (size_t i = 0; i < dev_ids_.size(); ++i) { platform::CUDADeviceGuard guard(dev_ids_[i]); for (size_t j = 0; j < dev_ids_.size(); ++j) { @@ -72,28 +115,28 @@ void HeterPsResource::enable_p2p() { } } } +#endif } HeterPsResource::HeterPsResource(const std::vector& dev_ids) { dev_ids_ = dev_ids; for (size_t i = 0; i < dev_ids_.size(); ++i) { - std::shared_ptr resource = - std::make_shared(dev_ids_, i); + std::shared_ptr resource = + std::make_shared(dev_ids_, i); resources_.push_back(resource); devid_2_index_[dev_ids_[i]] = i; } } -cudaStream_t HeterPsResource::comm_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->comm_stream(stream_num); +ppStream HeterPsResource::comm_stream(int dev_num, int stream_num) { + return resources_[dev_num]->comm_stream(stream_num); } - -cudaStream_t HeterPsResource::local_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->local_stream(stream_num); +ppStream HeterPsResource::local_stream(int dev_num, int stream_num) { + return resources_[dev_num]->local_stream(stream_num); } -cudaStream_t HeterPsResource::remote_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->remote_stream(stream_num); +ppStream HeterPsResource::remote_stream(int dev_num, int stream_num) { + return resources_[dev_num]->remote_stream(stream_num); } int HeterPsResource::dev_id(int num) { return dev_ids_[num]; } @@ -102,7 +145,7 @@ int HeterPsResource::get_index_by_devid(int devid) { return devid_2_index_[devid]; } -int HeterPsResource::total_gpu() { return dev_ids_.size(); } +int HeterPsResource::total_device() { return dev_ids_.size(); } void HeterPsResource::set_multi_mf(int multi_mf_dim, int max_mf_dim) { multi_mf_dim_ = multi_mf_dim; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 19df8cc70f50e..164fca2276800 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -17,7 +17,16 @@ limitations under the License. */ #include #include #include + +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include // NOLINT +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif + #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS @@ -25,9 +34,16 @@ limitations under the License. */ namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) +using ppStream = cudaStream_t; +#elif defined(PADDLE_WITH_XPU_KP) +using ppStream = XPUStream; +#endif + +#if defined(PADDLE_WITH_CUDA) class GPUResource { public: - GPUResource(std::vector& device_id, int index); + GPUResource(std::vector& device_id, int index); // NOLINT virtual ~GPUResource(); GPUResource(const GPUResource&) = delete; GPUResource& operator=(const GPUResource&) = delete; @@ -45,23 +61,55 @@ class GPUResource { std::vector local_streams_; std::vector comm_streams_; }; +#elif defined(PADDLE_WITH_XPU_KP) +class XPUResource { + public: + XPUResource(std::vector& device_id, int index); // NOLINT + virtual ~XPUResource(); + XPUResource(const XPUResource&) = delete; + XPUResource& operator=(const XPUResource&) = delete; + + int dev_id() const { return dev_id_; } + int index() const { return index_; } + XPUStream local_stream(int num) { return local_streams_[num]; } + XPUStream remote_stream(int num) { return remote_streams_[num]; } + XPUStream comm_stream(int num) { return comm_streams_[num]; } + + int dev_id_; + int index_; + std::vector dev_ids_; + std::vector remote_streams_; + std::vector local_streams_; + std::vector comm_streams_; +}; +#endif + +#if defined(PADDLE_WITH_CUDA) +using DevResource = GPUResource; +using DevPlace = platform::CUDAPlace; +using AnyDeviceGuard = platform::CUDADeviceGuard; +#elif defined(PADDLE_WITH_XPU_KP) +using DevResource = XPUResource; +using DevPlace = platform::XPUPlace; +using AnyDeviceGuard = platform::XPUDeviceGuard; +#endif class HeterPsResource { public: - HeterPsResource(const std::vector& dev_ids); + explicit HeterPsResource(const std::vector& dev_ids); HeterPsResource(const HeterPsResource&) = delete; HeterPsResource& operator=(const HeterPsResource&) = delete; virtual ~HeterPsResource() {} void enable_p2p(); - int total_gpu(); + int total_device(); int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); - gpuStream_t local_stream(int gpu_num, int stream_num); - gpuStream_t remote_stream(int gpu_num, int stream_num); - gpuStream_t comm_stream(int gpu_num, int stream_num); + ppStream local_stream(int dev_num, int stream_num); + ppStream remote_stream(int dev_num, int stream_num); + ppStream comm_stream(int dev_num, int stream_num); - std::vector> resources_; + std::vector> resources_; std::vector dev_ids_; std::map devid_2_index_; int multi_mf_dim_{0}; diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 9189902c28ffb..a663d1bf76410 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ // #include // "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #include +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh" namespace paddle { @@ -111,3 +112,4 @@ class HBMMemoryPool : public managed { } // end namespace framework } // end namespace paddle #endif +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ff9976db5d875..ebf7dd277c7d6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -13,16 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS + +#if defined(PADDLE_WITH_CUDA) #include +#endif #include -#include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" - -#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) template class Optimizer { public: @@ -32,7 +35,8 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { + __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + float scale) { double add_g2sum = 0; double ratio = optimizer_config::learning_rate * sqrt(optimizer_config::initial_g2sum / @@ -49,8 +53,8 @@ class Optimizer { g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { + __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + const float* g, float scale) { double add_g2sum = 0; double ratio = optimizer_config::mf_learning_rate * sqrt(optimizer_config::mf_initial_g2sum / @@ -69,7 +73,8 @@ class Optimizer { g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { + + __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; @@ -132,6 +137,7 @@ class Optimizer { } }; +#endif } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 55d0fc561c574..6d924a395e19a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,8 +14,16 @@ limitations under the License. 
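
The update_lr rule touched above scales the learning rate by sqrt(initial_g2sum / (initial_g2sum + g2sum)), applies the loss-scale-corrected gradient, clips the weight to a bound, and accumulates the squared gradient. Only the ratio and g2sum lines are visible verbatim in this hunk, so the host-side sketch below reconstructs the rest from the usual GPU-PS rule; treat the clipping and scaling details, and the stand-in config values, as assumptions rather than the exact device code.

#include <algorithm>
#include <cmath>
#include <cstdio>

struct SgdConfig {  // stand-ins for the optimizer_config::* constants
  double learning_rate = 0.05;
  double initial_g2sum = 3.0;
  double min_bound = -10.0;
  double max_bound = 10.0;
};

void update_lr(const SgdConfig& cfg, float& w, float& g2sum, float g,
               float scale) {
  // Adagrad-style decay of the step size as the squared-gradient sum grows.
  double ratio = cfg.learning_rate *
                 std::sqrt(cfg.initial_g2sum / (cfg.initial_g2sum + g2sum));
  double scaled_grad = g / scale;  // undo the AMP loss-scaling factor
  w += static_cast<float>(scaled_grad * ratio);
  w = std::min(std::max(w, static_cast<float>(cfg.min_bound)),
               static_cast<float>(cfg.max_bound));
  g2sum += static_cast<float>(scaled_grad * scaled_grad);
}

int main() {
  SgdConfig cfg;
  float w = 0.f, g2sum = 0.f;
  update_lr(cfg, w, g2sum, /*g=*/0.5f, /*scale=*/1.0f);
  std::printf("w=%f g2sum=%f\n", w, g2sum);
  return 0;
}
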
*/ #pragma once +#if defined(PADDLE_WITH_XPU_KP) +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" +#include "xpu/kernel/math.h" +#endif + namespace optimizer_config { +#if defined(PADDLE_WITH_CUDA) + __constant__ float nonclk_coeff = 0.1; __constant__ float clk_coeff = 1; @@ -31,4 +39,24 @@ __constant__ float mf_initial_g2sum = 3.0; __constant__ float mf_initial_range = 1e-4; __constant__ float mf_min_bound = -10; __constant__ float mf_max_bound = 10; -} + +#elif defined(PADDLE_WITH_XPU_KP) + +_global_ptr_ float* nonclk_coeff; +_global_ptr_ float* clk_coeff; + +_global_ptr_ float* min_bound; +_global_ptr_ float* max_bound; +_global_ptr_ float* learning_rate; +_global_ptr_ float* initial_g2sum; +_global_ptr_ float* initial_range; + +_global_ptr_ float* mf_create_thresholds; +_global_ptr_ float* mf_learning_rate; +_global_ptr_ float* mf_initial_g2sum; +_global_ptr_ float* mf_initial_range; +_global_ptr_ float* mf_min_bound; +_global_ptr_ float* mf_max_bound; + +#endif +} // namespace optimizer_config diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9551e49b6b77b..b7060764863f1 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -146,7 +146,7 @@ class PSGPUWrapper { is_initialized_ = true; resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); - keys_tensor.resize(resource_->total_gpu()); + keys_tensor.resize(resource_->total_device()); #ifdef PADDLE_WITH_GLOO auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Size() > 1) { @@ -312,8 +312,8 @@ class PSGPUWrapper { for (size_t i = 0; i < num_of_dim; i++) { dim_index_map[index_dim_vec_[i]] = i; } - hbm_pools_.resize(resource_->total_gpu() * num_of_dim); - mem_pools_.resize(resource_->total_gpu() * num_of_dim); + hbm_pools_.resize(resource_->total_device() * num_of_dim); + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; resource_->set_multi_mf(multi_mf_dim_, max_mf_dim_); From b61fa16ad4fe65bd59131a18c4c353b162ee6f7a Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 13 Apr 2022 20:33:53 +0800 Subject: [PATCH 133/211] add split backward yaml (#41746) --- .../fluid/tests/unittests/test_split_op.py | 27 ++++++++++++++++--- python/paddle/utils/code_gen/api.yaml | 1 + python/paddle/utils/code_gen/backward.yaml | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index c826a0e1030f4..bf3be4080a9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -19,6 +19,7 @@ from op_test import OpTest, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard class TestSplitOp(OpTest): @@ -402,12 +403,30 @@ def test_out1(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) x0_out = x0.numpy() x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) + + with _test_eager_guard(): + # input is a variable which shape is [4, 6, 6] + input = paddle.to_tensor(input_1) + input.stop_gradient = False + x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) + eager_x0_out = x0.numpy() + eager_x1_out = x1.numpy() + eager_x2_out = x2.numpy() + loss = x0.sum() + loss.backward() + manul_grad = np.zeros_like(input_1) + manul_grad[:, :2, :] = 1 + self.assertTrue(np.allclose(input.gradient(), manul_grad)) + self.assertTrue(np.allclose(ex_x0, eager_x0_out)) + self.assertTrue(np.allclose(ex_x1, eager_x1_out)) + self.assertTrue(np.allclose(ex_x2, eager_x2_out)) + self.assertTrue(np.allclose(ex_x0, x0_out)) self.assertTrue(np.allclose(ex_x1, x1_out)) self.assertTrue(np.allclose(ex_x2, x2_out)) @@ -416,7 +435,7 @@ def test_out2(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("bool") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) x0_out = x0.numpy() x1_out = x1.numpy() @@ -430,7 +449,7 @@ def test_out_tensor_input(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) num1 = paddle.full(shape=[1], fill_value=2, dtype='int32') x0, x1, x2 = paddle.split( input, num_or_sections=[num1, 2, 2], axis=1) @@ -446,7 +465,7 @@ def test_axis_tensor_input(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) num1 = paddle.full(shape=[1], fill_value=1, dtype='int32') x0, x1, x2 = paddle.split( input, num_or_sections=[2, 2, 2], axis=num1) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index b4abe5b303b8e..f5245d59babd2 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ 
b/python/paddle/utils/code_gen/api.yaml @@ -1917,6 +1917,7 @@ args : (Tensor x, IntArray num_or_sections, Scalar(int) axis) output : Tensor[] invoke : split_impl(x, num_or_sections, axis) + backward : split_grad - api : sqrt args : (Tensor x) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index d0f337cb054f4..97c9c7ddf1584 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1523,7 +1523,7 @@ - backward_api : split_grad forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis) + args : (Tensor[] out_grad, Scalar axis = -1) output : Tensor(x_grad) invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. From 27a91b1a6ea18f88355f0153f737056d4e4a3fb2 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 13 Apr 2022 22:03:30 +0800 Subject: [PATCH 134/211] Adjust the slice end in getitem (#41681) * adjust the slice end in getitem * fix bug * fix bug * fix bug * recover start change --- python/paddle/fluid/variable_index.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 257ddc96d9c87..83a569aacc911 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -375,7 +375,13 @@ def _getitem_impl_(var, item): if start is None: start = 0 if step > 0 else MAX_INTEGER if end is None: - end = MAX_INTEGER if step > 0 else -1 + if var.shape[dim] != -1 and ( + paddle.fluid.framework._non_static_mode() or + var.desc.type() != core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): + end = var.shape[dim] if step > 0 else -1 + else: + end = MAX_INTEGER if step > 0 else -1 elif isinstance(slice_item, list): all_bool = True From b12af9e1d9980935f90ac3264797110f9671589e Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Wed, 13 Apr 2022 23:32:14 +0800 Subject: [PATCH 135/211] the one ps proto (#41659) * the one ps proto * the one ps proto * fix * fix * fix * fix windows ci * fix windows ci * add dependency * add dependency --- paddle/fluid/distributed/CMakeLists.txt | 20 +- paddle/fluid/distributed/ps.proto | 236 ------------------ .../the_one_ps.proto | 23 ++ paddle/fluid/framework/CMakeLists.txt | 5 +- .../framework/distributed_strategy.proto | 2 +- .../fleet/base/distributed_strategy.py | 4 +- python/paddle/distributed/ps/the_one_ps.py | 13 +- 7 files changed, 51 insertions(+), 252 deletions(-) delete mode 100644 paddle/fluid/distributed/ps.proto rename paddle/fluid/{framework => distributed}/the_one_ps.proto (89%) mode change 100755 => 100644 diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 06b0583eddf24..0091c14bfd177 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,11 +1,29 @@ add_subdirectory(collective) add_subdirectory(store) +if(WITH_PYTHON) + py_proto_compile(ps_py_proto SRCS the_one_ps.proto) + add_custom_target(ps_py_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto) + if (NOT WIN32) + add_custom_command(TARGET ps_py_proto POST_BUILD + COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) + else(NOT WIN32) + string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") + 
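
After the consolidation above, both bindings are generated from the single the_one_ps.proto: the Python side ships the_one_ps_pb2.py into the fleet proto package, and the C++ side renames the generated pair to ps.pb.h/ps.pb.cc. The sketch below shows how C++ code could fill and print one of those generated messages; the include path, the TextFormat usage, and the "MemorySparseTable" string are illustrative assumptions, while the TableParameter fields used here (table_id, table_class, shard_num) come from the proto later in this patch.

#include <iostream>
#include <string>
#include <google/protobuf/text_format.h>
#include "paddle/fluid/distributed/ps.pb.h"  // generated from the_one_ps.proto

int main() {
  paddle::distributed::TableParameter table;
  table.set_table_id(0);
  table.set_table_class("MemorySparseTable");  // assumed table class name
  table.set_shard_num(1000);

  std::string text;
  google::protobuf::TextFormat::PrintToString(table, &text);  // text-format dump
  std::cout << text;
  return 0;
}
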
add_custom_command(TARGET ps_py_proto POST_BUILD + COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath} + COMMENT "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}.") + endif(NOT WIN32) +endif() + if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() endif() -proto_library(ps_framework_proto SRCS ps.proto) +proto_library(ps_framework_proto SRCS the_one_ps.proto) +add_custom_command(TARGET ps_framework_proto POST_BUILD + COMMAND mv the_one_ps.pb.h ps.pb.h + COMMAND mv the_one_ps.pb.cc ps.pb.cc) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto deleted file mode 100644 index 9bfa2c05efa67..0000000000000 --- a/paddle/fluid/distributed/ps.proto +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto2"; -package paddle.distributed; -option cc_generic_services = true; -option cc_enable_arenas = true; - -message FsClientParameter { - enum FsApiType { - HDFS = 0; - AFS = 1; - } - optional FsApiType fs_type = 1 [ default = HDFS ]; - optional string uri = 2; // such as afs://xxx.afs.com:9902 - optional string user = 3; // user_name to access fs - optional string passwd = 4; // password - optional int32 buffer_size = 5; // buffer for read/write - optional string hadoop_bin = 51; - optional string afs_conf = 101; -} - -message PSParameter { - optional string worker_class = 1; - optional string server_class = 2; - optional string instance_class = 3; - optional string init_gflags = 4 [ default = "" ]; - optional WorkerParameter worker_param = 101; - optional ServerParameter server_param = 102; - repeated DownpourTrainerParameter trainer_param = 301; - optional FsClientParameter fs_client_param = 501; -} - -message WorkerParameter { - optional DownpourWorkerParameter downpour_worker_param = 1; -} - -message DownpourWorkerParameter { - repeated TableParameter downpour_table_param = 1; -} - -message DownpourServerParameter { - repeated TableParameter downpour_table_param = 1; - optional ServerServiceParameter service_param = 2; -} - -message ServerParameter { - optional DownpourServerParameter downpour_server_param = 1; -} - -message DownpourTrainerParameter { - repeated DenseTableParameter dense_table = 1; - repeated SparseTableParameter sparse_table = 2; - optional int32 push_sparse_per_batch = 3; - optional int32 push_dense_per_batch = 4; - repeated string skip_op = 5; - repeated ProgramConfig program_config = 6; -} - -message DenseTableParameter { - optional int32 table_id = 1; - repeated string dense_variable_name = 2; - repeated string dense_gradient_variable_name = 3; - 
optional int32 fea_dim = 4; -} - -message SparseTableParameter { - optional int32 table_id = 1; - optional int32 feature_dim = 2; - repeated string slot_key = 3; - repeated string slot_value = 4; - repeated string slot_gradient = 5; -} - -message ServerServiceParameter { - optional string server_class = 1 [ default = "BrpcPsServer" ]; - optional string client_class = 2 [ default = "BrpcPsClient" ]; - optional string service_class = 3 [ default = "BrpcPsService" ]; - optional uint32 start_server_port = 4 - [ default = 0 ]; // will find a avaliable port from it - optional uint32 server_thread_num = 5 [ default = 12 ]; -} - -message ProgramConfig { - required string program_id = 1; - repeated int32 push_sparse_table_id = 2; - repeated int32 push_dense_table_id = 3; - repeated int32 pull_sparse_table_id = 4; - repeated int32 pull_dense_table_id = 5; -} - -enum TableType { - PS_SPARSE_TABLE = 0; - PS_DENSE_TABLE = 1; - PS_OTHER_TABLE = 2; -} - -message TableParameter { - optional uint64 table_id = 1; - optional string table_class = 2; - optional uint64 shard_num = 3 [ default = 1000 ]; - optional TableAccessorParameter accessor = 4; - optional TensorAccessorParameter tensor = 5; - optional CommonAccessorParameter common = 6; - optional TableType type = 7; - optional bool compress_in_save = 8 [ default = false ]; - optional GraphParameter graph_parameter = 9; -} - -message TableAccessorParameter { - optional string accessor_class = 1; - optional uint32 fea_dim = 4 [ default = 11 ]; - optional uint32 embedx_dim = 5 [ default = 8 ]; - optional uint32 embedx_threshold = 6 [ default = 10 ]; - optional CtrAccessorParameter ctr_accessor_param = 7; - repeated TableAccessorSaveParameter table_accessor_save_param = 8; - optional SparseCommonSGDRuleParameter embed_sgd_param = 10; - optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; -} - -message CtrAccessorParameter { - optional float nonclk_coeff = 1 - [ default = 0.1 ]; // to calculate show_click_score - optional float click_coeff = 2 - [ default = 1 ]; // to calculate show_click_score - optional float base_threshold = 3 [ - default = 1.5 - ]; // show_click_score > base_threshold, this feature can be saved - optional float delta_threshold = 4 - [ default = - 0.25 ]; // delta_score > delta_threshold, this feature can be saved - optional float delta_keep_days = 5 - [ default = - 16 ]; // unseen_day < delta_keep_days, this feature can be saved - optional float show_click_decay_rate = 6 [ - default = 0.98 - ]; // show/click will update to show/click * show_click_decay_rate after a day - optional float delete_threshold = 7 - [ default = 0.8 ]; // threshold to shrink a feasign - optional float delete_after_unseen_days = 8 - [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature - // will be delete in shrink_model - optional int32 ssd_unseenday_threshold = 9 - [ default = 1 ]; // threshold to save ssd -} - -message TensorAccessorParameter { - optional string feed_var_name = 1; - optional string fetch_var_name = 2; - optional int64 startup_program_id = 3; - optional int64 main_program_id = 4; - optional string tensor_table_class = 6; -} - -message CommonAccessorParameter { - optional string name = 1; - optional string table_name = 2; - repeated string attributes = 3; - repeated string params = 4; - repeated uint32 dims = 5; - repeated string initializers = 6; - optional string entry = 7; - optional int32 trainer_num = 8; - optional bool sync = 9; - optional uint32 table_num = 10; - optional uint32 table_dim = 11; -} - -message 
TableAccessorSaveParameter { - optional uint32 param = 1; - optional string converter = 2; - optional string deconverter = 3; -} - -message SparseCommonSGDRuleParameter { - optional string name = 1; - optional SparseNaiveSGDRuleParameter naive = 2; - optional SparseAdagradSGDRuleParameter adagrad = 3; - optional SparseAdamSGDParameter adam = 4; -} - -message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule - optional double learning_rate = 1 [ default = 0.05 ]; - optional double initial_range = 2 [ default = 0.0001 ]; - repeated float weight_bounds = 3; -} - -message - SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule - optional double learning_rate = 1 [ default = 0.05 ]; - optional double initial_g2sum = 2 [ default = 3.0 ]; - optional double initial_range = 3 [ default = 0.0001 ]; - repeated float weight_bounds = 4; -} - -message SparseAdamSGDParameter { // SparseAdamSGDRule - optional double learning_rate = 1 [ default = 0.001 ]; - optional double initial_range = 2 [ default = 0.0001 ]; - optional double beta1_decay_rate = 3 [ default = 0.9 ]; - optional double beta2_decay_rate = 4 [ default = 0.999 ]; - optional double ada_epsilon = 5 [ default = 1e-08 ]; - repeated float weight_bounds = 6; -} - -message GraphParameter { - optional int32 task_pool_size = 1 [ default = 24 ]; - optional bool gpups_mode = 2 [ default = false ]; - optional string gpups_graph_sample_class = 3 - [ default = "CompleteGraphSampler" ]; - optional string gpups_graph_sample_args = 4 [ default = "" ]; - optional bool use_cache = 5 [ default = false ]; - optional int32 cache_size_limit = 6 [ default = 100000 ]; - optional int32 cache_ttl = 7 [ default = 5 ]; - optional GraphFeature graph_feature = 8; - optional string table_name = 9 [ default = "" ]; - optional string table_type = 10 [ default = "" ]; - optional int32 shard_num = 11 [ default = 127 ]; - optional int32 gpu_num = 12 [ default = 1 ]; -} - -message GraphFeature { - repeated string name = 1; - repeated string dtype = 2; - repeated int32 shape = 3; -} \ No newline at end of file diff --git a/paddle/fluid/framework/the_one_ps.proto b/paddle/fluid/distributed/the_one_ps.proto old mode 100755 new mode 100644 similarity index 89% rename from paddle/fluid/framework/the_one_ps.proto rename to paddle/fluid/distributed/the_one_ps.proto index 0ae87812bce43..34b11dfd1c5b7 --- a/paddle/fluid/framework/the_one_ps.proto +++ b/paddle/fluid/distributed/the_one_ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = false ]; + optional int32 cache_size_limit = 6 [ default = 100000 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ 
default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1b9943df1b087..ad9f37b98bd3d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -237,7 +237,6 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) - py_proto_compile(ps_py_proto SRCS the_one_ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -245,13 +244,12 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto ps_py_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND cp the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -263,7 +261,6 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} - COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." 
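Note on the change above: once ps_py_proto copies the generated the_one_ps_pb2 module into python/paddle/distributed/fleet/proto/, the table configuration (including the GraphParameter message that this patch moves into the_one_ps.proto) can be assembled from Python. A minimal sketch follows, assuming the generated module is importable as paddle.distributed.fleet.proto.the_one_ps_pb2; it uses only field names defined in the proto above, and the table_class string is a placeholder, not a value taken from this patch.

    from paddle.distributed.fleet.proto import the_one_ps_pb2

    # Build a TableParameter and fill its GraphParameter sub-message
    # (field 9, added to the_one_ps.proto in this patch).
    table = the_one_ps_pb2.TableParameter()
    table.table_id = 0
    table.table_class = "ExampleGraphTable"  # placeholder name for illustration only
    table.compress_in_save = False

    graph = table.graph_parameter            # sub-message is created on first mutation
    graph.task_pool_size = 24
    graph.gpups_mode = False
    graph.shard_num = 127
    graph.gpu_num = 1
    graph.graph_feature.name.append("feat")
    graph.graph_feature.dtype.append("float32")
    graph.graph_feature.shape.append(64)

    print(table)  # text-format dump of the configured table parameter

The sketch only exercises the generated message classes; how the_one_ps.py serializes such configuration into the server/worker descriptors is unchanged by this patch apart from the import location of the generated module.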
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index c94a344f74d8d..9b0a033856d73 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -310,7 +310,7 @@ message DistributedStrategy { optional bool asp = 33 [ default = false ]; optional bool fuse_grad_merge = 34 [ default = false ]; optional bool semi_auto = 35 [ default = false ]; - optional bool adam_d2sum = 36 [ default = true ]; + optional bool adam_d2sum = 36 [ default = false ]; optional bool auto_search = 37 [ default = false ]; optional bool heter_ccl_mode = 38 [ default = false ]; diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 199418ab77955..c46b6eeb048a0 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -404,7 +404,7 @@ def trainer_desc_configs(self): def adam_d2sum(self): """ set adam_d2sum - Default value: True + Default value: False Examples: @@ -415,7 +415,7 @@ def adam_d2sum(self): fleet.init(role_maker) strategy = fleet.DistributedStrategy() - strategy.adam_d2sum = True # by default this is True + strategy.adam_d2sum = True # by default this is False # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 1d23567b72abe..5be739785ff44 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -609,7 +609,6 @@ def _set(self, table_proto): check_embedding_dim(table_proto.accessor, self.common.table_name, ctx.program_id(), self.context) - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -641,7 +640,6 @@ def _set(self, table_proto): self.common.table_name = self.context['grad_name_to_param_name'][ ctx.origin_varnames()[0]] - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -673,7 +671,6 @@ def _set(self, table_proto): table_proto.accessor.embedx_dim = 1 self.common.table_name = "MergedDense" - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -922,11 +919,6 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = worker_desc - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("worker: \n{}".format(proto_txt)) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = get_the_one_send_context( @@ -937,6 +929,7 @@ def sync_strategy_envs(): self._send_ctx = send_ctx trainer_config = self.context['trainer'] + proto_txt = worker_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) if debug: print("worker: \n{}".format(proto_txt)) @@ -1060,6 +1053,10 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) + # debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) + 
# if debug: + # print("server: \n{}".format(server_desc)) + self._server = fluid.core.DistFleetWrapper() self._server.init_server(server_desc, self.string_hosts, role_id, trainers, self._server_sub_program) From 97dec7ca51bd29f913a31f52fd618e2d364dbeed Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 13 Apr 2022 23:54:14 +0800 Subject: [PATCH 136/211] Lml/add prim ops (#41201) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * format python code * support multi input in triple gradient checker * Add matmul triple grad kernel * Updated comments of TODO * Supported some special tests * Change code-format to follow CI std * Updated gradient_checker.py * Fix conflicts * Removed unnecessary printing log * Change code style to follow CI std * merge upstream * add_p * rm useless files * add sub_p mul_p div_p * add sqrt_p and tanh_p * add reshape_p * add broadcast_p * add broadcast_p fill_constant_p matmul_p reduce_p reshape_p transpose_p * add split_p and concat_p * add gather_p and scatter_add_p * add slice_select_p and slice_assign_p * add multi input check for add_p, sub_p, mul_p, div_p * update concat_p * refine gather_p and scatter_add_p * refine slice_assign_p and slice_select_p * add 9 test for prim ops * add more test and fix some bug * add more test * register proto * add shape valid check for broadcast_p op, and add keepdim attr into reduce_p op proto * support multi input and multi output for split_p and concat_p * fix slice bug for slice_select_p and slice_assign_p * dtype for axis attr should be long int * update dtype for axis attr int64_t * update for iscan CI * add more shape and dtype check * change IndexTensor into int32 dtype --- paddle/fluid/operators/CMakeLists.txt | 1 + .../fluid/operators/prim_ops/CMakeLists.txt | 28 + paddle/fluid/operators/prim_ops/add_p_op.cc | 116 ++++ .../operators/prim_ops/broadcast_p_op.cc | 110 ++++ .../fluid/operators/prim_ops/concat_p_op.cc | 134 +++++ paddle/fluid/operators/prim_ops/div_p_op.cc | 116 ++++ .../operators/prim_ops/fill_constant_p_op.cc | 81 +++ .../fluid/operators/prim_ops/gather_p_op.cc | 117 ++++ .../fluid/operators/prim_ops/matmul_p_op.cc | 138 +++++ paddle/fluid/operators/prim_ops/mul_p_op.cc | 116 ++++ .../fluid/operators/prim_ops/prim_op_test.cc | 553 ++++++++++++++++++ .../fluid/operators/prim_ops/reduce_p_op.cc | 107 ++++ .../fluid/operators/prim_ops/reshape_p_op.cc | 97 +++ .../operators/prim_ops/scatter_add_p_op.cc | 160 +++++ .../operators/prim_ops/slice_assign_p_op.cc | 152 +++++ .../operators/prim_ops/slice_select_p_op.cc | 115 ++++ paddle/fluid/operators/prim_ops/split_p_op.cc | 119 ++++ paddle/fluid/operators/prim_ops/sqrt_p_op.cc | 80 +++ 
paddle/fluid/operators/prim_ops/sub_p_op.cc | 116 ++++ paddle/fluid/operators/prim_ops/tanh_p_op.cc | 80 +++ .../operators/prim_ops/transpose_p_op.cc | 116 ++++ .../operators/prim_ops/unity_build_rule.cmake | 20 + 22 files changed, 2672 insertions(+) create mode 100644 paddle/fluid/operators/prim_ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/prim_ops/add_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/broadcast_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/concat_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/div_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/fill_constant_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/gather_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/matmul_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/mul_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/prim_op_test.cc create mode 100644 paddle/fluid/operators/prim_ops/reduce_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/reshape_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/scatter_add_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/slice_assign_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/slice_select_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/split_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/sqrt_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/sub_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/tanh_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/transpose_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/unity_build_rule.cmake diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 68eaf1a0ed469..63bf3ab6a0382 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(string) add_subdirectory(jit) +add_subdirectory(prim_ops) if(WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt new file mode 100644 index 0000000000000..a58ee6dc1f7ba --- /dev/null +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -0,0 +1,28 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops. + include(unity_build_rule.cmake) +endif() +register_operators() + +SET(PRIM_OP_SRCS + reshape_p_op.cc + broadcast_p_op.cc + reduce_p_op.cc + transpose_p_op.cc + split_p_op.cc + concat_p_op.cc + slice_select_p_op.cc + slice_assign_p_op.cc + gather_p_op.cc + scatter_add_p_op.cc + add_p_op.cc + sub_p_op.cc + mul_p_op.cc + div_p_op.cc + sqrt_p_op.cc + tanh_p_op.cc + matmul_p_op.cc + fill_constant_p_op.cc) + +cc_test(prim_op_test SRCS prim_op_test.cc ${PRIM_OP_SRCS} DEPS op_registry) diff --git a/paddle/fluid/operators/prim_ops/add_p_op.cc b/paddle/fluid/operators/prim_ops/add_p_op.cc new file mode 100644 index 0000000000000..4789ed8958f91 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/add_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class AddPrimOp : public framework::OperatorBase { + public: + AddPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator add_p should not be excuted directly")); + } +}; + +class AddPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of add_p op."); + AddInput("Y", "(Tensor), The input tensor of add_p op."); + AddOutput("Z", "(Tensor), The output tensor of add_p op."); + AddComment(R"DOC( +Autograd primitive add_p operator. +)DOC"); + } +}; + +class AddPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class AddPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, 
z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(add_p, paddle::operators::AddPrimOp, + paddle::operators::AddPrimOpMaker, + paddle::operators::AddPrimOpShapeInference, + paddle::operators::AddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/broadcast_p_op.cc b/paddle/fluid/operators/prim_ops/broadcast_p_op.cc new file mode 100644 index 0000000000000..5459b73911473 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/broadcast_p_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class BroadcastPrimOp : public framework::OperatorBase { + public: + BroadcastPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator broadcast_p should not be excuted directly")); + } +}; + +class BroadcastPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of broadcast_p op."); + AddOutput("Y", "(Tensor), The output tensor of broadcast_p op."); + AddAttr>( + "shape", + "(std::vector) Target shape of broadcast_p operator."); + AddComment(R"DOC( +Autograd primitive broadcast_p operator. 
+)DOC"); + } +}; + +static void CheckShapeValid(const std::vector &x_shape, + const std::vector &target_shape) { + size_t x_rank = x_shape.size(); + size_t target_rank = target_shape.size(); + PADDLE_ENFORCE_GE(target_rank, x_rank, + platform::errors::InvalidArgument( + "The rank of target shape should be greater than or " + "equal to input tensor's dimensions, " + "but received %d and %d", + target_rank, x_rank)); + std::vector::const_iterator it = target_shape.begin(); + for (size_t i = 0; i < x_rank; i++, it++) { + if (x_shape[i] != 1) { + it = std::find(it, target_shape.end(), x_shape[i]); + } + PADDLE_ENFORCE_EQ( + it != target_shape.end(), true, + platform::errors::InvalidArgument( + "Invalid shape, can not broadcast input tensor into target shape," + "the first dismatching shape %d is shape of input tensor at " + "dimension %d", + x_shape[i], i)); + } +} + +class BroadcastPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto target_shape = ctx->Attrs().Get>("shape"); + CheckShapeValid(x_shape, target_shape); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(target_shape); + } +}; + +class BroadcastPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(broadcast_p, paddle::operators::BroadcastPrimOp, + paddle::operators::BroadcastPrimOpMaker, + paddle::operators::BroadcastPrimOpShapeInference, + paddle::operators::BroadcastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/concat_p_op.cc b/paddle/fluid/operators/prim_ops/concat_p_op.cc new file mode 100644 index 0000000000000..24516356a2836 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/concat_p_op.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ConcatPrimOp : public framework::OperatorBase { + public: + ConcatPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator concat_p should not be excuted directly")); + } +}; + +class ConcatPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("XS", "(Tensor), The input tensors of concat_p op.") + .AsDuplicable(); + AddOutput("Y", "(Tensor), The output tensor of concat_p op."); + AddAttr("axis", "(int64_t), The axis along which to concat."); + AddComment(R"DOC( +Autograd primitive concat_p operator. +)DOC"); + } +}; + +class ConcatPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + auto x_var_ptrs = ctx->GetInputVarPtrs("XS"); + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + auto axis = ctx->Attrs().Get("axis"); + int64_t cnt_along_axis = 0; + framework::VarDesc *first_x_var = + BOOST_GET(framework::VarDesc *, x_var_ptrs[0]); + auto first_x_shape = first_x_var->GetShape(); + cnt_along_axis += first_x_shape[axis]; + size_t first_x_rank = first_x_shape.size(); + for (size_t i = 1; i < x_var_ptrs.size(); ++i) { + framework::VarDesc *x_var = + BOOST_GET(framework::VarDesc *, x_var_ptrs[i]); + auto x_shape = x_var->GetShape(); + cnt_along_axis += x_shape[axis]; + size_t x_rank = x_shape.size(); + PADDLE_ENFORCE_EQ( + x_rank, first_x_rank, + platform::errors::InvalidArgument("The dimensions of %d input tensor " + "should be same as the dimensions " + "of 1st input tensor's, " + "but get %d and %d", + i + 1, x_rank, first_x_rank)); + for (size_t j = 0; j < x_rank; ++j) { + if (j != size_t(axis)) { + PADDLE_ENFORCE_EQ(x_shape[j], first_x_shape[j], + platform::errors::InvalidArgument( + "The shape of %d input tensor at dimension %d " + "should be same as the 1st input tensor's, " + "but get %d and %d", + i + 1, j, x_shape[j], first_x_shape[j])); + } + } + } + + std::vector y_shape(first_x_shape); + y_shape[axis] = cnt_along_axis; + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); + } +}; + +class ConcatPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_names = Input(ctx, "XS"); + auto y_name = Output(ctx, "Y")[0]; + auto first_x_name = x_names[0]; + auto first_x_type = GetType(ctx, first_x_name); + auto first_x_dtype = GetDataType(ctx, first_x_name); + for (size_t i = 1; i < x_names.size(); ++i) { + auto x_name = x_names[i]; + auto x_type = GetType(ctx, x_name); + auto x_dtype = GetDataType(ctx, x_name); + PADDLE_ENFORCE_EQ(x_type, first_x_type, + platform::errors::InvalidArgument( + "The type of %d input tensor should be same as the " + "first input tensor's, " + "but get %d and %d", + i + 1, x_type, first_x_type)); + PADDLE_ENFORCE_EQ(x_dtype, first_x_dtype, + platform::errors::InvalidArgument( + "The 
datatype of %d input tensor should be same as " + "the first input tensor's, " + "but get %d and %d", + i + 1, x_dtype, first_x_dtype)); + } + SetType(ctx, y_name, GetType(ctx, first_x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, first_x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(concat_p, paddle::operators::ConcatPrimOp, + paddle::operators::ConcatPrimOpMaker, + paddle::operators::ConcatPrimOpShapeInference, + paddle::operators::ConcatPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/div_p_op.cc b/paddle/fluid/operators/prim_ops/div_p_op.cc new file mode 100644 index 0000000000000..35ae1f69cd2c8 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/div_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class DivPrimOp : public framework::OperatorBase { + public: + DivPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator div_p should not be excuted directly")); + } +}; + +class DivPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of div_p op."); + AddInput("Y", "(Tensor), The input tensor of div_p op."); + AddOutput("Z", "(Tensor), The output tensor of div_p op."); + AddComment(R"DOC( +Autograd primitive div_p operator. 
+)DOC"); + } +}; + +class DivPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class DivPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(div_p, paddle::operators::DivPrimOp, + paddle::operators::DivPrimOpMaker, + paddle::operators::DivPrimOpShapeInference, + paddle::operators::DivPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc b/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc new file mode 100644 index 0000000000000..9831599e46ccc --- /dev/null +++ b/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class FillConstantPrimOp : public framework::OperatorBase { + public: + FillConstantPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator fill_constant_p should not be excuted directly")); + } +}; + +class FillConstantPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Y", "(Tensor), The output tensor of fill_constant_p op."); + AddAttr("value", "(float) The value of output tensor."); + AddAttr>( + "shape", "(std::vector) The shape of output tensor."); + AddAttr("dtype", "(int) The dtype of output tensor."); + AddComment(R"DOC( +Autograd primitive fill_constant_p operator. +)DOC"); + } +}; + +class FillConstantPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + auto shape = ctx->Attrs().Get>("shape"); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); + } +}; + +class FillConstantPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto y_name = Output(ctx, "Y")[0]; + auto data_type = static_cast( + BOOST_GET_CONST(int, ctx->GetAttr("dtype"))); + SetDataType(ctx, y_name, data_type); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(fill_constant_p, paddle::operators::FillConstantPrimOp, + paddle::operators::FillConstantPrimOpMaker, + paddle::operators::FillConstantPrimOpShapeInference, + paddle::operators::FillConstantPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/gather_p_op.cc b/paddle/fluid/operators/prim_ops/gather_p_op.cc new file mode 100644 index 0000000000000..be777de055803 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/gather_p_op.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class GatherPrimOp : public framework::OperatorBase { + public: + GatherPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator gather_p should not be excuted directly")); + } +}; + +class GatherPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of gather_p op."); + AddInput("IndexTensor", + "(Tensor), The index tensor of gather_p op, which is a 1D tensor.") + .AsDispensable(); + AddOutput("Y", "(Tensor), The output tensor of gather_p op."); + AddAttr("axis", "(int64_t), The axis along which to gather."); + AddAttr>( + "index", "(std::vector) The index of gather_p op") + .SetDefault({0}); + AddComment(R"DOC( +Autograd primitive gather_p operator. +)DOC"); + } +}; + +class GatherPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + int64_t num_index = 0; + if (ctx->HasInput("IndexTensor")) { + framework::InferShapeVarPtr index_var_ptr = + ctx->GetInputVarPtrs("IndexTensor")[0]; + framework::VarDesc *index_var = + BOOST_GET(framework::VarDesc *, index_var_ptr); + auto index_shape = index_var->GetShape(); + PADDLE_ENFORCE_EQ(index_shape.size(), 1, + platform::errors::InvalidArgument( + "The index tensor should be a 1D tensor," + "but get rank %d", + index_shape.size())); + num_index = index_shape[0]; + } else { + num_index = ctx->Attrs().Get>("index").size(); + } + auto axis = ctx->Attrs().Get("axis"); + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + x_shape[axis] = num_index; + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class GatherPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + if (ctx->HasInput("IndexTensor")) { + auto index_name = Input(ctx, "IndexTensor")[0]; + auto index_dtype = GetDataType(ctx, index_name); + PADDLE_ENFORCE_EQ( + index_dtype, framework::proto::VarType_Type_INT32, + platform::errors::InvalidArgument( + "The datatype of input tensor should be VarType_Type_INT32(%d), " + "but get %d", + framework::proto::VarType_Type_INT32, index_dtype)); + } + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(gather_p, paddle::operators::GatherPrimOp, + paddle::operators::GatherPrimOpMaker, + paddle::operators::GatherPrimOpShapeInference, + paddle::operators::GatherPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/matmul_p_op.cc 
b/paddle/fluid/operators/prim_ops/matmul_p_op.cc new file mode 100644 index 0000000000000..1a28e1ca5c427 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/matmul_p_op.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class MatmulPrimOp : public framework::OperatorBase { + public: + MatmulPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator matmul_p should not be excuted directly")); + } +}; + +class MatmulPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of matmul_p op."); + AddInput("Y", "(Tensor), The input tensor of matmul_p op."); + AddOutput("Z", "(Tensor), The output tensor of matmul_p op."); + AddComment(R"DOC( +Autograd primitive matmul_p operator. +)DOC"); + } +}; + +class MatmulPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The two input tensor's dimension should be equal" + "But received first input tensor's dimension is %d, " + "and another input tensor's dimension is %d", + x_rank, y_rank)); + + PADDLE_ENFORCE_EQ(x_rank == 2 || x_rank == 3, true, + platform::errors::InvalidArgument( + "The input tensor's dimension should be 2 or 3" + "But received input tensor's dimension is %d", + x_rank)); + + PADDLE_ENFORCE_EQ( + x_shape[x_rank - 1], y_shape[y_rank - 2], + platform::errors::InvalidArgument( + "Invalid shape for matmul, the last dimension of first input and " + "the penultimate dimension for the second input should be same." 
+ "But received %d and %d.", + x_shape[x_rank - 1], y_shape[y_rank - 2])); + if (x_rank == 2) { + std::vector z_shape{x_shape[x_rank - 2], y_shape[y_rank - 1]}; + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); + } else { + PADDLE_ENFORCE_EQ(x_shape[0], y_shape[0], + platform::errors::InvalidArgument( + "Invalid shape for matmul when input tensor's " + "dimension is 3, the first dimension of first " + "input and the second input should be same." + "But received %d and %d.", + x_shape[0], y_shape[0])); + + std::vector z_shape{x_shape[0], x_shape[x_rank - 2], + y_shape[y_rank - 1]}; + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); + } + } +}; + +class MatmulPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(matmul_p, paddle::operators::MatmulPrimOp, + paddle::operators::MatmulPrimOpMaker, + paddle::operators::MatmulPrimOpShapeInference, + paddle::operators::MatmulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/mul_p_op.cc b/paddle/fluid/operators/prim_ops/mul_p_op.cc new file mode 100644 index 0000000000000..a60e2601a339b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/mul_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class MulPrimOp : public framework::OperatorBase { + public: + MulPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator mul_p should not be excuted directly")); + } +}; + +class MulPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of mul_p op."); + AddInput("Y", "(Tensor), The input tensor of mul_p op."); + AddOutput("Z", "(Tensor), The output tensor of mul_p op."); + AddComment(R"DOC( +Autograd primitive mul_p operator. +)DOC"); + } +}; + +class MulPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class MulPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(mul_p, paddle::operators::MulPrimOp, + paddle::operators::MulPrimOpMaker, + paddle::operators::MulPrimOpShapeInference, + paddle::operators::MulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc new file mode 100644 index 
0000000000000..2d65149d130bb --- /dev/null +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -0,0 +1,553 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +USE_OP_ITSELF(reshape_p); +USE_OP_ITSELF(broadcast_p); +USE_OP_ITSELF(reduce_p); +USE_OP_ITSELF(transpose_p); +USE_OP_ITSELF(split_p); +USE_OP_ITSELF(concat_p); +USE_OP_ITSELF(slice_select_p); +USE_OP_ITSELF(slice_assign_p); +USE_OP_ITSELF(gather_p); +USE_OP_ITSELF(scatter_add_p); +USE_OP_ITSELF(add_p); +USE_OP_ITSELF(sub_p); +USE_OP_ITSELF(mul_p); +USE_OP_ITSELF(div_p); +USE_OP_ITSELF(sqrt_p); +USE_OP_ITSELF(tanh_p); +USE_OP_ITSELF(matmul_p); +USE_OP_ITSELF(fill_constant_p); + +namespace paddle { +namespace framework { + +static void NewVar(BlockDesc *block, const std::string &name, + const std::vector &shape) { + auto *var_desc = block->Var(name); + if (shape.size() > 0) { + var_desc->SetShape(shape); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_FP32); + } +} + +static void AppendOp(BlockDesc *block, const std::string &type, + VariableNameMap inputs, VariableNameMap outputs, + AttributeMap attrs) { + auto &op_info = OpInfoMap::Instance().Get(type); + if (op_info.Checker()) { + op_info.Checker()->Check(&attrs); + } + + auto *op = block->AppendOp(); + op->SetType(type); + for (auto &pair : inputs) { + op->SetInput(pair.first, pair.second); + } + + for (auto &pair : outputs) { + op->SetOutput(pair.first, pair.second); + for (auto &var_name : pair.second) { + if (!block->FindVarRecursive(var_name)) { + NewVar(block, var_name, {}); + } + } + } + + op->SetAttrMap(attrs); + op->InferVarType(block); + op->InferShape(*block); +} + +TEST(PrimOp, reshape_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "reshape_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"shape", std::vector{12, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 2UL); + ASSERT_EQ(shapes[0], 12L); + ASSERT_EQ(shapes[1], 5L); +} + +TEST(PrimOp, broadcast_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 1}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "broadcast_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"shape", std::vector{3, 4, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + 
ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, reduce_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + AppendOp(block, "reduce_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{0, 2}}, {"keepdim", false}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 1UL); + ASSERT_EQ(shapes[0], 4L); + AppendOp(block, "reduce_p", {{"X", {x0}}}, {{"Y", {x2}}}, + {{"axis", std::vector{0, 2}}, {"keepdim", true}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 1L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 1L); +} + +TEST(PrimOp, transpose_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "transpose_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{2, 1, 0}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 5L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 3L); +} + +TEST(PrimOp, split_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + std::string x3 = "x3"; + + NewVar(block, x0, shape); + AppendOp(block, "split_p", {{"X", {x0}}}, {{"YS", {x1, x2, x3}}}, + {{"axis", int64_t{1}}, + {"num_or_sections", std::vector{2, 4, 2}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 2L); + ASSERT_EQ(shapes[2], 10L); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 10L); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 2L); + ASSERT_EQ(shapes[2], 10L); + std::string x4 = "x4"; + std::string x5 = "x5"; + AppendOp( + block, "split_p", {{"X", {x0}}}, {{"YS", {x4, x5}}}, + {{"axis", int64_t{2}}, {"num_or_sections", std::vector{2}}}); + ASSERT_EQ(block->Var("x4")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x4")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x4")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 5L); + ASSERT_EQ(block->Var("x5")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x5")->GetDataType(), 
proto::VarType_Type_FP32); + shapes = block->Var("x5")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, concat_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{3, 1, 5}; + std::vector shape_1{3, 4, 5}; + std::vector shape_2{3, 6, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + std::string x3 = "x3"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + NewVar(block, x2, shape_2); + AppendOp(block, "concat_p", {{"XS", {x0, x1, x2}}}, {{"Y", {x3}}}, + {{"axis", int64_t{1}}}); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 11L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, slice_select_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "slice_select_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{0, 1, 2}}, + {"starts", std::vector{0, 0, 0}}, + {"ends", std::vector{5, 7, 9}}, + {"strides", std::vector{2, 2, 2}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, slice_assign_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{6, 8, 10}; + std::vector shape_1{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "slice_assign_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, + {{"axis", std::vector{0, 1, 2}}, + {"starts", std::vector{0, 0, 0}}, + {"ends", std::vector{5, 7, 9}}, + {"strides", std::vector{2, 2, 2}}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, gather_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "gather_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", int64_t{1}}, {"index", std::vector{0, 2, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 3L); + ASSERT_EQ(shapes[2], 10L); + std::string index_t = "index_t"; + std::string x2 = "x2"; + + auto *var_desc = block->Var(index_t); + var_desc->SetShape(std::vector{3}); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_INT32); + AppendOp(block, "gather_p", {{"X", {x0}}, {"IndexTensor", {index_t}}}, + {{"Y", {x2}}}, {{"axis", int64_t{1}}}); + 
ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 3L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, scatter_add_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{6, 8, 10}; + std::vector shape_1{6, 3, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "scatter_add_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, + {{"axis", int64_t{1}}, {"index", std::vector{0, 2, 5}}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); + std::string index_t = "index_t"; + std::string x3 = "x3"; + + auto *var_desc = block->Var(index_t); + var_desc->SetShape(std::vector{3}); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_INT32); + AppendOp(block, "scatter_add_p", + {{"X", {x0}}, {"Y", {x1}}, {"IndexTensor", {index_t}}}, + {{"Z", {x3}}}, {{"axis", int64_t{1}}}); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, add_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "add_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, sub_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "sub_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, mul_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "mul_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + 
ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, div_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "div_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, sqrt_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "sqrt_p", {{"X", {x0}}}, {{"Y", {x1}}}, {}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, tanh_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "tanh_p", {{"X", {x0}}}, {{"Y", {x1}}}, {}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, matmul_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{3, 4, 5}; + std::vector shape_1{3, 5, 8}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "matmul_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 8L); + std::vector shape_2{4, 5}; + std::vector shape_3{5, 8}; + + std::string x3 = "x3"; + std::string x4 = "x4"; + std::string x5 = "x5"; + + NewVar(block, x3, shape_2); + NewVar(block, x4, shape_3); + AppendOp(block, "matmul_p", {{"X", {x3}}, {"Y", {x4}}}, {{"Z", {x5}}}, {}); + ASSERT_EQ(block->Var("x5")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x5")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x5")->GetShape(); + ASSERT_EQ(shapes.size(), 2UL); + ASSERT_EQ(shapes[0], 4L); + ASSERT_EQ(shapes[1], 8L); +} + +TEST(PrimOp, fill_constant_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::string x0 = "x0"; + + AppendOp(block, "fill_constant_p", {{}}, {{"Y", {x0}}}, + {{"value", 0.0f}, + {"dtype", proto::VarType_Type_FP32}, + {"shape", std::vector{3, 4, 5}}}); + ASSERT_EQ(block->Var("x0")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x0")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = 
block->Var("x0")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/prim_ops/reduce_p_op.cc b/paddle/fluid/operators/prim_ops/reduce_p_op.cc new file mode 100644 index 0000000000000..9f2b5f3ed2c43 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/reduce_p_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ReducePrimOp : public framework::OperatorBase { + public: + ReducePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator reduce_p should not be executed directly")); + } +}; + +class ReducePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of reduce_p op."); + AddOutput("Y", "(Tensor), The output tensor of reduce_p op."); + AddAttr<std::vector<int64_t>>( + "axis", + "(std::vector<int64_t>) The axes along which to reduce. Must be in " + "range [-rank(input), rank(input)]. If `axis[i] < 0`, the axis[i] to " + "reduce is `rank + axis[i]`."); + AddAttr<bool>("keepdim", + "(bool, default false) " + "If true, retain the reduced axis with length 1.") + .SetDefault(false); + AddComment(R"DOC( +Autograd primitive reduce_p operator. 
+)DOC"); + } +}; + +class ReducePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get>("axis"); + auto keepdim = ctx->Attrs().Get("keepdim"); + if (keepdim) { + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = kDelFlag; + } + x_shape.erase(remove(x_shape.begin(), x_shape.end(), kDelFlag), + x_shape.end()); + } + if (!keepdim && x_shape.size() == 0) { + x_shape.push_back(1); + } + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class ReducePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(reduce_p, paddle::operators::ReducePrimOp, + paddle::operators::ReducePrimOpMaker, + paddle::operators::ReducePrimOpShapeInference, + paddle::operators::ReducePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/reshape_p_op.cc b/paddle/fluid/operators/prim_ops/reshape_p_op.cc new file mode 100644 index 0000000000000..497bc8fbaffb3 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/reshape_p_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
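The keepdim handling in ReducePrimOpShapeInference above is easy to exercise in isolation. What follows is a rough standalone sketch of that output-shape rule, assuming only the C++ standard library; the helper name ReduceShape and the main() harness are invented for illustration and are not part of this patch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Reduced axes either keep length 1 (keepdim=true) or are dropped; a full
// reduction without keepdim collapses to shape {1}, as in the pass above.
std::vector<int64_t> ReduceShape(std::vector<int64_t> x_shape,
                                 const std::vector<int64_t> &axis,
                                 bool keepdim) {
  if (keepdim) {
    for (int64_t a : axis) x_shape[a] = 1;       // reduced axes keep length 1
    return x_shape;
  }
  const int64_t kDelFlag = -2;                   // mark reduced axes, then erase
  for (int64_t a : axis) x_shape[a] = kDelFlag;
  x_shape.erase(std::remove(x_shape.begin(), x_shape.end(), kDelFlag),
                x_shape.end());
  if (x_shape.empty()) x_shape.push_back(1);     // full reduction -> {1}
  return x_shape;
}

int main() {
  // Mirrors the reduce_p unit test: {3, 4, 5} reduced over axes {0, 2}.
  assert((ReduceShape({3, 4, 5}, {0, 2}, false) == std::vector<int64_t>{4}));
  assert((ReduceShape({3, 4, 5}, {0, 2}, true) ==
          std::vector<int64_t>{1, 4, 1}));
  return 0;
}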
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ReshapePrimOp : public framework::OperatorBase { + public: + ReshapePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator reshape_p should not be excuted directly")); + } +}; + +class ReshapePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of reshape_p op."); + AddOutput("Y", "(Tensor), The output tensor of reshape_p op."); + AddAttr>( + "shape", "(std::vector) Target shape of reshape_p operator."); + AddComment(R"DOC( +Autograd primitive reshape_p operator. +)DOC"); + } +}; + +static int64_t product(const std::vector &shape) { + int64_t rslt = 1; + for (size_t i = 0; i < shape.size(); ++i) { + rslt *= shape[i]; + } + return rslt; +} + +class ReshapePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_EQ(product(x_shape), product(shape), + platform::errors::InvalidArgument( + "The input tensor can't be reshaped to target shape, " + "the input tensor has %d elements but target shape " + "contains %d elements", + product(x_shape), product(shape))); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); + } +}; + +class ReshapePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(reshape_p, paddle::operators::ReshapePrimOp, + paddle::operators::ReshapePrimOpMaker, + paddle::operators::ReshapePrimOpShapeInference, + paddle::operators::ReshapePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc b/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc new file mode 100644 index 0000000000000..420e6907e193d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ScatterAddPrimOp : public framework::OperatorBase { + public: + ScatterAddPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator scatter_add_p should not be excuted directly")); + } +}; + +class ScatterAddPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The tensor to apply scatter rule and add on."); + AddInput("Y", "(Tensor), The source tensor of scatter_add_p op."); + AddInput( + "IndexTensor", + "(Tensor), The index tensor of scatter_add_p op, which is a 1D tensor.") + .AsDispensable(); + AddOutput("Z", "(Tensor), The output tensor of scatter_add_p op."); + AddAttr("axis", + "(int64_t), The axis along which to scatter and add."); + AddAttr>( + "index", "(std::vector) The index of scatter_add_p op") + .SetDefault({0}); + AddComment(R"DOC( +Autograd primitive scatter_add_p operator. +)DOC"); + } +}; + +class ScatterAddPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + int64_t num_index = 0; + if (ctx->HasInput("IndexTensor")) { + framework::InferShapeVarPtr index_var_ptr = + ctx->GetInputVarPtrs("IndexTensor")[0]; + framework::VarDesc *index_var = + BOOST_GET(framework::VarDesc *, index_var_ptr); + auto index_shape = index_var->GetShape(); + PADDLE_ENFORCE_EQ(index_shape.size(), 1, + platform::errors::InvalidArgument( + "The index tensor should be a 1D tensor," + "but get rank %d", + index_shape.size())); + num_index = index_shape[0]; + } else { + num_index = ctx->Attrs().Get>("index").size(); + } + auto axis = ctx->Attrs().Get("axis"); + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + PADDLE_ENFORCE_EQ(y_shape[axis], num_index, + platform::errors::InvalidArgument( + "The shape of source input tensor at scatter axis " + "should be equal to num_index, " + "but get %d and %d", + y_shape[axis], num_index)); + for (size_t i = 0; i < x_rank; ++i) { + if (i != size_t(axis)) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_rank, y_rank)); + } + } + + BOOST_GET(framework::VarDesc *, 
z_var_ptr)->SetShape(x_shape); + } +}; + +class ScatterAddPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + if (ctx->HasInput("IndexTensor")) { + auto index_name = Input(ctx, "IndexTensor")[0]; + auto index_dtype = GetDataType(ctx, index_name); + PADDLE_ENFORCE_EQ( + index_dtype, framework::proto::VarType_Type_INT32, + platform::errors::InvalidArgument( + "The datatype of input tensor should be VarType_Type_INT32(%d), " + "but get %d", + framework::proto::VarType_Type_INT32, index_dtype)); + } + SetType(ctx, z_name, GetType(ctx, x_name)); + SetDataType(ctx, z_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(scatter_add_p, paddle::operators::ScatterAddPrimOp, + paddle::operators::ScatterAddPrimOpMaker, + paddle::operators::ScatterAddPrimOpShapeInference, + paddle::operators::ScatterAddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc b/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc new file mode 100644 index 0000000000000..6fff54cced550 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
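For reference, the constraint that ScatterAddPrimOpShapeInference enforces above (Y matches X everywhere except that Y's extent along the scatter axis equals the number of indices) corresponds to the accumulation semantics sketched below. This is illustrative only, restricted to 2-D row-major data with axis == 1 to keep the indexing simple; ScatterAddRef is an invented name, not a Paddle API.

#include <cassert>
#include <cstdint>
#include <vector>

// Z starts as a copy of X; column index[j] of Z accumulates column j of Y.
std::vector<float> ScatterAddRef(const std::vector<float> &x, int64_t rows,
                                 int64_t cols, const std::vector<float> &y,
                                 const std::vector<int64_t> &index) {
  std::vector<float> z = x;  // Z has the same shape as X
  for (int64_t r = 0; r < rows; ++r) {
    for (size_t j = 0; j < index.size(); ++j) {
      z[r * cols + index[j]] += y[r * index.size() + j];
    }
  }
  return z;
}

int main() {
  // X is 2x4, Y is 2x2, index = {0, 3}: columns 0 and 3 receive the update.
  std::vector<float> x(8, 1.0f), y(4, 2.0f);
  auto z = ScatterAddRef(x, 2, 4, y, {0, 3});
  assert(z[0] == 3.0f && z[1] == 1.0f && z[3] == 3.0f);
  return 0;
}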
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SliceAssignPrimOp : public framework::OperatorBase { + public: + SliceAssignPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator slice_assign_p should not be excuted directly")); + } +}; + +class SliceAssignPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The tensor to slice from and assign on."); + AddInput("Y", "(Tensor), The source tensor of slice_assign_p op."); + AddOutput("Z", "(Tensor), The output tensor of slice_assign_p op."); + AddAttr>( + "axis", "(std::vector), The axis along which to gather."); + AddAttr>( + "starts", + "(std::vector) The slice starts of slice_assign_p op"); + AddAttr>( + "ends", "(std::vector) The slice ends of slice_assign_p op"); + AddAttr>( + "strides", + "(std::vector) The slice strides of slice_assign_p op"); + AddComment(R"DOC( +Autograd primitive slice_assign_p operator. +)DOC"); + } +}; + +class SliceAssignPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + auto axis = ctx->Attrs().Get>("axis"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + auto strides = ctx->Attrs().Get>("strides"); + PADDLE_ENFORCE_EQ( + starts.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of starts attribute and axis attribute should be same, " + "but get %d and %d", + starts.size(), axis.size())); + PADDLE_ENFORCE_EQ( + ends.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of ends attribute and axis attribute should be same, " + "but get %d and %d", + ends.size(), axis.size())); + PADDLE_ENFORCE_EQ( + strides.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of strides attribute and axis attribute should be same, " + "but get %d and %d", + strides.size(), axis.size())); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + std::vector y_target_shape(x_shape); + for (size_t i = 0; i < axis.size(); ++i) { + y_target_shape[axis[i]] = + (ends[i] - starts[i] + strides[i] - 1) / strides[i]; + } + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ(y_target_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of source tensor of slice_assign_p op " + "at dimension %d should be %d, " + "but 
get %d", + i, y_target_shape[i], y_shape[i])); + } + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class SliceAssignPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, GetType(ctx, x_name)); + SetDataType(ctx, z_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(slice_assign_p, paddle::operators::SliceAssignPrimOp, + paddle::operators::SliceAssignPrimOpMaker, + paddle::operators::SliceAssignPrimOpShapeInference, + paddle::operators::SliceAssignPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/slice_select_p_op.cc b/paddle/fluid/operators/prim_ops/slice_select_p_op.cc new file mode 100644 index 0000000000000..9456ab403737d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/slice_select_p_op.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
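Both slice_assign_p (above) and slice_select_p (below) size each sliced axis with the same integer ceiling division, (end - start + stride - 1) / stride. A minimal sketch of that arithmetic, assuming plain C++ and an invented helper name:

#include <cassert>
#include <cstdint>

// Number of elements selected along one axis by a strided slice.
int64_t SlicedExtent(int64_t start, int64_t end, int64_t stride) {
  return (end - start + stride - 1) / stride;  // ceil((end - start) / stride)
}

int main() {
  // Matches the slice_select_p unit test: starts {0,0,0}, ends {5,7,9},
  // strides {2,2,2} over a {6,8,10} tensor give an output shape of {3,4,5}.
  assert(SlicedExtent(0, 5, 2) == 3);
  assert(SlicedExtent(0, 7, 2) == 4);
  assert(SlicedExtent(0, 9, 2) == 5);
  return 0;
}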
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SliceSelectPrimOp : public framework::OperatorBase { + public: + SliceSelectPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator slice_select_p should not be excuted directly")); + } +}; + +class SliceSelectPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of slice_select_p op."); + AddOutput("Y", "(Tensor), The output tensor of slice_select_p op."); + AddAttr>( + "axis", "(std::vector), The axis along which to gather."); + AddAttr>( + "starts", + "(std::vector) The slice starts of slice_select_p op"); + AddAttr>( + "ends", "(std::vector) The slice ends of slice_select_p op"); + AddAttr>( + "strides", + "(std::vector) The slice strides of slice_select_p op"); + AddComment(R"DOC( +Autograd primitive slice_select_p operator. +)DOC"); + } +}; + +class SliceSelectPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get>("axis"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + auto strides = ctx->Attrs().Get>("strides"); + PADDLE_ENFORCE_EQ( + starts.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of starts attribute and axis attribute should be same, " + "but get %d and %d", + starts.size(), axis.size())); + PADDLE_ENFORCE_EQ( + ends.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of ends attribute and axis attribute should be same, " + "but get %d and %d", + ends.size(), axis.size())); + PADDLE_ENFORCE_EQ( + strides.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of strides attribute and axis attribute should be same, " + "but get %d and %d", + strides.size(), axis.size())); + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = (ends[i] - starts[i] + strides[i] - 1) / strides[i]; + } + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class SliceSelectPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(slice_select_p, paddle::operators::SliceSelectPrimOp, + paddle::operators::SliceSelectPrimOpMaker, + paddle::operators::SliceSelectPrimOpShapeInference, + paddle::operators::SliceSelectPrimOpVarTypeInference); diff --git 
a/paddle/fluid/operators/prim_ops/split_p_op.cc b/paddle/fluid/operators/prim_ops/split_p_op.cc new file mode 100644 index 0000000000000..212692bf0355b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/split_p_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SplitPrimOp : public framework::OperatorBase { + public: + SplitPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator split_p should not be executed directly")); + } +}; + +class SplitPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of split_p op."); + AddOutput("YS", "(Tensor), The output tensors of split_p op.") + .AsDuplicable(); + AddAttr<int64_t>("axis", "(int64_t), The axis along which to split."); + AddAttr<std::vector<int64_t>>( + "num_or_sections", + "(std::vector<int64_t>) If num_or_sections has only one element, then " + "num_or_sections indicates the number of equal sized sub-Tensors that " + "the input will be divided into. If num_or_sections has more than one " + "element, its length indicates the number of sub-Tensors and its " + "elements are the sizes of the sub-Tensors along the split axis, in " + "order. The length of the vector must not be larger than the input's " + "size along the specified axis."); + AddComment(R"DOC( +Autograd primitive split_p operator. 
+)DOC"); + } +}; + +class SplitPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + auto y_var_ptrs = ctx->GetOutputVarPtrs("YS"); + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get<int64_t>("axis"); + auto num_or_sections = + ctx->Attrs().Get<std::vector<int64_t>>("num_or_sections"); + std::vector<int64_t> y_shape(x_shape); + if (num_or_sections.size() == 1) { + PADDLE_ENFORCE_EQ(x_shape[axis] % num_or_sections[0], 0, + platform::errors::InvalidArgument( + "The input tensor can't be divided equally into %d " + "parts along axis %d", + num_or_sections[0], axis)); + y_shape[axis] = x_shape[axis] / num_or_sections[0]; + for (size_t i = 0; i < size_t(num_or_sections[0]); ++i) { + BOOST_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); + } + } else { + int64_t cnt_along_axis = 0; + for (size_t i = 0; i < num_or_sections.size(); ++i) { + y_shape[axis] = num_or_sections[i]; + cnt_along_axis += num_or_sections[i]; + BOOST_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); + } + PADDLE_ENFORCE_EQ( + x_shape[axis], cnt_along_axis, + platform::errors::InvalidArgument( + "The input tensor has %d elements along axis %d, thus can't be " + "divided into %d tensors with %d elements in total.", + x_shape[axis], axis, num_or_sections.size(), cnt_along_axis)); + } + } +}; + +class SplitPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_names = Output(ctx, "YS"); + for (auto y_name : y_names) { + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(split_p, paddle::operators::SplitPrimOp, + paddle::operators::SplitPrimOpMaker, + paddle::operators::SplitPrimOpShapeInference, + paddle::operators::SplitPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sqrt_p_op.cc b/paddle/fluid/operators/prim_ops/sqrt_p_op.cc new file mode 100644 index 0000000000000..de4958d29f933 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/sqrt_p_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
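The two num_or_sections cases handled by SplitPrimOpShapeInference above can be checked with a small standalone sketch; SplitShapes is an invented helper name, and plain asserts stand in for PADDLE_ENFORCE_EQ.

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// A single element means "split into that many equal parts"; otherwise each
// element is the size of one output along the split axis, and the sizes must
// sum to the input extent on that axis.
std::vector<std::vector<int64_t>> SplitShapes(
    std::vector<int64_t> x_shape, int64_t axis,
    const std::vector<int64_t> &num_or_sections) {
  std::vector<std::vector<int64_t>> outs;
  if (num_or_sections.size() == 1) {
    assert(x_shape[axis] % num_or_sections[0] == 0);   // must divide evenly
    x_shape[axis] /= num_or_sections[0];
    outs.assign(num_or_sections[0], x_shape);
  } else {
    assert(std::accumulate(num_or_sections.begin(), num_or_sections.end(),
                           int64_t{0}) == x_shape[axis]);  // sections sum up
    for (int64_t s : num_or_sections) {
      x_shape[axis] = s;
      outs.push_back(x_shape);
    }
  }
  return outs;
}

int main() {
  // Matches the split_p unit test: {6,8,10} split on axis 1 into {2,4,2}.
  auto outs = SplitShapes({6, 8, 10}, 1, {2, 4, 2});
  assert(outs.size() == 3 && outs[1][1] == 4);
  // Single-element case: axis 2 split into 2 equal parts of size 5.
  auto halves = SplitShapes({6, 8, 10}, 2, {2});
  assert(halves.size() == 2 && halves[0][2] == 5);
  return 0;
}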
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SqrtPrimOp : public framework::OperatorBase { + public: + SqrtPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator sqrt_p should not be excuted directly")); + } +}; + +class SqrtPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of sqrt_p op."); + AddOutput("Y", "(Tensor), The output tensor of sqrt_p op."); + AddComment(R"DOC( +Autograd primitive sqrt_p operator. +)DOC"); + } +}; + +class SqrtPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class SqrtPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(sqrt_p, paddle::operators::SqrtPrimOp, + paddle::operators::SqrtPrimOpMaker, + paddle::operators::SqrtPrimOpShapeInference, + paddle::operators::SqrtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sub_p_op.cc b/paddle/fluid/operators/prim_ops/sub_p_op.cc new file mode 100644 index 0000000000000..f689f2d2d918b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/sub_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
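The binary prim ops in this patch (add_p, sub_p, mul_p, div_p) all repeat the same rank and per-dimension agreement check before giving Z the shape and dtype of X. A condensed standalone sketch of that rule, with an invented helper name and plain asserts in place of PADDLE_ENFORCE_EQ:

#include <cassert>
#include <cstdint>
#include <vector>

// Ranks and every dimension must match; Z then takes X's shape (and dtype).
std::vector<int64_t> BinaryElementwiseShape(const std::vector<int64_t> &x,
                                            const std::vector<int64_t> &y) {
  assert(x.size() == y.size());          // ranks must match
  for (size_t i = 0; i < x.size(); ++i) {
    assert(x[i] == y[i]);                // every dimension must match
  }
  return x;
}

int main() {
  auto z = BinaryElementwiseShape({3, 4, 5}, {3, 4, 5});
  assert(z.size() == 3 && z[2] == 5);
  return 0;
}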
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SubPrimOp : public framework::OperatorBase { + public: + SubPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator sub_p should not be excuted directly")); + } +}; + +class SubPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of sub_p op."); + AddInput("Y", "(Tensor), The input tensor of sub_p op."); + AddOutput("Z", "(Tensor), The output tensor of sub_p op."); + AddComment(R"DOC( +Autograd primitive sub_p operator. +)DOC"); + } +}; + +class SubPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class SubPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(sub_p, paddle::operators::SubPrimOp, + paddle::operators::SubPrimOpMaker, + paddle::operators::SubPrimOpShapeInference, + paddle::operators::SubPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/tanh_p_op.cc b/paddle/fluid/operators/prim_ops/tanh_p_op.cc new file mode 100644 index 0000000000000..c2afdcbe4b207 --- 
/dev/null +++ b/paddle/fluid/operators/prim_ops/tanh_p_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class TanhPrimOp : public framework::OperatorBase { + public: + TanhPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator tanh_p should not be excuted directly")); + } +}; + +class TanhPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of tanh_p op."); + AddOutput("Y", "(Tensor), The output tensor of tanh_p op."); + AddComment(R"DOC( +Autograd primitive tanh_p operator. +)DOC"); + } +}; + +class TanhPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class TanhPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(tanh_p, paddle::operators::TanhPrimOp, + paddle::operators::TanhPrimOpMaker, + paddle::operators::TanhPrimOpShapeInference, + paddle::operators::TanhPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/transpose_p_op.cc b/paddle/fluid/operators/prim_ops/transpose_p_op.cc new file mode 100644 index 0000000000000..b3b72318cd51d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/transpose_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class TransposePrimOp : public framework::OperatorBase { + public: + TransposePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator transpose_p should not be executed directly")); + } +}; + +class TransposePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of transpose_p op."); + AddOutput("Y", "(Tensor), The output tensor of transpose_p op."); + AddAttr<std::vector<int64_t>>("axis", + "(std::vector<int64_t>) Transpose axis."); + AddComment(R"DOC( +Autograd primitive transpose_p operator. +)DOC"); + } +}; + +class TransposePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get<std::vector<int64_t>>("axis"); + size_t x_rank = x_shape.size(); + size_t axis_size = axis.size(); + PADDLE_ENFORCE_EQ(x_rank, axis_size, + platform::errors::InvalidArgument( + "The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, axis_size)); + + std::vector<int64_t> count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE(axis[i], 0, + platform::errors::InvalidArgument( + "The axis should be greater than or equal to 0. " + "But received %d of axis[%d]", + axis[i], i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast<int64_t>(axis_size) && ++count[axis[i]] == 1, true, + platform::errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. 
" + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, axis[i], axis_size, i, count[axis[i]])); + } + std::vector y_shape(axis_size); + for (size_t i = 0; i < axis_size; i++) { + y_shape[i] = x_shape[axis[i]]; + } + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); + } +}; + +class TransposePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(transpose_p, paddle::operators::TransposePrimOp, + paddle::operators::TransposePrimOpMaker, + paddle::operators::TransposePrimOpShapeInference, + paddle::operators::TransposePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake new file mode 100644 index 0000000000000..5d6a732272b9b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake @@ -0,0 +1,20 @@ +register_unity_group(cc + reshape_p_op.cc + broadcast_p_op.cc + reduce_p_op.cc + transpose_p_op.cc + split_p_op.cc + concat_p_op.cc + slice_select_p_op.cc + slice_assign_p_op.cc + gather_p_op.cc + scatter_add_p_op.cc + add_p_op.cc + sub_p_op.cc + mul_p_op.cc + div_p_op.cc + sqrt_p_op.cc + tanh_p_op.cc + matmul_p_op.cc + fill_constant_p_op.cc + ) From c7623d72de13bf167559c7f4e68520244911ff25 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 14 Apr 2022 03:20:28 +0200 Subject: [PATCH 137/211] Added shuffle_channel BF16/FP32 FWD oneDNN kernel (#39756) * added shuffle_channel bf16/fp32 fwd kernel * added missing files * CI fix * changed from pten to phi * tmp save * added reviewers suggestions * fix for test --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../shuffle_channel_mkldnn_detect_pass.cc | 237 ++++++++++++++++++ .../shuffle_channel_mkldnn_detect_pass.h | 38 +++ .../inference/api/paddle_pass_builder.cc | 1 + .../mkldnn/shuffle_channel_mkldnn_op.cc | 77 ++++++ paddle/fluid/operators/shuffle_channel_op.cc | 18 +- .../test_mkldnn_shuffle_channel_op.py | 61 +++++ .../mkldnn/test_shuffle_channel_mkldnn_op.py | 62 +++++ 8 files changed, 492 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h create mode 100644 paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 834a2c953eab8..48ccadd037363 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -128,6 +128,7 @@ if(WITH_MKLDNN) pass_library(fc_mkldnn_pass inference DIR mkldnn) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR 
mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc new file mode 100644 index 0000000000000..bf603dc4bbcb9 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(reshape1_op); \ + GET_IR_NODE(reshape1_out); \ + GET_IR_NODE(transpose_op); \ + GET_IR_NODE(transpose_out); \ + GET_IR_NODE(reshape2_op); \ + GET_IR_NODE(reshape2_out); + +ShuffleChannelMKLDNNDetectPass::ShuffleChannelMKLDNNDetectPass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsOptional() + .IsTensor() + .End() + .AddInput("ShapeTensor") + .IsOptional() + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); +} + +void ShuffleChannelMKLDNNDetectPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "shufflechannel_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("reshape2", "X") + ->AsInput(); + + patterns::ShuffleChannelPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "The Pass in op compat failed."; + return; + } + PADDLE_ENFORCE_GT( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input X.")); + auto* input_node = subgraph.at(x); + auto reshape1_desc = reshape1_op->Op(); + auto reshape2_desc = reshape2_op->Op(); + auto trans_desc = transpose_op->Op(); + std::string input_name = input_node->Name(); + std::string output_name = reshape2_out->Name(); + + auto reshape1_shape = + BOOST_GET_CONST(std::vector, reshape1_desc->GetAttr("shape")); + auto reshape2_shape = + BOOST_GET_CONST(std::vector, reshape2_desc->GetAttr("shape")); + auto trans_axis = + BOOST_GET_CONST(std::vector, trans_desc->GetAttr("axis")); + auto* block1 = reshape1_desc->Block(); + auto* block2 = reshape2_desc->Block(); + if (block1 && block2) { + auto x_var_name = reshape1_desc->Input("X")[0]; + auto* x_var_desc = block1->FindVar(x_var_name); + auto x_shape1 = 
x_var_desc->GetShape(); + x_var_name = reshape2_desc->Input("X")[0]; + x_var_desc = block2->FindVar(x_var_name); + auto x_shape2 = x_var_desc->GetShape(); + // now shuffle_channel is 4D(NCHW) only. + if (x_shape1.size() != 4 || reshape1_shape.size() != 5 || + reshape2_shape.size() != 4 || trans_axis.size() != 5) { + return; + } + + // process 0 and -1 in reshape. + constexpr int64_t copy_dim_val = 0; + for (size_t i = 0; i < reshape1_shape.size(); i++) { + if (reshape1_shape[i] == copy_dim_val) { + reshape1_shape[i] = x_shape1[i]; + } + } + for (size_t i = 0; i < reshape2_shape.size(); i++) { + if (reshape2_shape[i] == copy_dim_val) { + reshape2_shape[i] = x_shape2[i]; + } + } + constexpr int64_t unk_dim_idx = -1; + bool all_positive = std::all_of(x_shape1.cbegin(), x_shape1.cend(), + [](int64_t i) { return i > 0; }); + for (size_t i = 0; i < reshape1_shape.size(); ++i) { + // if -1 is not in batch dim, try to calculate number + if ((reshape1_shape[i] == unk_dim_idx) && (i != 0)) { + // there is no sufficient info + if (!all_positive) return; + reshape1_shape[i] = + std::accumulate(x_shape1.begin(), x_shape1.end(), + static_cast(1), + std::multiplies()) / + std::accumulate(reshape1_shape.begin(), reshape1_shape.end(), + static_cast(-1), + std::multiplies()); + break; + } + } + + all_positive = std::all_of(x_shape2.cbegin(), x_shape2.cend(), + [](int64_t i) { return i > 0; }); + for (size_t i = 0; i < reshape2_shape.size(); ++i) { + // if -1 is not in batch dim, try to calculate number + if ((reshape2_shape[i] == unk_dim_idx) && (i != 0)) { + // there is no sufficient info + if (!all_positive) return; + reshape2_shape[i] = + std::accumulate(x_shape2.begin(), x_shape2.end(), + static_cast(1), + std::multiplies()) / + std::accumulate(reshape2_shape.begin(), reshape2_shape.end(), + static_cast(-1), + std::multiplies()); + break; + } + } + + // shuffle_channel dosen't change shape + if ((reshape2_shape[0] != -1) && (x_shape1[0] != reshape2_shape[0])) { + return; + } + for (size_t i = 1; i < x_shape1.size(); i++) { + if (x_shape1[i] != reshape2_shape[i]) { + return; + } + } + if ((reshape2_shape[3] != reshape1_shape[4]) || + (reshape2_shape[2] != reshape1_shape[3])) { + return; + } + } else { + return; // conservative judgement + } + + int i_c = reshape1_shape[2]; + int o_c = reshape2_shape[1]; + int group = o_c / i_c; + // should split on channel dim + if (reshape2_shape[1] != reshape1_shape[2] * reshape1_shape[1]) return; + // trans on channel dim + if (trans_axis[0] != 0 || trans_axis[3] != 3 || trans_axis[4] != 4) return; + if (group != 1 && i_c != 1) { + if (trans_axis[1] != 2 && trans_axis[2] != 1) { + return; + } + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("shuffle_channel"); + new_op_desc.SetInput("X", {input_name}); + new_op_desc.SetOutput("Out", {output_name}); + + new_op_desc.SetAttr("group", group); + new_op_desc.SetAttr("use_mkldnn", true); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_op = graph->CreateOpNode(&new_op_desc); + + IR_NODE_LINK_TO(input_node, new_op); + IR_NODE_LINK_TO(new_op, reshape2_out); + + // Delete the unneeded nodes. 
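+    // reshape2_out is kept alive: it is reused as the output of the new
+    // shuffle_channel node created above.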
+ GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op, + transpose_out, reshape2_op}); + LOG_FIRST_N(WARNING, 1) + << "There is fluid.layers.shuffle_channel API already, maybe you can " + "use it instead of (reshape + transpose + reshape)"; + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(shuffle_channel_mkldnn_detect_pass, + paddle::framework::ir::ShuffleChannelMKLDNNDetectPass); +REGISTER_PASS_CAPABILITY(shuffle_channel_mkldnn_detect_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("reshape2", 0) + .EQ("transpose2", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h new file mode 100644 index 0000000000000..231b63c3b6a00 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class ShuffleChannelMKLDNNDetectPass : public FusePassBase { + public: + ShuffleChannelMKLDNNDetectPass(); + virtual ~ShuffleChannelMKLDNNDetectPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 20418e37a7b94..d0fe3953d00d6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -298,6 +298,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 diff --git a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc new file mode 100644 index 0000000000000..408de57bf946d --- /dev/null +++ b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::MKLDNNGetDataType; +template +class ShuffleChannelMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + ShuffleChannelMKLDNNHandler(const Tensor* x, const int group, + const dnnl::engine engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + static constexpr int channel_axis = 1; + const auto md = dnnl::memory::desc(phi::vectorize(x->dims()), + MKLDNNGetDataType(), x->format()); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + md, channel_axis, group); + } +}; + +template +class ShuffleChannelMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + // oneDNN handles group using C/g instead of g + const int group = x->dims()[1] / ctx.Attr("group"); + + ShuffleChannelMKLDNNHandler handler(x, group, mkldnn_engine, + ctx.GetPlace()); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto shuffle_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + shuffle_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(x->format()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(shuffle_channel, MKLDNN, paddle::platform::CPUPlace, + ops::ShuffleChannelMKLDNNKernel, + ops::ShuffleChannelMKLDNNKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 119d2e7236946..70fddc9b04712 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -35,9 +35,17 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -56,6 +64,10 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument( "group should be larger than 0.")); }); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Shuffle Channel operator diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py new file mode 100644 index 0000000000000..26655970290cd --- /dev/null 
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import MkldnnAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +from hypothesis import given +import hypothesis.strategies as st + + +class TestMKLDNNShuffleChannelOp(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(*args, **kwargs): + return np.random.random(kwargs['in_shape']).astype(np.float32) + + shuffle_channel_op = OpConfig( + type="shuffle_channel", + inputs={"X": ["input_data"]}, + outputs={"Out": ["output_data"]}, + attrs={"group": kwargs['group']}) + + program_config = ProgramConfig( + ops=[shuffle_channel_op], + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input, + *args, **kwargs)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + group=st.sampled_from([1, 2, 8, 32, 128]), + in_shape=st.sampled_from([[5, 512, 2, 3], [2, 256, 5, 4]])) + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py new file mode 100644 index 0000000000000..1d657817503de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
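+
+# Reference semantics exercised below: for an input of shape (N, C, H, W) and
+# group g, shuffle_channel computes
+#   x.reshape(N, g, C // g, H, W).transpose(0, 2, 1, 3, 4).reshape(N, C, H, W)
+# i.e. the (reshape2 -> transpose2 -> reshape2) pattern that
+# shuffle_channel_mkldnn_detect_pass fuses into a single shuffle_channel op.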
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestShuffleChannelOneDNNOp(OpTest): + def setUp(self): + self.op_type = "shuffle_channel" + self.set_dtype() + self.set_group() + self.inputs = {'X': np.random.random((5, 64, 2, 3)).astype(self.dtype)} + self.attrs = {'use_mkldnn': True, 'group': self.group} + + _, c, h, w = self.inputs['X'].shape + input_reshaped = np.reshape(self.inputs['X'], + (-1, self.group, c // self.group, h, w)) + input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) + self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))} + + def set_dtype(self): + self.dtype = np.float32 + + def set_group(self): + self.group = 4 + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + +class TestShuffleChannelSingleGroupOneDNNOp(TestShuffleChannelOneDNNOp): + def set_group(self): + self.group = 1 + + +class TestShuffleChannelBF16OneDNNOp(TestShuffleChannelOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 8e2d4d3077b879833447ebb388552721930d4afb Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Thu, 14 Apr 2022 10:46:03 +0800 Subject: [PATCH 138/211] add mkldnn int8 pass [step3] (#41599) * add mkldnn int8 pass [step3] * Add test for compute_propagate_scales_mkldnn_pass * update pass * update api comment and python api Co-authored-by: wozna --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + ...ute_propagate_scales_mkldnn_pass_tester.cc | 276 ++++++++++++++++++ .../framework/ir/mkldnn/cpu_quantize_pass.cc | 67 ++++- .../framework/ir/mkldnn/cpu_quantize_pass.h | 6 + .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 4 - paddle/fluid/inference/analysis/argument.h | 2 + .../inference/analysis/ir_pass_manager.cc | 4 + paddle/fluid/inference/api/analysis_config.cc | 49 ++++ .../fluid/inference/api/analysis_predictor.cc | 7 + .../inference/api/paddle_analysis_config.h | 34 +++ .../inference/api/paddle_pass_builder.cc | 73 +++++ .../fluid/inference/api/paddle_pass_builder.h | 11 + .../fluid/inference/tests/api/CMakeLists.txt | 20 +- ...lyzer_quant_image_classification_tester.cc | 5 +- .../fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/pybind/inference_api.cc | 4 + 16 files changed, 542 insertions(+), 22 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 48ccadd037363..e8696a3c2276b 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -218,6 +218,7 @@ endif() cc_test(test_scale_matmul_fuse_pass SRCS mkldnn/scale_matmul_fuse_pass_tester.cc DEPS scale_matmul_fuse_pass) cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) cc_test(test_mkldnn_inplace_pass SRCS mkldnn/mkldnn_inplace_pass_tester.cc DEPS mkldnn_inplace_pass) + cc_test(test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor) cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) cc_test(test_cpu_quantize_pass SRCS 
mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc new file mode 100644 index 0000000000000..8a7fa1f51c7c7 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -0,0 +1,276 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +const std::array positive_and_negative_values = { + -0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +const std::vector> wx = { + {0.04347931, -0.5643393, 0.7551297, 0.26713502, 0.8055306, 0.91144973}, + {0.01707571, 0.12741385, 0.15419468, 0.66127586, 0.46821925, 0.9665961}, + {0.40393898, 0.884427, -0.5853097, 0.5840954, 0.9170512, 0.98245513}}; +const std::vector> wh = { + {0.42484227, -0.9025513, 0.17087583, 0.8403284, 0.03325734, 0.92331886}, + {0.32630175, 0.41691914, 0.99848574, 0.3504407, 0.06707559, 0.62239844}}; + +const std::vector gru_scales = {2.35381475, 1.08304947, 1.32427582, + 1.19001095, 1.00151656, 1.01785819}; + +const std::vector lstm_scales = {2.35381475, 1.10797026, 1.00151656, + 1.19001095, 1.09045166, 1.01785819}; + +static const std::initializer_list conv_variable_names{ + "conv_in", "filter", "bias", "conv_out"}; + +static const std::initializer_list rnn_variable_names{ + "x", "wx", "wh", "b", "h", "c"}; + +class ComputePropagateScalesMkldnnPassTest : public testing::Test { + public: + ComputePropagateScalesMkldnnPassTest() { + pass.reset(new ComputePropagateScalesMkldnnPass()); + } + + std::vector GetScales(Tensor* tensor, int axis) const { + return pass->GetScales(tensor, axis); + } + + void ComputeVarScales(ir::Graph* graph, Scope* scope, + const std::unordered_set ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const { + pass->ComputeVarScales(graph, scope, ops, weight_name, axis, + var_quant_scales); + } + + void ComputeGruWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const { + pass->ComputeGruWeightScales(graph, scope, wx_name, wh_name, + var_quant_scales); + } + + void ComputeLstmWeightScales(ir::Graph* graph, Scope* scope, + std::string wx_name, std::string wh_name, + StringPairMap* var_quant_scales) const { + pass->ComputeLstmWeightScales(graph, scope, wx_name, wh_name, + var_quant_scales); + } + + void InitTensorHolder(Scope* 
scope, const paddle::platform::Place& place, + const std::string& var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + auto tensor_size = 1; + if (var_name == "filter") { + tensor_size = positive_and_negative_values.size(); + } else if (var_name == "wx") { + tensor_size = wx.size(); + } else if (var_name == "wh") { + tensor_size = wh.size(); + } + tensor->mutable_data(place, + framework::TransToPhiDataType(proto::VarType::FP32), + tensor_size); + } + + void PrepareGraph(ir::Graph* graph, const ProgramDesc& prog, Scope* scope, + const std::initializer_list& variable_names) { + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + exe.CreateVariables(prog, 0, true, scope); + + for (auto& v : variable_names) { + InitTensorHolder(scope, place, v.c_str()); + } + graph->SetNotOwned(kParamScopeAttr, scope); + } + + void ComputeRnnWeightScalesTest(const std::string& type, + const std::initializer_list& ops, + const framework::ProgramDesc& prog, + std::vector scales) { + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, rnn_variable_names); + + std::string wx_name = "WeightX"; + std::string wh_name = "WeightH"; + std::string wx_var_names = "wx"; + std::string wh_var_names = "wh"; + + StringPairMap var_quant_scales; + + auto* wx_var = scope.FindVar(wx_var_names); + auto* wx_tensor = wx_var->GetMutable(); + wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); + for (size_t i = 0; i < wx.size(); i++) + std::copy(begin(wx[i]), end(wx[i]), + wx_tensor->mutable_data(platform::CPUPlace()) + + i * wx[0].size()); + + auto* wh_var = scope.FindVar(wh_var_names); + auto* wh_tensor = wh_var->GetMutable(); + wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); + for (size_t i = 0; i < wh.size(); i++) + std::copy(begin(wh[i]), end(wh[i]), + wh_tensor->mutable_data(platform::CPUPlace()) + + i * wh[0].size()); + if (type == "gru") { + ComputeGruWeightScales(graph, &scope, wx_name, wh_name, + &var_quant_scales); + } else { + ComputeLstmWeightScales(graph, &scope, wx_name, wh_name, + &var_quant_scales); + } + bool is_unsigned; + framework::Tensor wx_result_tensor; + + std::tie(is_unsigned, wx_result_tensor) = var_quant_scales[wx_var_names]; + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(wx_result_tensor.numel(), static_cast(scales.size())); + for (int64_t i = 0; i < wx_result_tensor.numel(); i++) { + ASSERT_FLOAT_EQ(wx_result_tensor.data()[i], scales[i]); + } + } + + private: + std::unique_ptr pass; +}; + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", {outputs[0]}); + } else if (type == "fusion_gru" || type == "fusion_lstm") { + op->SetInput("X", {inputs[0]}); + op->SetInput("WeightX", {inputs[1]}); + op->SetInput("WeightH", {inputs[2]}); + op->SetOutput("Hidden", {outputs[0]}); + if (type == "fusion_lstm") op->SetOutput("Cell", {outputs[1]}); + } +} + +ProgramDesc BuildConv2dProgramDesc() { + ProgramDesc prog; + for (auto& v : conv_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "conv2d", "Conv2d", {"conv_in", "filter", "bias"}, {"conv_out"}); + + return prog; +} + 
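+// Builds a graph with a single fusion_gru op (inputs x/wx/wh, output h) used by
+// the GRU weight-scale test below.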
+ProgramDesc BuildFusionGruProgramDesc() { + ProgramDesc prog; + for (auto& v : rnn_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "fusion_gru", "Fusion_gru", {"x", "wx", "wh"}, {"h"}); + + return prog; +} + +ProgramDesc BuildFusionLstmProgramDesc() { + ProgramDesc prog; + for (auto& v : rnn_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "fusion_lstm", "Fusion_lstm", {"x", "wx", "wh"}, {"h", "c"}); + + return prog; +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, get_scales_function) { + const auto& values = positive_and_negative_values; + float max_val = *std::max_element(values.begin(), values.end()); + + framework::Tensor var_tensor; + var_tensor.Resize(phi::make_dim(values.size(), 1)); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + std::vector results = GetScales(&var_tensor, 0); + + ASSERT_EQ(results.size(), std::size_t(1)); + ASSERT_EQ(results[0], (1.f / max_val)); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { + auto prog = BuildConv2dProgramDesc(); + const auto& values = positive_and_negative_values; + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, conv_variable_names); + + std::initializer_list ops = {"conv2d", "depthwise_conv2d"}; + std::string weight_name = "Filter"; + std::string weight_var_name = "filter"; + + auto axis = 1; + StringPairMap var_quant_scales; + + auto* var = scope.FindVar(weight_var_name); + auto* weight_tensor = var->GetMutable(); + weight_tensor->Resize(phi::make_dim(1, values.size())); + std::copy(begin(values), end(values), + weight_tensor->mutable_data(platform::CPUPlace())); + + auto max_val = *std::max_element(values.begin(), values.end()); + + ComputeVarScales(graph, &scope, ops, weight_name, axis, &var_quant_scales); + + bool is_unsigned; + framework::Tensor result_tensor; + + std::tie(is_unsigned, result_tensor) = var_quant_scales[weight_var_name]; + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(result_tensor.numel(), 1); + ASSERT_FLOAT_EQ(result_tensor.data()[0], (1.0 / max_val)); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_gru_weight_scales) { + ComputeRnnWeightScalesTest("gru", {"fusion_gru", "multi_gru"}, + BuildFusionGruProgramDesc(), gru_scales); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_lstm_weight_scales) { + ComputeRnnWeightScalesTest("lstm", {"fusion_lstm"}, + BuildFusionLstmProgramDesc(), lstm_scales); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 410dfbd680286..4aae60b853d4f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" - #include #include #include +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" @@ -226,12 +226,21 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, bool CPUQuantizePass::AreScalesPresentForVarNames( std::vector names) const { - auto& scales = Get("quant_var_scales"); bool present = true; - for (auto name : names) { - if (scales.find(name) == scales.end()) { - present = false; - LogScaleIsMissingForVarName(name); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + for (auto name : names) { + if (scales.find(name) == scales.end()) { + present = false; + LogScaleIsMissingForVarName(name); + } + } + } else { + for (auto name : names) { + if (var_quant_scales_->find(name) == var_quant_scales_->end()) { + present = false; + LogScaleIsMissingForVarName(name); + } } } return present; @@ -239,12 +248,21 @@ bool CPUQuantizePass::AreScalesPresentForVarNames( bool CPUQuantizePass::AreScalesPresentForNodes( std::initializer_list nodes) const { - auto& scales = Get("quant_var_scales"); bool present = true; - for (auto node : nodes) { - if (scales.count(node->Name()) == 0) { - present = false; - LogScaleIsMissingForVarNode(node); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + for (auto node : nodes) { + if (scales.count(node->Name()) == 0) { + present = false; + LogScaleIsMissingForVarNode(node); + } + } + } else { + for (auto node : nodes) { + if (var_quant_scales_->count(node->Name()) == 0) { + present = false; + LogScaleIsMissingForVarNode(node); + } } } return present; @@ -252,8 +270,11 @@ bool CPUQuantizePass::AreScalesPresentForNodes( std::pair CPUQuantizePass::GetScaleDataByName( const std::string& name) const { - auto& scales = Get("quant_var_scales"); - return scales.at(name); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + return scales.at(name); + } + return var_quant_scales_->at(name); } std::pair CPUQuantizePass::GetScaleDataForNode( @@ -290,6 +311,23 @@ bool CPUQuantizePass::IsOpQuantized(const Node* node) const { }); } +void CPUQuantizePass::GetQuantInfo(Graph* graph) const { + std::unordered_map> info_map{}; + GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); + + for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { + LoDTensor tensor; + const int size = static_cast(iter->second.size()); + auto* data = tensor.mutable_data({size}, platform::CPUPlace()); + for (int i = 0; i < size; i++) { + data[i] = static_cast(iter->second[i]); + } + + auto pair = std::make_pair(false, tensor); + var_quant_scales_->insert(std::make_pair(iter->first, pair)); + } +} + void CPUQuantizePass::QuantizeConv(Graph* graph, bool with_residual_data) const { GraphPatternDetector gpd; @@ -1138,6 +1176,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(param_scope(), platform::errors::InvalidArgument( "Scope cannot be nullptr.")); + GetQuantInfo(graph); QuantizeConv(graph, false /* with_residual_data */); QuantizeConv(graph, true /* with_residual_data */); QuantizePool(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 3a286264e41ff..f1e2527ae6ef0 100644 --- 
a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -95,6 +95,12 @@ class CPUQuantizePass : public FusePassBase { bool IsOpQuantized(const Node* node) const; const std::string name_scope_{"quantize"}; + + private: + VarQuantScale string_pair_map = {}; + VarQuantScale* const var_quant_scales_ = &string_pair_map; + + void GetQuantInfo(Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 808d043a4b226..55470db312f81 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -200,10 +200,8 @@ void QuantDequantMkldnnPass::CollectFakeQuantizeOps( for (auto* node_input : op_node->inputs) { if (node_input->Name() == x_var_name) { fake_quant_in = node_input; - break; } else if (node_input->Name() == in_scale_name) { fake_quant_in_scale = node_input; - break; } } @@ -212,10 +210,8 @@ void QuantDequantMkldnnPass::CollectFakeQuantizeOps( for (auto* node_output : op_node->outputs) { if (node_output->Name() == out_var_name) { fake_quant_out = node_output; - break; } else if (node_output->Name() == out_scale_name) { fake_quant_out_scale = node_output; - break; } } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 74e8ca3f229c6..2336fd1980d2e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -182,6 +182,8 @@ struct Argument { // A set of op types to enable their bfloat16 kernels DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes, std::unordered_set); + + DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool); #endif // Passed from config. diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d6eb39e767825..b2d8afaa7b49c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -107,6 +107,10 @@ void IRPassManager::CreatePasses(Argument *argument, "quantize_excluded_op_ids", new std::unordered_set(argument->quantize_excluded_op_ids())); } else if (pass_name == "cpu_quantize_pass") { + if (argument->quantize_enabled_op_types().count("conv2d") || + argument->quantize_enabled_op_types().count("depthwise_conv2d")) { + pass->Set("data_layout", new std::string("NHWC")); + } pass->Set("quant_var_scales", new VarQuantScale(argument->quant_var_scales())); } else if (pass_name == "cpu_bfloat16_placement_pass") { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d08d28a3f6233..4827fe6c1ac97 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -261,6 +261,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_mkldnn_bfloat16_); CP_MEMBER(bfloat16_enabled_op_types_); // Quantization related. 
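+  // MKLDNN INT8 state: the enable flag, the quantizable op-type set and the
+  // excluded op ids consumed by the INT8 passes.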
+ CP_MEMBER(use_mkldnn_int8_); + CP_MEMBER(quantize_enabled_op_types_); + CP_MEMBER(quantize_excluded_op_ids_); CP_MEMBER(use_mkldnn_quantizer_); CP_MEMBER(mkldnn_quantizer_config_); CP_MEMBER(min_input_shape_); @@ -435,6 +438,35 @@ void AnalysisConfig::EnableMkldnnBfloat16() { Update(); } +void AnalysisConfig::EnableMkldnnInt8( + const std::unordered_set &op_list) { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_int8_ = true; + use_fc_padding_ = false; + if (!op_list.empty()) { + for (auto &type : op_list) { + if (!quantize_enabled_op_types_.count(type)) { + LOG(ERROR) << "There are unsupported operators in the configured " + "quantization operator list. The unsupported operator " + "is: " + << type; + use_mkldnn_int8_ = false; + break; + } + } + if (use_mkldnn_int8_) { + quantize_enabled_op_types_.clear(); + quantize_enabled_op_types_.insert(op_list.begin(), op_list.end()); + } + } +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnInt8"; + use_mkldnn_int8_ = false; +#endif + + Update(); +} + MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, platform::errors::PreconditionNotMet( @@ -632,6 +664,20 @@ void AnalysisConfig::Update() { #endif } + if (use_mkldnn_int8_) { +#ifdef PADDLE_WITH_MKLDNN + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnInt8() only works when IR optimization " + "is enabled."; + } else if (!use_mkldnn_) { + LOG(ERROR) << "EnableMkldnnInt8() only works when MKLDNN " + "is enabled."; + } else { + pass_builder()->EnableMkldnnInt8(); + } +#endif + } + #ifdef PADDLE_WITH_MKLDNN // Do not optimize when mkldnn is on if (enable_memory_optim_ && !use_mkldnn_) { @@ -731,6 +777,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_mkldnn_quantizer_; ss << use_mkldnn_bfloat16_; for (auto &item : bfloat16_enabled_op_types_) ss << item; + ss << use_mkldnn_int8_; + for (auto &item : quantize_enabled_op_types_) ss << item; + for (auto &item : quantize_excluded_op_ids_) ss << item; ss << ";"; ss << model_from_memory_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 99d3f790e253c..f1d56000b03ca 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -949,6 +949,13 @@ void AnalysisPredictor::PrepareArgument() { LOG(INFO) << "Bfloat16 is enabled"; argument_.SetBfloat16EnabledOpTypes(config_.bfloat16_enabled_op_types_); } + + if (config_.use_mkldnn_int8_) { + LOG(INFO) << "Int8 is enabled"; + argument_.SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_); + argument_.SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_); + argument_.SetQuantVarScales({}); + } #endif auto passes = config_.pass_builder()->AllPasses(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index bdfe0e46e9ca4..d25f51e4fd41e 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -712,6 +712,20 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableMkldnnQuantizer(); + /// + /// \brief Turn on MKLDNN int8. + /// + /// \param op_list The operator type list. + /// + void EnableMkldnnInt8(const std::unordered_set& op_list = {}); + + /// + /// \brief A boolean state telling whether to use the MKLDNN Int8. + /// + /// \return bool Whether to use the MKLDNN Int8. 
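+  /// The flag stays false when the library is built without MKLDNN or when
+  /// EnableMkldnnInt8() receives an unsupported operator type.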
+ /// + bool mkldnn_int8_enabled() const { return use_mkldnn_int8_; } + /// /// \brief Turn on MKLDNN bfloat16. /// @@ -981,6 +995,26 @@ struct PD_INFER_DECL AnalysisConfig { std::shared_ptr mkldnn_quantizer_config_; bool use_mkldnn_bfloat16_{false}; std::unordered_set bfloat16_enabled_op_types_; + bool use_mkldnn_int8_{false}; + std::unordered_set quantize_excluded_op_ids_{}; + std::unordered_set quantize_enabled_op_types_{ + "concat", + "conv2d", + "depthwise_conv2d", + "elementwise_add", + "elementwise_mul", + "fc", + "matmul", + "nearest_interp", + "nearest_interp_v2", + "pool2d", + "prior_box", + "reshape2", + "transpose2", + "fusion_gru", + "fusion_lstm", + "multi_gru", + "slice"}; // ipu related. bool use_ipu_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index d0fe3953d00d6..ce733c53059b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -220,6 +220,10 @@ void GpuPassStrategy::EnableMkldnnBfloat16() { LOG(ERROR) << "GPU not support MKL-DNN bfloat16"; } +void GpuPassStrategy::EnableMkldnnInt8() { + LOG(ERROR) << "GPU not support MKL-DNN int8"; +} + CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. @@ -339,6 +343,75 @@ void CpuPassStrategy::EnableMkldnnBfloat16() { #endif } +void CpuPassStrategy::EnableMkldnnInt8() { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_int8_) { + passes_.clear(); + passes_.push_back("quant_dequant_mkldnn_pass"); + passes_.push_back("layer_norm_fuse_pass"); + passes_.push_back("attention_lstm_fuse_pass"); + passes_.push_back("seqconv_eltadd_relu_fuse_pass"); + passes_.push_back("fc_lstm_fuse_pass"); + passes_.push_back("mul_lstm_fuse_pass"); + passes_.push_back("fc_gru_fuse_pass"); + passes_.push_back("mul_gru_fuse_pass"); + passes_.push_back("multi_gru_fuse_pass"); + passes_.push_back("multi_gru_seq_fuse_pass"); + passes_.push_back("seq_concat_fc_fuse_pass"); + passes_.push_back("gpu_cpu_squeeze2_matmul_fuse_pass"); + passes_.push_back("gpu_cpu_reshape2_matmul_fuse_pass"); + passes_.push_back("gpu_cpu_flatten2_matmul_fuse_pass"); + passes_.push_back("matmul_v2_scale_fuse_pass"); + passes_.push_back("squared_mat_sub_fuse_pass"); + passes_.push_back("is_test_pass"); + passes_.push_back("gpu_cpu_map_matmul_v2_to_mul_pass"); + passes_.push_back("gpu_cpu_map_matmul_v2_to_matmul_pass"); + passes_.push_back("matmul_scale_fuse_pass"); + passes_.push_back("gpu_cpu_map_matmul_to_mul_pass"); + passes_.push_back("repeated_fc_relu_fuse_pass"); + passes_.push_back("mkldnn_placement_pass"); + passes_.push_back("depthwise_conv_mkldnn_pass"); + passes_.push_back("conv_bn_fuse_pass"); + passes_.push_back("conv_eltwiseadd_bn_fuse_pass"); + passes_.push_back("conv_transpose_bn_fuse_pass"); + passes_.push_back("conv_transpose_eltwiseadd_bn_fuse_pass"); + passes_.push_back("conv_bias_mkldnn_fuse_pass"); + passes_.push_back("conv_transpose_bias_mkldnn_fuse_pass"); + passes_.push_back("conv_elementwise_add_mkldnn_fuse_pass"); + passes_.push_back("conv_concat_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_leaky_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_relu6_mkldnn_fuse_pass"); + passes_.push_back("conv_swish_mkldnn_fuse_pass"); + passes_.push_back("conv_hard_swish_mkldnn_fuse_pass"); + passes_.push_back("conv_mish_mkldnn_fuse_pass"); + 
passes_.push_back("conv_hard_sigmoid_mkldnn_fuse_pass"); + passes_.push_back("conv_gelu_mkldnn_fuse_pass"); + passes_.push_back("fc_fuse_pass"); + passes_.push_back("repeated_fc_relu_fuse_pass"); + passes_.push_back("fc_mkldnn_pass"); + passes_.push_back("fc_act_mkldnn_fuse_pass"); + passes_.push_back("matmul_transpose_reshape_fuse_pass"); + passes_.push_back("matmul_v2_transpose_reshape_fuse_pass"); + passes_.push_back("batch_norm_act_fuse_pass"); + passes_.push_back("softplus_activation_mkldnn_fuse_pass"); + passes_.push_back("compute_propagate_scales_mkldnn_pass"); + passes_.push_back("scale_matmul_fuse_pass"); + passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass"); + passes_.push_back("reshape_transpose_matmul_v2_mkldnn_fuse_pass"); + passes_.push_back("cpu_quantize_placement_pass"); + passes_.push_back("cpu_quantize_pass"); + passes_.push_back("cpu_quantize_squash_pass"); + passes_.push_back("simplify_with_basic_ops_pass"); + passes_.push_back("mkldnn_inplace_pass"); + passes_.push_back("runtime_context_cache_pass"); + } + use_mkldnn_int8_ = true; +#else + use_mkldnn_int8_ = false; +#endif +} + IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) { passes_.assign({"inference_process_pass"}); } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 02290ed33ff1c..231ee2cb1e8e6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -139,6 +139,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable MKLDNN bfloat16. virtual void EnableMkldnnBfloat16() {} + /// \brief Enable MKLDNN int8. + virtual void EnableMkldnnInt8() {} + /// \brief Check if we are using gpu. /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } @@ -189,6 +192,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { use_mkldnn_ = other.use_mkldnn_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_; + use_mkldnn_int8_ = other.use_mkldnn_int8_; } /// \brief Default destructor. virtual ~CpuPassStrategy() = default; @@ -205,10 +209,14 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { /// \brief Enable MKLDNN bfloat16. void EnableMkldnnBfloat16() override; + /// \brief Enable MKLDNN int8. + void EnableMkldnnInt8() override; + protected: /// \cond Protected bool use_mkldnn_quantizer_{false}; bool use_mkldnn_bfloat16_{false}; + bool use_mkldnn_int8_{false}; /// \endcond }; @@ -243,6 +251,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Not supported in GPU mode yet. void EnableMkldnnBfloat16() override; + /// \brief Not supported in GPU mode yet. + void EnableMkldnnInt8() override; + /// \brief Default destructor. 
virtual ~GpuPassStrategy() = default; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8c96499a022f7..06d1cd0814eb2 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -168,7 +168,7 @@ function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary --disable_mkldnn_fc=${disable_fc}) endfunction() -function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path) +function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path enable_quant_int8) inference_analysis_test_run(${TARGET_NAME} COMMAND ${test_binary} ARGS --fp32_model=${fp32_model_dir} @@ -176,6 +176,7 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode --infer_data=${data_path} --batch_size=50 --enable_int8=true + --enable_quant_int8=${enable_quant_int8} --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} --with_accuracy_layer=false --iterations=2) @@ -554,7 +555,20 @@ if(WITH_MKLDNN) download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") endif(NOT LINUX) download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") - inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) + inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH} false) + + # Quant2 MobileNetV1 + inference_analysis_api_quant_test_run(test_analyzer_quant2_mobilenetv1_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${IMAGENET_DATA_PATH} true) + + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # with weight scales in `fake_channel_wise_dequantize_max_abs` operators + set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise") + set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz") + if(NOT LINUX) + download_quant_data_without_verify(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) + endif(NOT LINUX) + set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) + inference_analysis_api_quant_test_run(test_analyzer_quant2_resnet50_channelwise_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL} ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true) ### Other tests @@ -774,6 +788,8 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git 
a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc index 5e867fc87fea3..4bb59f3c8df42 100644 --- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc @@ -26,8 +26,7 @@ namespace analysis { void SetConfig(AnalysisConfig *cfg, std::string model_path) { cfg->SetModel(model_path); cfg->DisableGpu(); - cfg->SwitchIrOptim(false); - cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } @@ -113,9 +112,11 @@ void SetInput(std::vector> *inputs, TEST(Analyzer_quant_image_classification, quantization) { AnalysisConfig fp32_cfg; SetConfig(&fp32_cfg, FLAGS_fp32_model); + fp32_cfg.EnableMKLDNN(); AnalysisConfig int8_cfg; SetConfig(&int8_cfg, FLAGS_int8_model); + if (FLAGS_enable_quant_int8) int8_cfg.EnableMkldnnInt8(); // read data from file and prepare batches with test data std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index e63dfd14175b9..f2df018f4978a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -53,6 +53,7 @@ DEFINE_bool(with_accuracy_layer, true, DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction"); DEFINE_bool(enable_int8, false, "Enable INT8 type prediction"); +DEFINE_bool(enable_quant_int8, false, "Enable QUANT INT8 type prediction"); DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); // setting iterations to 0 means processing the whole dataset DEFINE_int32(iterations, 0, "number of batches to process"); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 97f3722008769..91d5d39622714 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -695,6 +695,10 @@ void BindAnalysisConfig(py::module *m) { .def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity, py::arg("capacity") = 0) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) + .def("enable_mkldnn_int8", &AnalysisConfig::EnableMkldnnInt8, + py::arg("mkldnn_int8_enabled_op_types") = + std::unordered_set({})) + .def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled) #endif .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) From 2ab986aeb7eecc7c28dc5b1907bf3f5ca72911e4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 14 Apr 2022 11:01:26 +0800 Subject: [PATCH 139/211] [Phi] Unify dispatch macros to visit (#41653) * chnage dispatch to visit * resolve conflict --- paddle/phi/api/ext/dispatch.h | 318 ++-------------- paddle/phi/api/lib/data_transform.cc | 1 - paddle/phi/core/visit_type.h | 338 ++++++++++++++++++ paddle/phi/kernels/cpu/cast_grad_kernel.cc | 2 + paddle/phi/kernels/cpu/cast_impl.h | 2 +- paddle/phi/kernels/cpu/cast_kernel.cc | 1 + .../kernels/cpu/cross_entropy_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/elementwise_kernel.cc | 1 - paddle/phi/kernels/cpu/reduce.h | 2 +- paddle/phi/kernels/cpu/transpose_kernel.cc | 3 +- paddle/phi/kernels/funcs/reduce_function.h | 1 - paddle/phi/kernels/gpu/cast_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/cast_impl.h | 2 +- 
paddle/phi/kernels/gpu/cast_kernel.cu | 4 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 6 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 30 +- paddle/phi/kernels/gpu/reduce.h | 1 + paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/gpu/transpose_kernel.cu | 1 - .../sparse/cpu/convolution_grad_kernel.cc | 5 +- .../kernels/sparse/cpu/convolution_kernel.cc | 5 +- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 7 +- .../sparse/cpu/sparse_pool_grad_kernel.cc | 5 +- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 5 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 2 +- .../sparse/gpu/convolution_grad_kernel.cu | 5 +- .../kernels/sparse/gpu/convolution_kernel.cu | 5 +- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 7 +- .../sparse/gpu/sparse_pool_grad_kernel.cu | 5 +- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 5 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/kernels/xpu/full_kernel.cc | 2 +- 32 files changed, 421 insertions(+), 364 deletions(-) create mode 100644 paddle/phi/core/visit_type.h diff --git a/paddle/phi/api/ext/dispatch.h b/paddle/phi/api/ext/dispatch.h index 6b6d0ae7fe723..aa9cd0f53a4c6 100644 --- a/paddle/phi/api/ext/dispatch.h +++ b/paddle/phi/api/ext/dispatch.h @@ -14,327 +14,57 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/visit_type.h" namespace paddle { -///////// Basic Marco /////////// - -#define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ - case enum_type: { \ - using HINT = type; \ - __VA_ARGS__(); \ - break; \ - } - -#define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ - PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__) +// Note: Keep this file only for compatibility with custom operators ///////// Floating Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_TYPES(TYPE, NAME, __VA_ARGS__) -#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT16, paddle::float16, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_HALF_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Integral Dispatch Marco /////////// -#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + PD_VISIT_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Complex Dispatch Marco /////////// -#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Integral Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex and other type Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPES( \ - SPECIFIED_TYPE, TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex and 2 other type Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( \ - SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE1, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE2, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, __VA_ARGS__) ///////// Floating, Integral and Complex Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() - -// TODO(chenweihang): Add more Marcos in the future if needed - -#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::BFLOAT16, \ - paddle::experimental::bfloat16, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::FLOAT16, \ - paddle::experimental::float16, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::COMPLEX64, \ - paddle::experimental::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::COMPLEX128, \ - paddle::experimental::complex128, \ - __VA_ARGS__) \ - default: \ - PADDLE_THROW(phi::errors::InvalidArgument( \ - "Invalid enum data type `%d`.", static_cast(__dtype__))); \ - } \ - }() - -#define PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( \ - SPECIFIED_TYPE1, SPECIFIED_TYPE2, SPECIFIED_TYPE3, TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::BOOL, bool, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE1, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE2, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE3, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 82d2e741e9de8..d4e92ded324da 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/backends/all_context.h" diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h new file mode 100644 index 0000000000000..bd972c8ceedc7 --- /dev/null +++ b/paddle/phi/core/visit_type.h @@ -0,0 +1,338 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/common/data_type.h" + +namespace phi { + +///////// Basic Marco /////////// + +#define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ + case enum_type: { \ + using HINT = type; \ + __VA_ARGS__(); \ + break; \ + } + +#define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ + PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__) + +///////// Floating Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +#define PD_VISIT_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT16, paddle::float16, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Integral Dispatch Marco /////////// + +#define PD_VISIT_INTEGRAL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Complex Dispatch Marco /////////// + +#define PD_VISIT_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Integral Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex and other type Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex and 2 other type Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE1, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE2, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::BFLOAT16, \ + paddle::experimental::bfloat16, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::FLOAT16, \ + paddle::experimental::float16, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::COMPLEX64, \ + paddle::experimental::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::COMPLEX128, \ + paddle::experimental::complex128, \ + __VA_ARGS__) \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Invalid enum data type `%d`.", static_cast(__dtype__))); \ + } \ + }() + +#define PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, SPECIFIED_TYPE3, TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE1, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE2, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE3, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/cast_grad_kernel.cc b/paddle/phi/kernels/cpu/cast_grad_kernel.cc index c294c743bd4cf..79f53cbce1a4a 100644 --- a/paddle/phi/kernels/cpu/cast_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_grad_kernel.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/cast_grad_kernel.h" + #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cast_impl.h b/paddle/phi/kernels/cpu/cast_impl.h index d39ef24e7beb1..9648b584243f5 100644 --- a/paddle/phi/kernels/cpu/cast_impl.h +++ b/paddle/phi/kernels/cpu/cast_impl.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/cpu/cpu_context.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index b53c94eb4cae2..2132f0d5ae86c 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -16,6 +16,7 @@ #include "paddle/phi/kernels/cpu/cast_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc index d4a632b5e6ece..021fdac225330 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc @@ -16,13 +16,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { template @@ -200,7 +198,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, axis, logits_grad); } else { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( dtype, "CrossEntropyWithSoftmaxGradCPUKernel", ([&] { CrossEntropyWithSoftmaxGradCPUKernel(dev_ctx, label, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 4ca41de7bb64a..a91ca1ee3244b 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index af67bdf5d624f..06a458832d19f 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -16,8 +16,8 @@ #include -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index 5dc4866e1efc3..a2f5aa2a29795 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_kernel.h" + #include -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 4eb6ba0310886..b414dfc5d6e84 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -35,7 +35,6 @@ namespace cub = hipcub; #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/gpu/cast_grad_kernel.cu b/paddle/phi/kernels/gpu/cast_grad_kernel.cu index 1c1d8cf2c06d4..f4b610301583c 100644 --- a/paddle/phi/kernels/gpu/cast_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_grad_kernel.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
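A minimal usage sketch of the new PD_VISIT_* macros that the kernels in this patch switch to (the helper name VisitIndices is illustrative, not from the source): the macro switches on the runtime phi::DataType and invokes the trailing lambda with data_t aliased to the matching C++ type.

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/visit_type.h"

// Illustrative helper: dispatch on an integral-typed indices tensor.
void VisitIndices(const phi::DenseTensor& indices) {
  PD_VISIT_INTEGRAL_TYPES(indices.dtype(), "VisitIndices", ([&] {
    // Here data_t is the concrete type matching indices.dtype(), one of
    // int8_t, uint8_t, int16_t, int, int64_t; unsupported dtypes throw.
    const data_t* ptr = indices.data<data_t>();
    (void)ptr;  // a real kernel would run its body templated on data_t
  }));
}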
-#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/gpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/cast_impl.h b/paddle/phi/kernels/gpu/cast_impl.h index 8f6351e675cfa..f73d396572541 100644 --- a/paddle/phi/kernels/gpu/cast_impl.h +++ b/paddle/phi/kernels/gpu/cast_impl.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 40a84648e4b16..a879dc3bafd74 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/gpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 215b94c52b395..c66daf4fe64e6 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -24,15 +24,13 @@ namespace cub = hipcub; #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" @@ -267,7 +265,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, axis, logits_grad); } else { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( dtype, "CrossEntropyWithSoftmaxGradGPUKernel", ([&] { CrossEntropyWithSoftmaxGradGPUKernel(dev_ctx, label, diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 055706cffd41e..1908c78060483 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -24,15 +24,13 @@ namespace cub = hipcub; #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" @@ -1529,19 +1527,19 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, softmax, loss); } else { 
- PD_DISPATCH_INTEGRAL_TYPES( - dtype, "CrossEntropyWithSoftmaxCUDAKernel", ([&] { - CrossEntropyWithSoftmaxCUDAKernel(dev_ctx, - logits, - label, - soft_label, - use_softmax, - numeric_stable_mode, - ignore_index, - axis, - softmax, - loss); - })); + PD_VISIT_INTEGRAL_TYPES(dtype, "CrossEntropyWithSoftmaxCUDAKernel", ([&] { + CrossEntropyWithSoftmaxCUDAKernel( + dev_ctx, + logits, + label, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + softmax, + loss); + })); } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index a54669c6e9d42..6fb81edd6bf47 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -18,6 +18,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index 1e39a08e9cbaf..e1f7419fb7a01 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 9ea2af292ccf1..203f10e4ddd47 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -14,7 +14,6 @@ #include -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 80693c90d1e7f..216685f0f7191 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -13,13 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -191,7 +190,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { Conv3dGradCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index a1c8cf014c7fb..c920f3c461287 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -15,10 +15,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -159,7 +158,7 @@ void Conv3dKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index a07a7fb2ecf44..c10a240c68430 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -16,13 +16,12 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -78,7 +77,7 @@ void SparseMaskKernel(const Context& dev_ctx, const DenseTensor& x, const SparseCooTensor& mask, SparseCooTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { SparseMaskCPUKernel(dev_ctx, x, mask, out); })); @@ -145,7 +144,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& mask_indices, DenseTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); })); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 30221975e7756..78b6354f44f9e 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -14,13 +14,12 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -82,7 +81,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, SparseCooTensor* x_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { MaxPoolGradCPUKernel( dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index ed6e0200587e8..28211a1cda347 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -15,12 +15,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -106,7 +105,7 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& strides, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { MaxPoolCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 5662a4fac71c5..1bceb767b6708 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -338,7 +338,7 @@ int ProductRuleBook(const Context& dev_ctx, SparseCooTensor* out, std::vector* h_counter, std::vector* h_offsets) { - // TODO(zhangkaihuo): use PD_DISPATCH_INTEGRAL_TYPES for secondary dispatch + // TODO(zhangkaihuo): use PD_VISIT_INTEGRAL_TYPES for secondary dispatch auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 2b61be7289646..6c37f759923c3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -18,14 +18,13 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -249,7 +248,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] { Conv3dGradGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 2d212eadffac1..83f19ce5785df 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -15,12 +15,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -177,7 +176,7 @@ void Conv3dKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 96ab56697b9b0..dff1cc2318f13 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -19,14 +19,13 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -118,7 +117,7 @@ void SparseMaskKernel(const Context& dev_ctx, const DenseTensor& x, const SparseCooTensor& mask, SparseCooTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { SparseMaskGPUKernel(dev_ctx, x, mask, out); })); @@ -265,7 +264,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& mask_indices, DenseTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); })); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index 8657e7319d8ca..bd862a44afeeb 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -18,14 +18,13 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -129,7 +128,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, SparseCooTensor* x_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { MaxPoolGradGPUKernel( dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu index a59cd3c7a5a78..b76b61f83bfc9 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -16,12 +16,11 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -136,7 +135,7 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& strides, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { MaxPoolGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 60df877355b82..f7ecf379fdfa9 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/phi/kernels/transfer_layout_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 6668ae39cbdbe..978bdb5129c04 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" From 5a6182b80481542a81b1cfdc7ff3dcdbf1926d4e Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 14 Apr 2022 11:08:23 +0800 Subject: [PATCH 140/211] infrt run once (A trick version) (#41634) * temporariliy run once * update * update * update * update * fix ci problem --- paddle/infrt/dialect/phi/ir/phi_base.cc | 2 +- paddle/infrt/dialect/tensorrt/convert.h | 2 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 2 +- paddle/infrt/host_context/op_executable.cc | 11 ++++- .../infrt/kernel/phi/dense_tensor_kernels.cc | 47 +++++++++++++------ .../infrt/kernel/phi/dense_tensor_kernels.h | 7 +-- paddle/infrt/kernel/tensor_kernels.cc | 7 ++- paddle/infrt/kernel/tensorrt/trt_kernels.cc | 2 +- paddle/infrt/kernel/tensorrt/trt_layers.h | 21 +++++++++ 9 files changed, 76 insertions(+), 25 deletions(-) diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index f91381fe72903..1bd6068d3fb96 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/phi/ir/phi_base.h" -#include +#include #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h index be363e77848a5..2a242ca285ba8 100644 --- a/paddle/infrt/dialect/tensorrt/convert.h +++ b/paddle/infrt/dialect/tensorrt/convert.h @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 2682a744bb056..dcb84ceb50edf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -87,7 +87,7 @@ int main(int argc, char** argv) { std::cout << "\npass failed!\n" << std::endl; return 4; } - // module->dump(); + module->dump(); ::infrt::host_context::TestMlir(module.get(), ®istry); return 0; } diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index 59a73e7108328..4d588a9c2b523 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/infrt/host_context/kernel_frame.h" #include "paddle/infrt/host_context/kernel_registry.h" @@ -71,7 +72,15 @@ OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name, // TODO(Superjomn) support other device other than CPU. CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name; - if (op_name == "dt.get_param") { + // TODO(wilber): Maybe we can use the MLIR trait or other facilities to remove + // the run_once set. 
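A compact sketch of the run-once idea introduced below, using a hypothetical free function in place of the constructor logic: ops that create engines, create contexts, or load parameters are flagged so the executor runs them a single time and reuses their results on later executions.

#include <string>
#include <unordered_set>

// Hypothetical helper; in the patch the check is done inside
// OpExecutableBuilder's constructor and sets impl_->run_once.
bool IsRunOnceOp(const std::string& op_name) {
  static const std::unordered_set<std::string> kRunOnceOps{
      "dt.get_param",
      "trt.create_engine",
      "phi_dt.create_host_inited_dense_tensor.f32",
      "phi_dt.create_context.cpu",
      "phi_dt.create_context.gpu"};
  return kRunOnceOps.count(op_name) > 0;
}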
+ std::unordered_set run_once_set{ + "dt.get_param", + "trt.create_engine", + "phi_dt.create_host_inited_dense_tensor.f32", + "phi_dt.create_context.cpu", + "phi_dt.create_context.gpu"}; + if (run_once_set.count(op_name)) { impl_->run_once = true; } } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index fe1cda0e10028..7ffc8de151075 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -22,6 +22,7 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef INFRT_WITH_GPU #include @@ -308,34 +309,50 @@ inline size_t SizeOfDataType(::phi::DataType data_type) { } return 0; } -::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, - const ::phi::GPUContext& context, - bool d2h) { +void GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h, + ::phi::DenseTensor* output) { if (d2h) { - ::phi::DenseTensor ret( - const_cast<::phi::Allocator*>(&context.GetHostAllocator()), - input.meta()); CHECK(input.place().GetType() == ::phi::AllocationType::GPU); - // TODO(wilber): Add sync op and stream. - cudaMemcpyAsync(ret.data(), + + // TODO(wilber): Just a trick to avoid malloc. + if (input.numel() > output->numel()) { + // TODO(wilber): Use pinned memory. + output->Resize(input.dims()); + context.HostAlloc( + output, input.dtype(), input.numel() * SizeOfDataType(input.dtype())); + } + + cudaMemcpyAsync(output->data(), input.data(), SizeOfDataType(input.dtype()) * input.numel(), cudaMemcpyDeviceToHost, - nullptr); - return ret; + context.stream()); + // TODO(wilber): Ir add sync op. + cudaStreamSynchronize(context.stream()); } else { // h2d - ::phi::DenseTensor ret( - const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta()); CHECK(input.place().GetType() == ::phi::AllocationType::CPU || input.place().GetType() == ::phi::AllocationType::GPUPINNED); + + if (input.numel() > output->numel()) { + output->Resize(input.dims()); + context.Alloc(output, + input.dtype(), + input.numel() * SizeOfDataType(input.dtype()), + false); + + } else { + output->Resize(input.dims()); + } + // TODO(wilber): Add sync op and stream. 
- cudaMemcpyAsync(ret.data(), + cudaMemcpyAsync(output->data(), input.data(), SizeOfDataType(input.dtype()) * input.numel(), cudaMemcpyHostToDevice, - nullptr); - return ret; + context.stream()); } } #endif diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index b1075444731b5..c401fb99978a3 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -76,9 +76,10 @@ ::infrt::phi::DenseTensorMap LoadCombinedParameters( int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); #ifdef INFRT_WITH_GPU -::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, - const ::phi::GPUContext& context, - bool d2h); +void GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h, + ::phi::DenseTensor* output); #endif } // namespace phi diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 65e137472b3d6..2e952e77d1f0a 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -119,6 +119,7 @@ void NaiveMatmul(const DenseHostTensor &x, const int N = w.shape().GetDim(1); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { + out_data[i * N + j] = 0; for (int k = 0; k < K; k++) { out_data[i * N + j] += x_data[i * K + k] * w_data[k * N + j]; } @@ -134,9 +135,11 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { {"shape"}); registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); registry->AddKernel("dt.fill_tensor_with_constant.f32", - INFRT_KERNEL(FillTensorWithConstant)); + INFRT_KERNEL(FillTensorWithConstant), + {"value"}); registry->AddKernel("dt.fill_tensor_with_constant.f64", - INFRT_KERNEL(FillTensorWithConstant)); + INFRT_KERNEL(FillTensorWithConstant), + {"value"}); // TensorMap related methods. registry->AddKernel("dt.load_params", INFRT_KERNEL(LoadParams)); diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index c182dda2705fd..c0f5ebb4a7657 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -57,7 +57,7 @@ ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( // TODO(wilber): The build option shoule be fiiled from mlir info. backends::tensorrt::BuildOptions options; options.max_batch = 4; - options.workspace = 1024; + options.workspace = 128; // Parse mlir Region which only has one block. mlir::Operation& operation = *create_engine_op.operation; diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h index 9d8eba0bb31f5..0f2c2c88ca097 100644 --- a/paddle/infrt/kernel/tensorrt/trt_layers.h +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -115,6 +115,27 @@ inline void PoolFunc(trt::PoolingOp& op, // NOLINT // TODO(Inference) // CHECK(false) << "Not supported adaptive pool"; + // TODO(wilber): Reformat. + // global average pooling. 
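// Note on the branch added just below (an illustrative reading, with the layout
// assumption called out): when the pooling type is kAVERAGE and the requested
// adaptive output size is 1x1, the op reduces to global average pooling, so it
// can be lowered to a plain TensorRT pooling layer whose window spans the full
// spatial extent of the input, i.e. input_shape.d[1] x input_shape.d[2] under a
// CHW layout with no batch dimension.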
+ auto ksize_vec = ArrayAttrToVec(ksize); + if (static_cast(pool_type) == + nvinfer1::PoolingType::kAVERAGE && + ksize_vec.size() == 2 && ksize_vec[0] == 1 && ksize_vec[1] == 1) { + nvinfer1::Dims dims; + dims.nbDims = 2; + dims.d[0] = input_shape.d[1]; + dims.d[1] = input_shape.d[2]; + auto* layer = network->addPoolingNd( + *input_itensor, static_cast(pool_type), dims); + CHECK_NOTNULL(layer); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; + return; + } + + // plugin... std::vector input_shape_v; for (int i = 0; i < input_dims; i++) { input_shape_v.push_back(input_shape.d[i]); From 419d8eb2442ac2f769448e61337466090a5b49bc Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:09:11 +0800 Subject: [PATCH 141/211] support weakref for eager tensor (#41769) --- paddle/fluid/pybind/eager.cc | 3 +++ paddle/fluid/pybind/eager.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c600844596d98..74d15b6c0ca79 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -709,6 +709,8 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } static void TensorDealloc(TensorObject* self) { + if (self->weakrefs != NULL) + PyObject_ClearWeakRefs(reinterpret_cast(self)); self->tensor.~Tensor(); Py_TYPE(self)->tp_free(reinterpret_cast(self)); } @@ -739,6 +741,7 @@ void BindEager(pybind11::module* module) { type->tp_getset = variable_properties; type->tp_init = TensorInit; type->tp_new = TensorNew; + type->tp_weaklistoffset = offsetof(TensorObject, weakrefs); Py_INCREF(&PyBaseObject_Type); type->tp_base = reinterpret_cast(&PyBaseObject_Type); type->tp_flags |= diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index bb55ef62ee689..03676a677ac90 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -22,6 +22,8 @@ namespace pybind { typedef struct { PyObject_HEAD paddle::experimental::Tensor tensor; + // Weak references + PyObject* weakrefs; } TensorObject; typedef struct { From e26e51ba87b343fd63c1bc2a0f8c158f1efd6162 Mon Sep 17 00:00:00 2001 From: xiayanming Date: Thu, 14 Apr 2022 11:21:15 +0800 Subject: [PATCH 142/211] [fix bug] communication op suppport rccl (#41763) --- paddle/fluid/operators/collective/alltoall_op.cu.cc | 6 +++--- .../operators/collective/c_comm_init_multitrainer_op.cc | 7 +++++-- paddle/fluid/operators/collective/global_gather_op.cu.cc | 6 +++--- paddle/fluid/operators/collective/global_scatter_op.cu.cc | 6 +++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 26fdee200cd84..0e0ea72208488 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/alltoall_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class AllToAllOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -43,7 +43,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index f69fe8f1e3f1f..86c966378ccb6 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -14,6 +14,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include #endif +#if defined(PADDLE_WITH_RCCL) +#include +#endif #include #include #include @@ -24,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" // #include "paddle/fluid/operators/distributed/distributed.h" // #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -51,7 +54,7 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input X must be provided.")); -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ncclUniqueId* nccl_id = var->GetMutable(); int ntrainers = Attr("ntrainers"); diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 4f9725a27062b..6684470e881cb 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/global_gather_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class GlobalGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); @@ -79,7 +79,7 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { ring_id)); auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 3a7e6a0079ac5..cd3c3a3229ca0 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/global_scatter_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class GlobalScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); @@ -78,7 +78,7 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); From 192f6f85fc358d681c9e87db961c14edf7595ca3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:25:12 +0800 Subject: [PATCH 143/211] fix bug of set NIGHTLY_MODE;test=document_fix;test=windows_ci (#41758) --- paddle/scripts/paddle_build.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 21df60e972121..8b9bfcf46042f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -75,7 +75,7 @@ if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF -if not defined NIGHTLY_MODE set PRECISION_TEST=OFF +if not defined NIGHTLY_MODE set NIGHTLY_MODE=OFF if not defined retry_times set retry_times=1 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined BUILD_DIR set BUILD_DIR=build From de2a3942e96a79ff61c987b874d0757939c6f1bd Mon Sep 17 00:00:00 
2001 From: Chen Weihang Date: Thu, 14 Apr 2022 11:33:13 +0800 Subject: [PATCH 144/211] remove inner_place using (#41768) --- paddle/fluid/eager/amp_auto_cast.h | 2 +- paddle/fluid/eager/eager_amp_auto_cast.h | 2 +- paddle/fluid/eager/grad_node_info.cc | 8 ++++---- paddle/fluid/eager/grad_node_info.h | 4 ++-- paddle/fluid/eager/grad_tensor_holder.cc | 4 ++-- paddle/fluid/eager/pylayer/py_layer_node.h | 2 +- .../data_structure_tests/eager_tensor_test.cc | 2 +- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 20 +++++++++---------- paddle/fluid/pybind/eager_method.cc | 12 +++++------ paddle/fluid/pybind/eager_properties.cc | 6 +++--- paddle/phi/api/include/tensor.h | 10 ---------- paddle/phi/api/lib/kernel_dispatch.cc | 2 +- paddle/phi/api/lib/tensor.cc | 19 +++--------------- paddle/phi/api/lib/tensor_method.cc | 9 ++++----- 15 files changed, 40 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index 6d5758adbe526..3a96b23dcebbb 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -21,7 +21,7 @@ namespace egr { static inline bool NeedCast(const paddle::experimental::Tensor& tensor, const paddle::experimental::DataType& dst_dtype) { - auto place = tensor.inner_place(); + auto place = tensor.place(); auto data_type = tensor.dtype(); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 9bd1ca1f6fe53..ee9da41881b2d 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -20,7 +20,7 @@ namespace egr { static inline bool NeedCast(const paddle::experimental::Tensor& tensor, const paddle::experimental::DataType& dst_dtype) { - auto place = tensor.inner_place(); + auto place = tensor.place(); auto data_type = tensor.dtype(); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 6afdd854344eb..72b84b9db3210 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -151,7 +151,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_out.inner_place()); + meta.SetPlace(fwd_out.place()); if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { @@ -210,7 +210,7 @@ void GradNodeBase::SetGradInMeta( "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_out_tensor.inner_place()); + meta.SetPlace(fwd_out_tensor.place()); if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { @@ -256,7 +256,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_in.inner_place()); + meta.SetPlace(fwd_in.place()); } } else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " @@ -301,7 +301,7 @@ void GradNodeBase::SetGradOutMeta( "phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_in_tensor.inner_place()); + meta.SetPlace(fwd_in_tensor.place()); } } 
else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 6a70a16a2416f..decb682bf4517 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -317,11 +317,11 @@ inline void CheckTensor(const paddle::experimental::Tensor& pre, paddle::framework::DataType2String(pre.dtype()), paddle::framework::DataType2String(post.dtype()))); PADDLE_ENFORCE_EQ( - pre.inner_place(), post.inner_place(), + pre.place(), post.place(), paddle::platform::errors::PermissionDenied( "The place of tensor before(%s) and after(%s) " "hook are not consistent", - pre.inner_place().DebugString(), post.inner_place().DebugString())); + pre.place().DebugString(), post.place().DebugString())); } } diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 183282d6f87b2..27a8c6002e29d 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -53,7 +53,7 @@ void GradTensorHolder::CopyValueFromTensor( paddle::experimental::Tensor& buffer_tensor = buffer_[slot_id][rank]; if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) { // Perform deep copy here - buffer_tensor.copy_(t, t.inner_place(), false); + buffer_tensor.copy_(t, t.place(), false); buffer_tensor.set_autograd_meta(t.mutable_autograd_meta()); } else { @@ -66,7 +66,7 @@ void GradTensorHolder::CopyValueFromTensor( if (t.defined()) { // Fill 1.0, use full to support complex, one_like don't support it. buffer_[slot_id][rank] = - paddle::experimental::full(t.shape(), 1, t.dtype(), t.inner_place()); + paddle::experimental::full(t.shape(), 1, t.dtype(), t.place()); } } } diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index f2e50494467c7..87e8acf88a694 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -62,7 +62,7 @@ class GradNodePyLayer : public GradNodeBase { } else { forward_outputs_meta_[i].emplace_back(); } - forward_outputs_place_[i].emplace_back(tensor->inner_place()); + forward_outputs_place_[i].emplace_back(tensor->place()); } } } diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index de9758b73d250..9afe3962faa29 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -96,7 +96,7 @@ TEST(Tensor, MemberFunction) { CHECK_EQ(et3.dims(), expected_dim); CHECK_EQ(et3.type(), paddle::experimental::DataType::FLOAT32); CHECK_EQ(et3.layout(), paddle::experimental::DataLayout::NCHW); - CHECK(paddle::platform::is_cpu_place(et3.inner_place())); + CHECK(paddle::platform::is_cpu_place(et3.place())); VLOG(6) << "Get impl"; auto* dt3_ptr = std::dynamic_pointer_cast(et3.impl())->data(); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 74d15b6c0ca79..c529d121f3945 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -137,7 +137,7 @@ void InitTensorWithTensor(TensorObject* self, const paddle::platform::Place& place, const std::string& name) { self->tensor.set_name(name); - if (place == src.inner_place()) { + if (place == src.place()) { auto impl = std::static_pointer_cast(src.impl()); self->tensor.set_impl(impl); VLOG(4) << "Same place, do ShareDataWith"; diff --git 
a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index fb115455357dd..1073cdc83a428 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -554,32 +554,32 @@ static PyObject* eager_api_async_read(PyObject* self, PyObject* args, src.is_gpu_pinned(), true, platform::errors::InvalidArgument("Required `src` device should be " "CUDAPinnedPlace, but received %d.", - src.inner_place())); + src.place())); PADDLE_ENFORCE_EQ( dst.is_gpu(), true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPlace, but received %d.", - dst.inner_place())); + dst.place())); PADDLE_ENFORCE_EQ( index.is_cpu(), true, platform::errors::InvalidArgument( "Required `index` device should be CPUPlace, but received %d.", - index.inner_place())); + index.place())); PADDLE_ENFORCE_EQ(buffer.is_gpu_pinned(), true, platform::errors::InvalidArgument( "Required `buffer` device should be CUDAPinnedPlace, " "but received %d.", - buffer.inner_place())); + buffer.place())); PADDLE_ENFORCE_EQ( offset.is_cpu(), true, platform::errors::InvalidArgument( "Required `offset` device should be CPUPlace, but received %d.", - offset.inner_place())); + offset.place())); PADDLE_ENFORCE_EQ( count.is_cpu(), true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d.", - count.inner_place())); + count.place())); auto& src_tensor = src; auto* dst_tensor = &dst; @@ -701,22 +701,22 @@ static PyObject* eager_api_async_write(PyObject* self, PyObject* args, src.is_gpu(), true, platform::errors::InvalidArgument( "Required `src` device should be CUDAPlace, but received %d. ", - src.inner_place())); + src.place())); PADDLE_ENFORCE_EQ(dst.is_gpu_pinned(), true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPinnedPlace, " "but received %d. ", - dst.inner_place())); + dst.place())); PADDLE_ENFORCE_EQ( offset.is_cpu(), true, platform::errors::InvalidArgument("Required `offset` device should " "be CPUPlace, but received %d. ", - offset.inner_place())); + offset.place())); PADDLE_ENFORCE_EQ( count.is_cpu(), true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d. ", - count.inner_place())); + count.place())); // TODO(daisiming): In future, add index as arguments following // async_read. 
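Throughout this patch the remaining `inner_place()` call sites are consolidated onto `place()`, and the user-visible `Tensor.place` property (bound elsewhere in this patch in eager_properties.cc) keeps the same behavior. A minimal sketch of reading device placement from Python, assuming a standard eager-mode paddle build; the printed form is illustrative:

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    # The Python-level `place` property is backed by Tensor::place() after this
    # patch; it previously went through the now-removed inner_place().
    print(x.place)  # e.g. Place(cpu) or Place(gpu:0)
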
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6dbed97a55f40..4610196726e75 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -342,11 +342,11 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, ->SetPersistable( egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); if (src_tensor.initialized()) { - self->tensor.copy_(src_tensor, src_tensor.inner_place(), blocking); + self->tensor.copy_(src_tensor, src_tensor.place(), blocking); } } else { if (src_tensor.initialized()) { - self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); + self->tensor.copy_(src_tensor, self->tensor.place(), blocking); } } @@ -934,7 +934,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, } else { SetTensorFromPyArray( static_cast(value_tensor_tmp.impl().get()), - value, value_tensor_tmp.inner_place(), false); + value, value_tensor_tmp.place(), false); } value_tensor = value_tensor_tmp; @@ -1018,7 +1018,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, platform::Place(platform::CPUPlace()), false); #endif } else { - SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(), + SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.place(), false); } } @@ -1367,7 +1367,7 @@ static PyObject* tensor_method__share_memory(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY #ifndef _WIN32 - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.inner_place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.place()), true, platform::errors::InvalidArgument( "Sharing memory only support CPU Tensor currently")); // 1. get LoDTensor @@ -1419,7 +1419,7 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, platform::errors::InvalidArgument( "Unified virtual addressing only support " "DenseTensor currently.")); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.inner_place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.place()), true, platform::errors::InvalidArgument( "Unified virtual addressing only support " "CPU Tensor currently.")); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a72ea6c4b02e1..797b68fcb36ea 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -108,7 +108,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, self->tensor.inner_place(), true); + grad->copy_(src, self->tensor.place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_NEG } @@ -160,14 +160,14 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { PyObject* tensor_properties_get_place(TensorObject* self, void* closure) { EAGER_TRY - return ToPyObject(self->tensor.inner_place()); + return ToPyObject(self->tensor.place()); EAGER_CATCH_AND_THROW_RETURN_NULL } PyObject* tensor_properties_get_place_str(TensorObject* self, void* closure) { EAGER_TRY std::stringstream ostr; - ostr << self->tensor.inner_place(); + ostr << self->tensor.place(); return ToPyObject(ostr.str()); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index d3efb7ca1c21e..3c3da4b749ed0 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -249,21 +249,11 
@@ class PADDLE_API Tensor final { /** * @brief Return the place (device) of Tensor. - * This is a deprecated method and may be removed in the future! * * @return Place */ Place place() const; - /** - * @brief Return the place (device) of Tensor. - * - * This is a deprecated method and may be removed in the future!!! - * - * @return Place - */ - Place inner_place() const; - /** * @brief Determine whether the tensor device is CPU * diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 6d97dc7657f00..a534f02663dff 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -126,7 +126,7 @@ Backend ParseBackend(const Place& place) { return phi::TransToPhiBackend(place); } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPhiBackend(tensor.inner_place()); + return phi::TransToPhiBackend(tensor.place()); } Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 07204b7ffcf61..7eff846bbc1e3 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -163,25 +163,12 @@ Place Tensor::place() const { return impl_->place(); } -Place Tensor::inner_place() const { - PADDLE_ENFORCE_NOT_NULL( - impl_, - phi::errors::PermissionDenied( - "Null pointer error, the impl_ of Tensor should not be " - "Null when calling Tensor::inner_place().")); - return impl_->place(); -} - -bool Tensor::is_cpu() const { - return paddle::platform::is_cpu_place(inner_place()); -} +bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(place()); } -bool Tensor::is_gpu() const { - return paddle::platform::is_gpu_place(inner_place()); -} +bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(place()); } bool Tensor::is_gpu_pinned() const { - return paddle::platform::is_cuda_pinned_place(inner_place()); + return paddle::platform::is_cuda_pinned_place(place()); } /* Part 4: Data Access methods */ diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 46ca457b2c10a..79519f67d2ad3 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -97,16 +97,15 @@ void Tensor::copy_(const Tensor &src, name(), src.name())); PADDLE_ENFORCE_EQ(target_place, - inner_place(), + place(), phi::errors::PreconditionNotMet( "Place is different of dst tensor and args %s, which " "current tensor holds %s " "Copy cannot be performed!", target_place, - inner_place())); - kernel_key_set.backend_set = - kernel_key_set.backend_set | - BackendSet(phi::TransToPhiBackend(inner_place())); + place())); + kernel_key_set.backend_set = kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(place())); } else { // Deep Copy AutoGrad info from src to self. 
*autograd_meta_ = *(src.autograd_meta_); From 4ae76d2179cf9812f76ea91ab8eb6007a5098ec7 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 14 Apr 2022 12:03:39 +0800 Subject: [PATCH 145/211] [Op]Fix adam/adamw beta1_pow/beta2_pow place while copying (#41732) --- paddle/phi/kernels/gpu/adamw_kernel.cu | 4 ++-- paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu | 4 ++-- paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 3555df11b5e1f..4873ba9c13d48 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -190,8 +190,8 @@ void AdamwDenseKernel(const Context& dev_ctx, phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 2cb086503283b..31abac149951d 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -139,8 +139,8 @@ void AdamDenseParamSparseGradKernel( phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index 0fc223e081506..b847f48d12267 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -156,8 +156,8 @@ void AdamwDenseParamSparseGradKernel( phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } From e0abb90b666b51c345070a180de4a6c0aab41c8d Mon Sep 17 00:00:00 2001 From: Vigi Zhang Date: Thu, 14 Apr 2022 12:06:45 +0800 Subject: [PATCH 146/211] add security policy (#41749) --- SECURITY.md | 58 +++++++++++++++++++++++++++++++++++++++++++ SECURITY_cn.md | 44 ++++++++++++++++++++++++++++++++ security/README.md | 12 +++++++++ security/README_cn.md | 12 +++++++++ 4 files changed, 126 insertions(+) create mode 100644 SECURITY.md create mode 100644 SECURITY_cn.md create mode 100644 security/README.md create mode 100644 security/README_cn.md diff --git a/SECURITY.md 
b/SECURITY.md new file mode 100644 index 0000000000000..490c804e9de9d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,58 @@ +# Using PaddlePaddle Securely + +This document describes model security and code security in PaddlePaddle. It also provides guidelines on how to report vulnerabilities in PaddlePaddle. + +## PaddlePaddle Model Security + +PaddlePaddle attaches great importance to security and privacy of model. This includes how to prevent the model from outputting wrong decision results under the interference when it is used in security-related and safety-critical scenarios, and how to avoid leaking data and privacy information from the model itself, the model gradient or the model inference results. + + + +[PaddleSleeve](https://github.com/PaddlePaddle/PaddleSleeve) provides a series of security and privacy tools, which can help model developers and users systematically evaluate and improve the model security and privacy in both development and deployment stages. + + + +These tools include adversarial example evaluation test, pseudo-natural environment robustness evaluation test, model reversing evaluation test, member inference evaluation test, sample denoising, adversarial training, privacy enhancement optimizer, etc. + +## PaddlePaddle Code Security + +PaddlePaddle always take code security seriously. However, due to the complexity of the framework and its dependence on other thirdparty open source libraries, there may still be some security issues undetected. Therefore, we hope that more security researchers and PaddlePaddle developers can participate in the code security program. We encourage responsible disclosure of security issues, as well as contributing code to improve our vulnerability finding tools to make PaddlePaddle safer. + +### Code security tools + +PaddlePaddle security team attaches great importance to the security of the framework. In order to find and fix security issues as soon as possible, we are continuously conducting code security audit and developing automatic vunerability discovery tools. We have already open sourced some of them to the community, hoping this could encourage people to contribute and improve the safety and robustness of PaddlePaddle. [This tool](https://github.com/PaddlePaddle/PaddleSleeve/tree/main/CodeSecurity) includes two parts. The dynamic part includes some op fuzzer samples. And the static part includes some CodeQL samples. Both of them are aim to find vulnerabilities in PaddlePaddle framework codebase. By referring the samples, security researchers can write their own fuzzers or QLs to test more PaddlePaddle modules, and find more code security issues. + +### Reporting vulnerabilities + +We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to security@paddlepaddle.org. + + + +After the security team receives your email, they will communicate with you in time. The security team will work to keep you informed of an issue fix. + + + +In order to reproduce and identify the issue, please include the following information along with your email: + +- The details of the vulnerability including how to reproduce it. Try to attach a PoC. +- The attack scenario and what an attacker might be able to achieve with this issue. +- Whether this vulnerability has been made public. If it is, please attach details. +- Your name and affiliation. 
+ +We will indicate the bug fix in the release of PaddlePaddle, and publish the vulnerability detail and the reporter in the security advisories (Your name will not be published if you choose to remain anonymous). + +### What is a vulnerability? + +In the process of computation graphs in PaddlePaddle, models can perform arbitrary computations , including reading and writing files, communicating with the network, etc. It may cause memory exhaustion, deadlock, etc., which will lead to unexpected behavior of PaddlePaddle. We consider these behavior to be security vulnerabilities only if they are out of the intention of the operation involved. + + + +Some unexpected parameters and behaviors have been checked in PaddlePaddle by throwing exceptions in Python or return error states in C++. In these cases, denial of service is still possible, but the exit of the PaddlePaddle is clean. Since the error handling of PaddlePaddle is expected and correct, these cases are not security vulnerabilities. + + + +If malicious input can trigger memory corruption or non-clean exit, such bug is considered a security problem. + + + +[security advisories](https://github.com/PaddlePaddle/Paddle/security/README.md) diff --git a/SECURITY_cn.md b/SECURITY_cn.md new file mode 100644 index 0000000000000..e9f503192c1eb --- /dev/null +++ b/SECURITY_cn.md @@ -0,0 +1,44 @@ +# 安全使用飞桨 + + + +本文将对飞桨模型及代码安全进行介绍,并介绍如何向飞桨提报漏洞。 + +## 飞桨模型安全 + +飞桨关注模型的安全性和隐私性。其中包括当模型被用于安全攸关场景时,如何避免模型在干扰下输出错误的决策结果,以及如何避免从模型本身、模型梯度或模型推理结果中泄露数据和隐私信息。 + +飞桨的安全和隐私套件[PaddleSleeve](https://github.com/PaddlePaddle/PaddleSleeve)提供了一系列工具,可帮助模型开发者及使用者在模型的开发或部署阶段,系统性地评估并提升模型的安全性和隐私性。这些工具包括对抗样本评估测试、拟自然环境鲁棒性评估测试、模型逆向评估测试、成员推断评估测试、样本去噪、对抗训练、隐私增强优化器等。 + +## 飞桨代码安全 + +飞桨团队一向非常重视代码安全,但鉴于飞桨框架的实现非常复杂,并且依赖了多个第三方开源库,其中仍可能会存在未被发现的问题。因此,我们希望有更多安全研究人员、飞桨开发者能参与到飞桨代码安全保障项目中来,我们鼓励向飞桨负责任的披露(Responsible Disclosure)安全问题,也鼓励向飞桨贡献代码完善动静态漏洞挖掘工具,让飞桨变得更安全。 + +### 安全工具 + +飞桨安全团队对于飞桨框架自身的安全高度重视,为了尽快地发现和修复安全问题,我们内部在持续地进行代码安全审计和研发自动化漏洞挖掘工具。我们将一些工具和方法开源给社区,希望能抛砖引玉,大家一起来贡献提高飞桨的安全性和鲁棒性。工具开源见[CodeSecurity](https://github.com/PaddlePaddle/PaddleSleeve/tree/main/CodeSecurity)。该开源工具包含两部分内容,分别从动态(模糊测试)和静态(CodeQL)两个角度对飞桨代码进行安全审计和漏洞挖掘。通过参照和添加新的测试模块,可以帮助覆盖更多飞桨代码模块,发现更多的代码安全问题。 + +### 报告安全问题 + +我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 security@paddlepaddle.org。 + +在安全团队收到邮件后将会及时与您沟通并反馈问题修复进度。 + +为了更好地复现和认定问题情况,请在邮件中: + +- 详细描述漏洞细节,如何复现,并尽量附上PoC。 +- 描述攻击场景,介绍攻击者可能由此问题所能达到的效果。 +- 该问题是否已公开并描述情况。 +- 署名您的姓名和从属关系。 + +我们会将漏洞修复情况注明在飞桨的发布当中,并在致谢公告中发布漏洞情况和提报人(如果您选择不公开署名将不会发布提报人信息)。 + +### 安全问题认定说明 + +飞桨在计算图的过程中,由于模型可以执行任何计算,操作文件,进行网络通信等功能,可能造成内存耗尽,死锁等情况发生,这将导致飞桨产生一些非预期的行为。我们认为只有当这些行为超出了所涉及的操作意图时才算作是安全问题。 + +飞桨框架代码中对于一些非预期的参数和行为会进行检查,Python代码中以抛出异常为形式,C++代码中以返回错误状态为形式。这些情况下,飞桨代码的退出是干净的,但仍可能会因此造成拒绝服务,然而由于飞桨的处理是预期且正确的,所以造成这些情况并不算作是安全问题。 + +如果输入非预期的参数后,对飞桨代码造成了内存破坏,或者非干净退出,这类行为被认定为存在安全问题。 + +### [安全公告](https://github.com/PaddlePaddle/Paddle/security/README_cn.md) diff --git a/security/README.md b/security/README.md new file mode 100644 index 0000000000000..ab3dab8c0cc70 --- /dev/null +++ b/security/README.md @@ -0,0 +1,12 @@ +# PaddlePaddle Security Advisories + +We regularly publish security advisories about using PaddlePaddle. + + + +*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](https://github.com/PaddlePaddle/Paddle/SECURITY.md). 
+ + +| Advisory Number | Type | Versions affected | Reported by | Additional Information| +| --------------- | ---- | :---------------: | ----------- | ----------------------| +| | | | | | diff --git a/security/README_cn.md b/security/README_cn.md new file mode 100644 index 0000000000000..2ae23046469d4 --- /dev/null +++ b/security/README_cn.md @@ -0,0 +1,12 @@ +# 飞桨安全公告 + +我们在此定期发布飞桨安全公告。 + + + +注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](https://github.com/PaddlePaddle/Paddle/SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 + + +| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | +| --------------- | ---- | :---------------: | ----------- | ----------------------| +| | | | | | From 7f73ef2c7304ea3a4d22659ac8701d36e588c4e3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 12:46:30 +0800 Subject: [PATCH 147/211] fix bfgs_doc (#41505) * fix bfgs_doc; test=document_fix * add parameter name; test=document_fix * modify according to chenlong's comments;test=document_fix --- .../incubate/optimizer/functional/bfgs.py | 82 +++++++------------ .../incubate/optimizer/functional/lbfgs.py | 78 ++++++++---------- 2 files changed, 66 insertions(+), 94 deletions(-) diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index abdab457fda00..23fd8dc0825f0 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -33,63 +33,43 @@ def minimize_bfgs(objective_func, name=None): r""" Minimizes a differentiable function `func` using the BFGS method. - The BFGS is a quasi-Newton method for solving an unconstrained - optimization problem over a differentiable function. - Closely related is the Newton method for minimization. Consider the iterate - update formula + The BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + .. math:: - x_{k+1} = x_{k} + H \nabla{f}, - If $H$ is the inverse Hessian of $f$ at $x_{k}$, then it's the Newton method. - If $H$ is symmetric and positive definite, used as an approximation of the inverse Hessian, then + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then it's a quasi-Newton. In practice, the approximated Hessians are obtained by only using the gradients, over either whole or part of the search - history, the former is BFGS. - - Reference: - Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. - pp140: Algorithm 6.1 (BFGS Method). - - Following summarizes the the main logic of the program based on BFGS. Note: _k represents value of - k_th iteration, ^T represents the transposition of a vector or matrix. - repeat - p_k = H_k * g_k - alpha = strong_wolfe(f, x_k, p_k) - x_k+1 = x_k + alpha * p_k - s_k = x_k+1 - x_k - y_k = g_k+1 - g_k - rho_k = 1 / (s_k^T * y_k) - V_k^T = I - rho_k * s_k * y_k^T - V_k = I - rho_k * y_k * s_k^T - H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T - check_converge - end + history, the former is BFGS, the latter is L-BFGS. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: - objective_func: the objective function to minimize. 
``func`` accepts - a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton - the initial trial step length should always be 1.0. - max_iters (int): the maximum number of minimization iterations. - tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. - tolerance_change (float): terminates if the change of function value/position/parameter between - two iterations is smaller than this value. - initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation at initial_position. - It must be symmetric and positive definite. - line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support - 'Hager Zhang' in the futrue. - max_line_search_iters (int): the maximum number of line search iterations. - initial_step_length (float): step length used in first iteration of line search. different initial_step_length - may cause different optimal result. - dtype ('float32' | 'float64'): In static graph, float64 will be convert to float32 due to paddle.assign limit. - + objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. + initial_position (Tensor): the starting point of the iterates. + max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. + tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. + tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. + max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. + initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. + dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. + Returns: - is_converge (bool): Indicates whether found the minimum within tolerance. - num_func_calls (int): number of objective function called. - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of - the objective function regrading to the initial position. - objective_value (Tensor): objective function value at the `position`. - objective_gradient (Tensor): objective function gradient at the `position`. - inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. + output(tuple): + + - is_converge (bool): Indicates whether found the minimum within tolerance. + - num_func_calls (int): number of objective function called. 
+ - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regrading to the initial position. + - objective_value (Tensor): objective function value at the `position`. + - objective_gradient (Tensor): objective function gradient at the `position`. + - inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. Examples: .. code-block:: python diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index d4bf511f85a99..f283381597733 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -32,54 +32,46 @@ def minimize_lbfgs(objective_func, initial_step_length=1.0, dtype='float32', name=None): - r"""Minimizes a differentiable function `func` using the L-BFGS method. - The L-BFGS is simalar as BFGS, the only difference is that L-BFGS use historical - sk, yk, rhok rather than H_k-1 to compute Hk. + r""" + Minimizes a differentiable function `func` using the L-BFGS method. + The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + + .. math:: + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + it's a quasi-Newton. In practice, the approximated Hessians are obtained + by only using the gradients, over either whole or part of the search + history, the former is BFGS, the latter is L-BFGS. + Reference: - Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. - pp179: Algorithm 7.5 (L-BFGS). - - Following summarizes the the main logic of the program based on L-BFGS.Note: _k represents - value of k_th iteration, ^T represents the transposition of a vector or matrix. - repeat - compute p_k by two-loop recursion - alpha = strong_wolfe(f, x_k, p_k) - x_k+1 = x_k + alpha * p_k - s_k = x_k+1 - x_k - y_k = g_k+1 - g_k - rho_k = 1 / (s_k^T * y_k) - update sk_vec, yk_vec, rhok_vec - check_converge - end + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - objective_func: the objective function to minimize. ``func`` accepts - a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton - the initial trial step length should always be 1.0 . - history_size (Scalar): the number of stored vector pairs {si,yi}. - max_iters (Scalar): the maximum number of minimization iterations. - tolerance_grad (Scalar): terminates if the gradient norm is smaller than - this. Currently gradient norm uses inf norm. - tolerance_change (Scalar): terminates if the change of function value/position/parameter between - two iterations is smaller than this value. - initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation. - line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support - 'Hager Zhang' in the futrue. - max_line_search_iters (Scalar): the maximum number of line search iterations. - initial_step_length: step length used in first iteration of line search. 
different initial_step_length - may cause different optimal result. - dtype ('float' | 'float32' | 'float64' | 'double'): the data - type to be used. - + objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. + initial_position (Tensor): the starting point of the iterates. + history_size (Scalar): the number of stored vector pairs {si,yi}. Default value: 100. + max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. + tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. + tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. + max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. + initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. + dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. + Returns: - is_converge (bool): Indicates whether found the minimum within tolerance. - num_func_calls (int): number of objective function called. - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of - the objective function regrading to the initial position. - objective_value (Tensor): objective function value at the `position`. - objective_gradient (Tensor): objective function gradient at the `position`. + output(tuple): + - is_converge (bool): Indicates whether found the minimum within tolerance. + - num_func_calls (int): number of objective function called. + - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regrading to the initial position. + - objective_value (Tensor): objective function value at the `position`. + - objective_gradient (Tensor): objective function gradient at the `position`. + Examples: .. 
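As a usage sketch for the two routines documented above, based only on the signatures described in these docstrings (the quadratic objective and starting point are illustrative, not part of the patch):

    import paddle
    from paddle.incubate.optimizer.functional import minimize_bfgs, minimize_lbfgs

    def objective(x):
        # Simple convex quadratic, f(x) = x . x, minimized at the origin.
        return paddle.dot(x, x)

    x0 = paddle.to_tensor([1.3, 2.7])

    # Returns (is_converge, num_func_calls, position, objective_value,
    #          objective_gradient, inverse_hessian_estimate).
    results = minimize_bfgs(objective, x0)
    print(results[0])  # is_converge
    print(results[2])  # position, expected to be close to [0., 0.]

    # minimize_lbfgs is called the same way and additionally accepts history_size;
    # its result tuple has no inverse_hessian_estimate entry.
    results = minimize_lbfgs(objective, x0)
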
code-block:: python From ad9585b6697f749fae479ad103bb18b549446255 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 14 Apr 2022 13:42:09 +0800 Subject: [PATCH 148/211] [DoubleGrad] Enabled test_autograd_functional_dynamic.py under eager mode (#41668) * [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad * Fixed elementwise issue * Addressed CI failures * [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode * [DoubleGrad] Enabled test_autograd_functional_dynamic.py under eager mode * Enabled more test cases * Fixed performance issues * Fixed minor issue --- .../final_state_generator/codegen_utils.py | 13 +- paddle/fluid/eager/autograd_meta.h | 2 + paddle/fluid/eager/backward.cc | 20 +-- paddle/fluid/eager/grad_node_info.h | 2 +- paddle/fluid/eager/tensor_wrapper.h | 31 ++-- paddle/phi/infermeta/backward.h | 4 +- python/paddle/autograd/functional.py | 4 + .../test_autograd_functional_dynamic.py | 146 ++++++++++++++---- python/paddle/tensor/linalg.py | 10 +- python/paddle/utils/code_gen/backward.yaml | 44 ++++++ 10 files changed, 212 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 6219ecee17f30..96af7dfc4fe65 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -22,9 +22,16 @@ ### Global Variables ### ######################## ops_to_fill_zero_for_empty_grads = set([ - "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", - "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", - "add_triple_grad" + "split_grad", + "rnn_grad", + "matmul_double_grad", + "matmul_triple_grad", + "sigmoid_double_grad", + "sigmoid_triple_grad", + "add_double_grad", + "add_triple_grad", + "multiply_double_grad", + "multiply_triple_grad", ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index dca76d3b8a0db..2241ccca81ca4 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -107,6 +107,8 @@ class AutogradMeta : public AbstractAutogradMeta { GradNodeBase* GradNode() const { return grad_node_.get(); } + void ResetGradNode() { grad_node_.reset(); } + void SetSingleOutRankWithSlot(size_t slot_id, size_t rank) { out_slot_id_ = slot_id; out_rank_ = rank; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 3b555eda8fff7..6db606edf6f4c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -53,7 +53,7 @@ class GeneralGrad { auto* target_node = auto_grad_meta->GetMutableGradNode().get(); if (orig_to_copied_node_mapping_.count(target_node)) { - target_node = orig_to_copied_node_mapping_[target_node]; + target_node = orig_to_copied_node_mapping_[target_node].get(); } else { VLOG(6) << "Unable to find target node in " "orig_to_copied_node_mapping_, likely indicating an " @@ -261,7 +261,7 @@ class GeneralGrad { auto* target_node = auto_grad_meta->GetMutableGradNode().get(); if (orig_to_copied_node_mapping_.count(target_node)) { - target_node = orig_to_copied_node_mapping_[target_node]; + target_node = orig_to_copied_node_mapping_[target_node].get(); } else { VLOG(6) << "Unable to find target node in " "orig_to_copied_node_mapping_, likely indicating an unused " @@ -349,12 +349,12 @@ 
class GeneralGrad { GradNodeBase* CopyGradNode(const std::shared_ptr& orig_node) { if (orig_to_copied_node_mapping_.count(orig_node.get())) { - return orig_to_copied_node_mapping_[orig_node.get()]; + return orig_to_copied_node_mapping_[orig_node.get()].get(); } std::shared_ptr copied_node = orig_node->Copy(); // Save node and update mapping - orig_to_copied_node_mapping_[orig_node.get()] = copied_node.get(); + orig_to_copied_node_mapping_[orig_node.get()] = copied_node; copied_grad_nodes_.push_back(copied_node); return copied_node.get(); @@ -379,7 +379,7 @@ class GeneralGrad { paddle::platform::errors::Fatal( "Cannot reconstruct backward graph," "unable to find copied target for certain grad node.")); - GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node]; + GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node].get(); const std::vector>& orig_edges = orig_node->GetEdges(); std::vector>& copied_edges = @@ -397,13 +397,12 @@ class GeneralGrad { std::shared_ptr copied_next_node; if (orig_to_copied_node_mapping_.count(orig_next_node.get())) { copied_next_node = - orig_to_copied_node_mapping_[orig_next_node.get()] - ->shared_from_this(); + orig_to_copied_node_mapping_[orig_next_node.get()]; } else { copied_next_node = orig_next_node->Copy(); orig_to_copied_node_mapping_[orig_next_node.get()] = - copied_next_node.get(); + copied_next_node; copied_grad_nodes_.push_back(copied_next_node); } @@ -436,7 +435,8 @@ class GeneralGrad { std::unordered_map results_map; std::vector> copied_grad_nodes_; - std::unordered_map orig_to_copied_node_mapping_; + std::unordered_map> + orig_to_copied_node_mapping_; DISABLE_COPY_AND_ASSIGN(GeneralGrad); }; @@ -534,6 +534,7 @@ std::vector RunBackward( // GeneralGrad bool is_general_grad = !inputs.empty(); + if (is_general_grad) GeneralGrad::Instance().Clear(); /* --- Initialization --- */ // 1. 
Init queue with starting nodes @@ -746,6 +747,7 @@ std::vector RunBackward( VLOG(6) << "We get grad_output_tensor with slot: " << i << ", rank: " << j << " as uninitialized or undefined tensor"; } + VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index decb682bf4517..201aae294f928 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -87,7 +87,7 @@ class GradSlotMeta { std::shared_ptr meta_ = nullptr; }; -class GradNodeBase : public std::enable_shared_from_this { +class GradNodeBase { public: GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index b5dd6b960b23a..e42e04a266b46 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -79,9 +79,9 @@ class TensorWrapper { auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); if (tensor_autograd_meta) { - auto autograd_meta = std::make_shared( - Edge(nullptr, EagerUtils::OutRankInfo(tensor))); - autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient()); + auto autograd_meta = + std::make_shared(*tensor_autograd_meta); + autograd_meta->ResetGradNode(); intermidiate_tensor_.set_autograd_meta(autograd_meta); weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode(); } @@ -98,8 +98,11 @@ class TensorWrapper { check_inplace_version(); // if it's full_reserved just return the full copy of tensor - paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; - if (!full_reserved_) { + if (full_reserved_) { + return intermidiate_tensor_; + } else { + paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + std::shared_ptr new_grad_node = weak_grad_node_.lock(); if (new_grad_node) { VLOG(3) << "Recovered TensorWrapper with GradNode " @@ -109,17 +112,15 @@ class TensorWrapper { } auto* intermediate_autograd_meta = EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); - auto p_ab_autograd_meta = std::make_shared( - Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo())); - p_ab_autograd_meta->SetStopGradient( - intermediate_autograd_meta->StopGradient()); - - recovered_tensor.set_autograd_meta( - std::static_pointer_cast( - p_ab_autograd_meta)); - } + auto p_ab_autograd_meta = + std::make_shared(*intermediate_autograd_meta); + if (new_grad_node) { + p_ab_autograd_meta->SetGradNode(new_grad_node); + } + recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - return recovered_tensor; + return recovered_tensor; + } } void check_inplace_version() { diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 6e730c83d1d50..c51708bb54394 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -100,6 +100,8 @@ void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& out_grad, MetaTensor* x_grad); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -132,8 +134,6 @@ void GeneralQuinaryGradInferMeta(const MetaTensor& x, MetaTensor* dk, MetaTensor* dl); -void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); - void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, diff --git 
a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 8e027c270b700..93142c9112fd0 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -943,8 +943,10 @@ def func(x, y): # [0., 1., 0., 1., 0., 1., 0., 1.]])) ''' + inputs = _as_tensors(inputs) outputs = _as_tensors(func(*inputs)) + batch_size = inputs[0].shape[0] for input in inputs: assert input.shape[ @@ -961,12 +963,14 @@ def func(x, y): for i, flat_output in enumerate(flat_outputs): jac_i = list([] for _ in range(fin_size)) for k in range(flat_output.shape[1]): + row_k = paddle.grad( flat_output[:, k], inputs, create_graph=create_graph, retain_graph=True, allow_unused=allow_unused) + for j in range(fin_size): jac_i[j].append( paddle.reshape( diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index 8c725fe24e59c..40aead9076569 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -205,7 +205,7 @@ def func_vjp_aliased_input(self): self.check_results(ref_result, aliased_result) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_vjp_i1o1() self.func_vjp_i2o1() self.func_vjp_i2o2() @@ -213,6 +213,13 @@ def test_all_cases(self): self.func_vjp_nested() self.func_vjp_aliased_input() + self.func_vjp_i1o1() + self.func_vjp_i2o1() + self.func_vjp_i2o2() + self.func_vjp_i2o2_omitting_v() + self.func_vjp_nested() + self.func_vjp_aliased_input() + @utils.place(config.DEVICES) @utils.parameterize( @@ -227,8 +234,9 @@ def func_vjp(self): paddle.to_tensor(self.v)) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_vjp() + self.func_vjp() def jac(grad_fn, f, inputs): @@ -303,11 +311,15 @@ def func_jvp_i2o2_omitting_v(self): self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jvp_i1o1() self.func_jvp_i2o1() self.func_jvp_i2o2() self.func_jvp_i2o2_omitting_v() + self.func_jvp_i1o1() + self.func_jvp_i2o1() + self.func_jvp_i2o2() + self.func_jvp_i2o2_omitting_v() @utils.place(config.DEVICES) @@ -328,12 +340,12 @@ def setUp(self): self._atol = config.TOLERANCE.get(str(self._dtype)).get( "first_order_grad").get("atol") - self.xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + def func_jacobian(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, self.xs, False) - self._expected = self._expected() + self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._expected = self._get_expected() - def func_jacobian(self): Index = collections.namedtuple('Index', ('type', 'value')) indexes = (Index('all', (slice(0, None, None), slice(0, None, None))), Index('row', (0, slice(0, None, None))), @@ -349,14 +361,17 @@ def func_jacobian(self): err_msg=f'Testcase {index.type} index not passed, value is {index.value}' ) - def _expected(self): - jac = utils._compute_numerical_jacobian(self.func, self.xs, self._eps, + def _get_expected(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + self.xs, typing.Sequence) else paddle.to_tensor(self.xs) + jac = utils._compute_numerical_jacobian(self.func, xs, self._eps, self._dtype) 
return utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NM) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jacobian() + self.func_jacobian() @utils.place(config.DEVICES) @@ -375,12 +390,12 @@ def setUp(self): self._atol = config.TOLERANCE.get(str(self._dtype)).get( "first_order_grad").get("atol") - self.xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + def func_jacobian(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, self.xs, True) - self._expected = self._expected() + self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._expected = self._get_expected() - def func_jacobian(self): Index = collections.namedtuple('Index', ('type', 'value')) indexes = ( Index('all', (slice(0, None, None), slice(0, None, None), @@ -402,16 +417,19 @@ def func_jacobian(self): err_msg=f'Testcase {index.type} index not passed, value is {index.value}' ) - def _expected(self): - jac = utils._compute_numerical_batch_jacobian( - self.func, self.xs, self._eps, self._dtype, False) + def _get_expected(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + self.xs, typing.Sequence) else paddle.to_tensor(self.xs) + jac = utils._compute_numerical_batch_jacobian(self.func, xs, self._eps, + self._dtype, False) jac = utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NBM) return utils._np_transpose_matrix_format(jac, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jacobian() + self.func_jacobian() class TestHessianClassNoBatch(unittest.TestCase): @@ -492,12 +510,19 @@ def func(x): paddle.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_true() self.func_create_graph_true() self.func_out_not_single() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_true() + self.func_create_graph_true() + self.func_out_not_single() class TestHessianClassBatchFirst(unittest.TestCase): @@ -599,12 +624,19 @@ def func(x): paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused() self.func_stop_gradient() self.func_out_not_single() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused() + self.func_stop_gradient() + self.func_out_not_single() class TestHessian(unittest.TestCase): @@ -619,6 +651,7 @@ def setUpClass(self): "second_order_grad").get("rtol") self.atol = config.TOLERANCE.get(self.dtype).get( "second_order_grad").get("atol") + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) self.y = paddle.rand(shape=self.shape, dtype=self.dtype) @@ -694,9 +727,10 @@ def func(x): self.rtol, self.atol) try: paddle.grad(hessian, self.x) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x): @@ -713,13 +747,21 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - 
if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_false() self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestHessianFloat64(TestHessian): @@ -830,9 +872,10 @@ def func(x): self.rtol, self.atol) try: paddle.grad(hessian, self.x) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x): @@ -849,13 +892,21 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_false() self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestBatchHessianFloat64(TestBatchHessian): @@ -985,12 +1036,19 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_v_default() self.func_multi_input() self.func_single_input() self.func_allow_unused_true() self.func_create_graph_true() + self.setUpClass() + self.func_v_default() + self.func_multi_input() + self.func_single_input() + self.func_allow_unused_true() + self.func_create_graph_true() class TestJacobian(unittest.TestCase): @@ -1100,9 +1158,10 @@ def func(x, y): self.atol) try: paddle.grad(jacobian[0], [self.x, self.y]) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x, y): @@ -1123,7 +1182,8 @@ def func(x, y): assert double_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_multi_input_and_multi_output() self.func_multi_input_and_single_output() self.func_single_input_and_multi_output() @@ -1132,6 +1192,15 @@ def test_all_cases(self): self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_multi_input_and_multi_output() + self.func_multi_input_and_single_output() + self.func_single_input_and_multi_output() + self.func_single_input_and_single_output() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestJacobianFloat64(TestJacobian): @@ -1269,9 +1338,10 @@ def func(x, y): self.atol) try: paddle.grad(jacobian[0], [self.x, self.y]) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x, y): @@ -1292,7 +1362,8 @@ def func(x, y): assert 
double_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_batch_single_input_and_batch_single_output() self.func_batch_single_input_and_batch_multi_output() self.func_batch_multi_input_and_batch_single_output() @@ -1301,6 +1372,15 @@ def test_all_cases(self): self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_batch_single_input_and_batch_single_output() + self.func_batch_single_input_and_batch_multi_output() + self.func_batch_multi_input_and_batch_single_output() + self.func_batch_multi_input_and_batch_multi_output() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestJacobianBatchFloat64(TestJacobianBatch): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 4af4ac52209ef..9c2074bbe3cda 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1195,7 +1195,15 @@ def t(input, name=None): "Input(input) only support N-D (N<=2) tensor, but received " "length of Input(input) is %s. Perhaps you can use paddle." "tensor.transpose() instead." % len(input.shape)) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + if len(input.shape) == 1: + return input + # 2-D tensor + perm = [1, 0] + out = _C_ops.final_state_transpose(input, perm) + return out + + if _in_legacy_dygraph(): if len(input.shape) == 1: return input # 2-D tensor diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 97c9c7ddf1584..a7b29b9f5aefc 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1097,6 +1097,7 @@ kernel : func : multiply_double_grad optional : grad_x_grad, grad_y_grad + backward : multiply_triple_grad - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1109,6 +1110,17 @@ func : multiply_grad backward : multiply_double_grad +- backward_api : multiply_triple_grad + forward : multiply_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, int aixs = -1) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) + args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) + infer_meta : + func : GeneralQuinaryGradInferMeta + param : [x, y, fwd_grad_out, x, y] + kernel : + func : multiply_triple_grad + optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_grad_out_grad + - backward_api : mv_grad forward : mv (Tensor x, Tensor vec) -> Tensor(out) args : (Tensor x, Tensor vec, Tensor out_grad) @@ -1286,6 +1298,16 @@ func : relu_grad backward: relu_double_grad +- backward_api : reshape_double_grad + forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_out, Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : reshape_double_grad + - backward_api : reshape_grad forward : reshape_with_xshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) @@ -1299,6 +1321,7 @@ data_type: out_grad backend: out_grad layout: out_grad + backward : 
reshape_double_grad - backward_api : roi_align_grad forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out) @@ -1592,6 +1615,13 @@ func : subtract_grad no_need_buffer : x, y +- backward_api : sum_double_grad + forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int64_t[] dims={}, bool keep_dim=false) + output : Tensor(grad_out_grad) + invoke : sum(grad_x_grad, dims, grad_x_grad.dtype(), keep_dim) + backward : sum_triple_grad + - backward_api : sum_grad forward : sum (Tensor x, int64_t[] dims={}, DataType out_dtype=paddle::experimental::DataType::UNDEFINED, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, int64_t[] dims, bool keep_dim, bool reduce_all=false) @@ -1601,6 +1631,13 @@ param : [x] kernel : func : sum_grad + backward : sum_double_grad + +- backward_api : sum_triple_grad + forward : sum_double_grad (Tensor grad_grad_x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(grad_grad_out) + args : (Tensor grad_grad_x, Tensor grad_grad_out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) + output : Tensor(grad_grad_x_grad) + invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all) no_need_buffer : x - backward_api : swish_grad @@ -1695,6 +1732,12 @@ func : trace_grad no_need_buffer : x +- backward_api : transpose_double_grad + forward : transpose_grad (Tensor grad_out, int[] axis) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] axis) + output : Tensor(grad_out_grad) + invoke : transpose(grad_x_grad, axis) + - backward_api : transpose_grad forward : transpose (Tensor x, int[] axis) -> Tensor(out) args : (Tensor out_grad, int[] axis) @@ -1704,6 +1747,7 @@ param : [out_grad, axis] kernel : func : transpose_grad + backward : transpose_double_grad - backward_api : triangular_solve_grad forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out) From b075dee8beeff5b1db85908a87aa5358ea10e29f Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 14 Apr 2022 14:35:17 +0800 Subject: [PATCH 149/211] [PHI] Support some c++ api in paddle namespace (#41778) * support some c++ api in paddle namespace * change c++ api namespace in custom op --- paddle/phi/api/ext/tensor_compat.h | 108 ++++++++++++++++++ .../fluid/tests/custom_op/custom_linear_op.cc | 3 +- python/paddle/utils/code_gen/api.yaml | 2 +- 3 files changed, 110 insertions(+), 3 deletions(-) diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index e63390db06e82..1bcf3d78d1fcb 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -24,6 +24,114 @@ limitations under the License. 
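The *_double_grad and *_triple_grad entries registered in backward.yaml above (reshape, sum, multiply, transpose) are what allow paddle.grad to be applied to a result that is itself a gradient, which the updated Hessian/Jacobian tests exercise. A minimal dynamic-graph sketch of that pattern; the shapes and the reduced loss are illustrative assumptions, not taken from the tests:

import paddle

x = paddle.rand([3, 3])
x.stop_gradient = False
y = paddle.sum(paddle.multiply(x, x))

# First-order gradient, kept in the graph so it can be differentiated again.
(dx,) = paddle.grad(y, x, create_graph=True)
# Second-order gradient: walking back through the first-order graph is what
# needs kernels such as multiply_double_grad; one level deeper would reach the
# new multiply_triple_grad / sum_triple_grad entries.
(ddx,) = paddle.grad(paddle.sum(dx), x, create_graph=True)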
*/ namespace paddle { using Tensor = experimental::Tensor; // using several Tensor initialize functions in paddle namespace +using experimental::abs; +using experimental::acos; +using experimental::acosh; +using experimental::add; +using experimental::allclose; +using experimental::argsort; +using experimental::asin; +using experimental::asinh; +using experimental::atan; +using experimental::atan2; +using experimental::atanh; +using experimental::bernoulli; +using experimental::ceil; +using experimental::cholesky; +using experimental::cholesky_solve; +using experimental::clip; +using experimental::concat; +using experimental::conj; +using experimental::cos; +using experimental::cosh; +using experimental::cross; +using experimental::det; +using experimental::diag; +using experimental::diagonal; +using experimental::digamma; +using experimental::dist; +using experimental::divide; +using experimental::dot; +using experimental::elu; using experimental::empty; +using experimental::empty_like; +using experimental::equal_all; +using experimental::erf; +using experimental::erfinv; +using experimental::exp; +using experimental::expand; +using experimental::expm1; +using experimental::flatten; +using experimental::flip; +using experimental::floor; +using experimental::floor_divide; using experimental::full; +using experimental::gather; +using experimental::gather_nd; +using experimental::gelu; +using experimental::gumbel_softmax; +using experimental::imag; +using experimental::increment; +using experimental::index_sample; +using experimental::is_empty; +using experimental::isclose; +using experimental::isfinite; +using experimental::isinf; +using experimental::isnan; +using experimental::kron; +using experimental::kthvalue; +using experimental::label_smooth; +using experimental::lerp; +using experimental::lgamma; +using experimental::log; +using experimental::log10; +using experimental::log1p; +using experimental::log2; +using experimental::logit; +using experimental::masked_select; +using experimental::matmul; +using experimental::matrix_power; +using experimental::maximum; +using experimental::maxout; +using experimental::minimum; +using experimental::mode; +using experimental::multi_dot; +using experimental::multinomial; +using experimental::multiply; +using experimental::mv; +using experimental::nll_loss; +using experimental::one_hot; +using experimental::pixel_shuffle; +using experimental::poisson; +using experimental::qr; +using experimental::real; +using experimental::reciprocal; +using experimental::relu; +using experimental::reshape; +using experimental::roll; +using experimental::round; +using experimental::rsqrt; +using experimental::scatter; +using experimental::scatter_nd_add; +using experimental::selu; +using experimental::sign; +using experimental::silu; +using experimental::sin; +using experimental::sinh; +using experimental::split; +using experimental::sqrt; +using experimental::square; +using experimental::stack; +using experimental::strided_slice; +using experimental::subtract; +using experimental::tanh; +using experimental::thresholded_relu; +using experimental::tile; +using experimental::trace; +using experimental::triangular_solve; +using experimental::unbind; +using experimental::unique; +using experimental::unsqueeze; +using experimental::where; + } // namespace paddle diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc index 76158596cb815..a561c845aba2b 100644 --- 
a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc @@ -20,8 +20,7 @@ limitations under the License. */ std::vector PhiLinearForward(const paddle::Tensor& x, const paddle::Tensor& weight, const paddle::Tensor& bias) { - return { - paddle::experimental::add(paddle::experimental::matmul(x, weight), bias)}; + return {paddle::add(paddle::matmul(x, weight), bias)}; } std::vector> LinearInferShape( diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index f5245d59babd2..a142225e6578c 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -2167,7 +2167,7 @@ data_type : x - api : unsqueeze - args : (Tensor x, IntArray axes) + args : (Tensor x, IntArray axis) output : Tensor(xshape), Tensor(out) infer_meta : func : UnsqueezeInferMeta From bda4965a0f651d0c23396815c83b9f9cc9212d4f Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 14:45:37 +0800 Subject: [PATCH 150/211] fix bug of set cuda lib in demo_ci and infer_ut (#41677) --- paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 2 +- paddle/fluid/inference/tests/infer_ut/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index c02fcd0781321..547e265d2fdb5 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -85,7 +85,7 @@ if(WITH_GPU) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() set(CUDA_LIB "" CACHE STRING "CUDA_LIB") - if("${TENSORRT_ROOT}" STREQUAL "") + if("${CUDA_LIB}" STREQUAL "") if(DEFINED ENV{CUDA_PATH}) set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64") else() diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 5c17e2d62d37d..ad7ef0c04ce67 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -84,7 +84,7 @@ if(WITH_GPU) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() set(CUDA_LIB "" CACHE STRING "CUDA_LIB") - if("${TENSORRT_ROOT}" STREQUAL "") + if("${CUDA_LIB}" STREQUAL "") if(DEFINED ENV{CUDA_PATH}) set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64") else() From 8b07ce0e9486fa46cf57b168f2a2bf910fb5b91c Mon Sep 17 00:00:00 2001 From: helen88 Date: Thu, 14 Apr 2022 15:06:39 +0800 Subject: [PATCH 151/211] support multi layer and bidirection of lstm_grad, *test=kunlun (#41742) * support multi layer and bidirection of lstm_grad, *test=kunlun * support multi layer and bidirection of lstm_grad, *test=kunlun --- paddle/fluid/operators/rnn_op_xpu.cc | 247 ++++++++++++------ .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + .../tests/unittests/xpu/test_rnn_op_xpu.py | 11 +- 3 files changed, 180 insertions(+), 79 deletions(-) diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index a18d0ebfca946..220d91bf4faab 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -125,23 +125,13 @@ class RnnXPUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); last_h->mutable_data(ctx.GetPlace()); last_c->mutable_data(ctx.GetPlace()); + int gate_num = 4; + int hidden_data_idx = (num_layers - 1); + hidden_data_idx += (gate_num + 1) * num_layers; + const int& block_size = 
direction_num * seq_len * batch_size * hidden_size; + reserve_data->Resize({hidden_data_idx, block_size}); - reserve_data->Resize( - {num_layers * direction_num * seq_len * batch_size * hidden_size * 5}); reserve_data->mutable_data(ctx.GetPlace()); - Tensor internal_output_1_tensor, internal_output_2_tensor; - T* internal_output_1_ptr = nullptr; - T* internal_output_2_ptr = nullptr; - if (num_layers >= 2) { - internal_output_1_tensor.Resize(output->dims()); - internal_output_1_ptr = - internal_output_1_tensor.mutable_data(ctx.GetPlace()); - } - if (num_layers >= 3) { - internal_output_2_tensor.Resize(output->dims()); - internal_output_2_ptr = - internal_output_2_tensor.mutable_data(ctx.GetPlace()); - } // get ptr from tensor auto x = input->data(); auto init_h_ptr = init_h->data(); @@ -151,8 +141,9 @@ class RnnXPUKernel : public framework::OpKernel { auto last_c_ptr = last_c->data(); auto i_f_g_o_ptr = reserve_data->data(); auto c_ptr = - i_f_g_o_ptr + - num_layers * direction_num * seq_len * batch_size * hidden_size * 4; + i_f_g_o_ptr + num_layers * block_size * 4; // 4 for i_f_g_o offset + auto hidden_data_ptr = + c_ptr + num_layers * block_size * 1; // 1 for c offset std::vector seq_len_tensor(batch_size, seq_len); if (has_seq_length) { @@ -161,33 +152,26 @@ class RnnXPUKernel : public framework::OpKernel { int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; + const T* cur_input_ptr = nullptr; + int cur_xdim = -1; + T* cur_output_ptr = y; for (int i = 0; i < num_layers; i++) { - auto i_f_g_o = i_f_g_o_ptr + - i * direction_num * seq_len * batch_size * hidden_size * 4; - auto c = c_ptr + i * direction_num * seq_len * batch_size * hidden_size; + auto i_f_g_o = i_f_g_o_ptr + i * block_size * 4; + auto c = c_ptr + i * block_size; + + cur_output_ptr = y; + if (i < num_layers - 1 && num_layers > 1) { + cur_output_ptr = hidden_data_ptr + i * block_size; + } - const T* cur_input_ptr = nullptr; - int cur_xdim = -1; if (i == 0) { cur_input_ptr = x; cur_xdim = input_dim; - } else if (i % 2 != 0) { - cur_input_ptr = internal_output_1_ptr; - cur_xdim = is_bidirec ? 2 * hidden_size : hidden_size; } else { - cur_input_ptr = internal_output_2_ptr; + cur_input_ptr = hidden_data_ptr + (i - 1) * block_size; cur_xdim = is_bidirec ? 
2 * hidden_size : hidden_size; } - T* cur_output_ptr = nullptr; - if (i == num_layers - 1) { - cur_output_ptr = y; - } else if (i % 2 != 0) { - cur_output_ptr = internal_output_2_ptr; - } else { - cur_output_ptr = internal_output_1_ptr; - } - auto h_0 = init_h_ptr + direction_num * i * state_offset; auto c_0 = init_c_ptr + direction_num * i * state_offset; auto last_h = last_h_ptr + direction_num * i * state_offset; @@ -233,6 +217,8 @@ class RnnXPUKernel : public framework::OpKernel { template class RnnXPUGradKernel : public framework::OpKernel { + using XPUTyp = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { // get the tensor pointer for the input @@ -243,6 +229,7 @@ class RnnXPUGradKernel : public framework::OpKernel { auto* reserve_data = ctx.Input("Reserve"); const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); + const float& dropout_prob = ctx.Attr("dropout_prob"); const int& hidden_size = ctx.Attr("hidden_size"); const std::string& mode = ctx.Attr("mode"); @@ -257,16 +244,6 @@ class RnnXPUGradKernel : public framework::OpKernel { platform::errors::InvalidArgument( "XPU only support LSTM mode now, current mode is %s", mode)); - PADDLE_ENFORCE_EQ(is_bidirec, false, - platform::errors::InvalidArgument( - "XPU only support unidirectional LSTM now")); - - PADDLE_ENFORCE_EQ( - num_layers, 1, - platform::errors::InvalidArgument( - "XPU only support 1 layer LSTM now, current layer num is %s", - num_layers)); - auto init_h = pre_state[0]; auto init_c = pre_state[1]; @@ -289,11 +266,12 @@ class RnnXPUGradKernel : public framework::OpKernel { } // check shape - int seq_len = input->dims()[0]; - int batch_size = input->dims()[1]; - int input_dim = input->dims()[2]; + const int& seq_len = input->dims()[0]; + const int& batch_size = input->dims()[1]; + const int& input_dim = input->dims()[2]; + const int& direction_num = is_bidirec ? 
2 : 1; PADDLE_ENFORCE_EQ( - init_h->dims()[0], num_layers, + init_h->dims()[0], num_layers * direction_num, platform::errors::InvalidArgument("The num_layers of in RNN layer must" " be the same as first dim of init " "hidden, but received num_layers:%d," @@ -301,7 +279,7 @@ class RnnXPUGradKernel : public framework::OpKernel { num_layers, init_h->dims()[0])); PADDLE_ENFORCE_EQ( - init_c->dims()[0], num_layers, + init_c->dims()[0], num_layers * direction_num, platform::errors::InvalidArgument( "The num_layers of in RNN layer must" " be the same as first dim of cell state hidden, but received" @@ -323,52 +301,165 @@ class RnnXPUGradKernel : public framework::OpKernel { // allocate the memory and initization the input_grad input_grad->mutable_data(input->dims(), ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + Tensor a, b; + Tensor* dynamic_grad_pre_h = &a; + Tensor* dynamic_grad_pre_c = &b; if (init_h_grad) { - init_h_grad->mutable_data(init_h->dims(), ctx.GetPlace()); + init_h_grad->mutable_data(last_h_grad->dims(), ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + } else { + dynamic_grad_pre_h->Resize(last_h_grad->dims()); + dynamic_grad_pre_h->mutable_data(ctx.GetPlace()); + zero(dev_ctx, dynamic_grad_pre_h, static_cast(0.0)); + init_h_grad = dynamic_grad_pre_h; } if (init_c_grad) { - init_c_grad->mutable_data(init_c->dims(), ctx.GetPlace()); + init_c_grad->mutable_data(last_c_grad->dims(), ctx.GetPlace()); + } else { + dynamic_grad_pre_c->Resize(last_h_grad->dims()); + dynamic_grad_pre_c->mutable_data(ctx.GetPlace()); + init_c_grad = dynamic_grad_pre_c; + } + + Tensor temp_input_grad_1, temp_input_grad_2; + T* input_grad_1_ptr = nullptr; + T* input_grad_2_ptr = nullptr; + if (num_layers >= 2) { + temp_input_grad_1.Resize(output_grad->dims()); + input_grad_1_ptr = temp_input_grad_1.mutable_data(ctx.GetPlace()); + } + if (num_layers >= 3) { + temp_input_grad_2.Resize(output_grad->dims()); + input_grad_2_ptr = temp_input_grad_2.mutable_data(ctx.GetPlace()); } // get ptr from tensor auto x = input->data(); - auto h_0 = init_h->data(); - auto c_0 = init_c->data(); - auto w_x = parameter_lists[0][0]; - auto w_h = parameter_lists[0][1]; + auto init_h_ptr = init_h->data(); + auto init_c_ptr = init_c->data(); auto y = output->data(); auto y_grad = output_grad->data(); auto last_h_grad_ptr = last_h_grad->data(); auto last_c_grad_ptr = last_c_grad->data(); auto x_grad = input_grad->data(); - auto h_0_grad = init_h_grad ? init_h_grad->data() : nullptr; - auto c_0_grad = init_c_grad ? 
init_c_grad->data() : nullptr; - auto w_x_grad = parameter_lists_grad[0][0]; - auto w_h_grad = parameter_lists_grad[0][1]; - auto b_x_grad = parameter_lists_grad[0][2]; - auto b_h_grad = parameter_lists_grad[0][3]; - auto i_f_g_o = reserve_data->data(); - auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + auto init_h_grad_ptr = init_h_grad->data(); + auto init_c_grad_ptr = init_c_grad->data(); + const int& block_size = direction_num * seq_len * batch_size * hidden_size; + auto i_f_g_o_ptr = reserve_data->data(); + auto c_ptr = i_f_g_o_ptr + num_layers * block_size * 4; + auto hidden_data_ptr = c_ptr + num_layers * block_size * 1; + int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; std::vector seq_len_tensor(batch_size, seq_len); if (has_seq_length) { seq_len_tensor = operators::GetDataFromTensor(sequence_length); } - auto& dev_ctx = ctx.template device_context(); - int r = xpu::lstm_grad( - dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, - (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad, - (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr, - reinterpret_cast(x_grad), reinterpret_cast(h_0_grad), - reinterpret_cast(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad, - batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr, - nullptr, nullptr, nullptr, i_f_g_o, c); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("RnnXPUGrad(lstm) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + for (int i = num_layers - 1; i >= 0; --i) { + // the layer input output had saved, just use the data + auto w_x = parameter_lists[i][0]; + auto w_h = parameter_lists[i][1]; + auto bw_x = parameter_lists[i][4]; + auto bw_h = parameter_lists[i][5]; + + auto i_f_g_o = i_f_g_o_ptr + i * block_size * 4; + auto c = c_ptr + i * block_size; + + Tensor layer_input_t; + auto layer_input = x; + if (i > 0) { + layer_input_t.Resize(output->dims()); + layer_input = layer_input_t.mutable_data(ctx.GetPlace()); + float scale = static_cast(1.0f - dropout_prob); + auto hidden_data = hidden_data_ptr + (i - 1) * block_size; + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(hidden_data), + const_cast(layer_input), output->numel(), + false, scale, 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } else { + layer_input = x; + } + + auto layer_output = y; + if (i == num_layers - 1) { + layer_output = y; + } else { + layer_output = hidden_data_ptr + i * block_size; + } + + const T* cur_input_ptr = nullptr; + if (i == num_layers - 1) { + cur_input_ptr = y_grad; + } else if (i % 2 != 0) { + cur_input_ptr = input_grad_2_ptr; + } else { + cur_input_ptr = input_grad_1_ptr; + } + + T* cur_output_ptr = nullptr; + int cur_xdim = -1; + if (i == 0) { + cur_output_ptr = x_grad; + cur_xdim = input_dim; + } else if (i % 2 != 0) { + cur_output_ptr = input_grad_1_ptr; + cur_xdim = is_bidirec ? 2 * hidden_size : hidden_size; + } else { + cur_output_ptr = input_grad_2_ptr; + cur_xdim = is_bidirec ? 
2 * hidden_size : hidden_size; + } + + auto w_x_grad = parameter_lists_grad[i][0]; + auto w_h_grad = parameter_lists_grad[i][1]; + auto b_x_grad = parameter_lists_grad[i][2]; + auto b_h_grad = parameter_lists_grad[i][3]; + + auto h_0 = init_h_ptr + direction_num * i * state_offset; + auto c_0 = init_c_ptr + direction_num * i * state_offset; + + auto h_0_grad = init_h_grad_ptr + direction_num * i * state_offset; + auto c_0_grad = init_c_grad_ptr + direction_num * i * state_offset; + auto h_t_grad = last_h_grad_ptr + direction_num * i * state_offset; + auto c_t_grad = last_c_grad_ptr + direction_num * i * state_offset; + + if (is_bidirec) { + auto bw_x_grad = parameter_lists_grad[i][4]; + auto bw_h_grad = parameter_lists_grad[i][5]; + auto bb_x_grad = parameter_lists_grad[i][6]; + auto bb_h_grad = parameter_lists_grad[i][7]; + + int r = xpu::bilstm_grad( + dev_ctx.x_context(), (const T*)layer_input, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)bw_x, + (const T*)bw_h, (const T*)layer_output, (const T*)cur_input_ptr, + (const T*)h_t_grad, (const T*)c_t_grad, + reinterpret_cast(cur_output_ptr), + reinterpret_cast(h_0_grad), reinterpret_cast(c_0_grad), + w_x_grad, w_h_grad, b_x_grad, b_h_grad, bw_x_grad, bw_h_grad, + bb_x_grad, bb_h_grad, batch_size, cur_xdim, hidden_size, seq_len, + seq_len_tensor, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, i_f_g_o, c); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "bilstm_grad"); + } else { + int r = xpu::lstm_grad( + dev_ctx.x_context(), (const T*)layer_input, (const T*)h_0, + (const T*)c_0, (const T*)w_x, (const T*)w_h, (const T*)layer_output, + (const T*)cur_input_ptr, (const T*)h_t_grad, (const T*)c_t_grad, + reinterpret_cast(cur_output_ptr), + reinterpret_cast(h_0_grad), reinterpret_cast(c_0_grad), + w_x_grad, w_h_grad, b_x_grad, b_h_grad, batch_size, cur_xdim, + hidden_size, seq_len, seq_len_tensor, nullptr, nullptr, nullptr, + nullptr, i_f_g_o, c); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lstm_grad"); + } + } } }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 9915b4d8d34f8..750a389940c65 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -306,6 +306,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py index 20a3fc69fe8d2..84edbab1eac91 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py @@ -122,7 +122,7 @@ def init_dtype(self): def set_xpu(self): self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True + self.__class__.no_need_check_grad = False self.__class__.op_type = self.in_type def test_check_output(self): @@ -130,6 +130,15 @@ def test_check_output(self): self.place, atol=0.01, no_check_set=['Reserve', 'DropoutState']) + def test_grad(self): + if not self.is_test: + var_name_list = self.get_weight_names() + grad_check_list = ['Input', 'init_h', 'init_c'] + grad_check_list.extend(var_name_list) + 
self.check_grad_with_place(self.place, + set(grad_check_list), + ['Out', 'last_hidden', 'last_cell']) + def init_size(self): self.seq_length = 12 self.batch_size = 5 From 87757e248600266ad1085e34714113f987fcb999 Mon Sep 17 00:00:00 2001 From: Vigi Zhang Date: Thu, 14 Apr 2022 15:07:35 +0800 Subject: [PATCH 152/211] Fix links in security policy (#41796) --- SECURITY.md | 2 +- SECURITY_cn.md | 2 +- security/README.md | 2 +- security/README_cn.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 490c804e9de9d..79bf3353ad4f9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -55,4 +55,4 @@ If malicious input can trigger memory corruption or non-clean exit, such bug is -[security advisories](https://github.com/PaddlePaddle/Paddle/security/README.md) +[security advisories](https://github.com/PaddlePaddle/Paddle/blob/develop/security/README.md) diff --git a/SECURITY_cn.md b/SECURITY_cn.md index e9f503192c1eb..00b222912d277 100644 --- a/SECURITY_cn.md +++ b/SECURITY_cn.md @@ -41,4 +41,4 @@ 如果输入非预期的参数后,对飞桨代码造成了内存破坏,或者非干净退出,这类行为被认定为存在安全问题。 -### [安全公告](https://github.com/PaddlePaddle/Paddle/security/README_cn.md) +### [安全公告](https://github.com/PaddlePaddle/Paddle/blob/develop/security/README_cn.md) diff --git a/security/README.md b/security/README.md index ab3dab8c0cc70..506bfbb91184a 100644 --- a/security/README.md +++ b/security/README.md @@ -4,7 +4,7 @@ We regularly publish security advisories about using PaddlePaddle. -*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](https://github.com/PaddlePaddle/Paddle/SECURITY.md). +*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](https://github.com/PaddlePaddle/Paddle/blob/develop/SECURITY.md). | Advisory Number | Type | Versions affected | Reported by | Additional Information| diff --git a/security/README_cn.md b/security/README_cn.md index 2ae23046469d4..49f486b0f7878 100644 --- a/security/README_cn.md +++ b/security/README_cn.md @@ -4,7 +4,7 @@ -注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](https://github.com/PaddlePaddle/Paddle/SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 +注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](https://github.com/PaddlePaddle/Paddle/blob/develop/SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 | 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | From 7e7d230011f5d81dd45fa00942282b4326e35a26 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Thu, 14 Apr 2022 17:13:58 +0800 Subject: [PATCH 153/211] [XPUPS]modify xpu_kp.cmake with HETERPS&PSLIB (#41760) * modify xpu_kp.cmake with HETERPS&PSLIB * fix. test=develop * fix. test=develop * fix. test=develop * fix. 
test=develop --- cmake/xpu_kp.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index 9cddbe1496478..166f8786337b1 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -122,6 +122,12 @@ macro(compile_kernel COMPILE_ARGS) string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" ) separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}") + set(ABI_VERSION "") + if(WITH_HETERPS AND WITH_PSLIB) + set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=0") + else() + set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1") + endif() add_custom_command( OUTPUT kernel_build/${kernel_name}.bin.o @@ -130,7 +136,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND - ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu --xpu-device-only -c -v COMMAND @@ -153,7 +159,7 @@ macro(compile_kernel COMPILE_ARGS) COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND - ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu --xpu-host-only -c -v WORKING_DIRECTORY From 6dc881e976e9d5f7099ef4686235db3c08104eeb Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 17:40:29 +0800 Subject: [PATCH 154/211] fix bug of ps_py_proto cant find path for the folder not created (#41793) --- paddle/fluid/distributed/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0091c14bfd177..107a19cb7decc 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -4,6 +4,7 @@ if(WITH_PYTHON) py_proto_compile(ps_py_proto SRCS the_one_ps.proto) add_custom_target(ps_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto) + add_dependencies(ps_py_proto ps_py_proto_init) if (NOT WIN32) add_custom_command(TARGET ps_py_proto POST_BUILD COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) From 92d8d0bc757d520bd1f9f5876b508a8e2154df6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Thu, 14 Apr 2022 12:41:50 +0200 Subject: [PATCH 155/211] FC+elementwise_add (residual connection) (#41776) * Change tensor name to match activation * declare fc_eltwise_add pass * merge conv_eltwise refactor PR * first compilable draft * unittest feedback tools * Fuse pass tester * Move IsReachable() to shared file * 100% coverage of fuse_pass_tester.cc * register pass * Add bias node * Improve unit tests / remove bias node from pattern * improve fc_eltwiseadd_unittest * cancel eltwise_add fuse if act is already fused * Add elementwise_input 
scale * Residual MVP * Add new FC attrs * Add more test cases * Add missing op attrs * Adapt code to new Elementwise pattern * reuse existing fcpattern * improve code style * remove unused arguments * fix typo * remove whitespace * remove int8 related code * Remove attributes from base ops * style * style check * Remove input from base op * Set attribute during fuse * ut timeout * download and test model * DRY * apply feedback from review * Style check * fix typo * cosmetic changes * explicitly set residual as output * VIT-OCR accuracy check * trigger CI * remove whitespaces * fix missing data file --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../fc_elementwise_add_mkldnn_fuse_pass.cc | 144 +++++++++++++ .../fc_elementwise_add_mkldnn_fuse_pass.h | 48 +++++ ...elementwise_add_mkldnn_fuse_pass_tester.cc | 202 ++++++++++++++++++ .../inference/api/paddle_pass_builder.cc | 8 + .../fluid/inference/api/paddle_pass_builder.h | 4 + .../fluid/inference/tests/api/CMakeLists.txt | 13 ++ .../tests/api/analyzer_bert_tester.cc | 1 + .../tests/api/analyzer_vit_ocr_tester.cc | 117 ++++++++++ paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 41 +++- .../unittests/ir/inference/CMakeLists.txt | 1 + ...est_mkldnn_fc_elementwise_add_fuse_pass.py | 101 +++++++++ 12 files changed, 671 insertions(+), 11 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e8696a3c2276b..207ee713bf409 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -122,6 +122,7 @@ if(WITH_MKLDNN) pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) pass_library(cpu_bfloat16_pass inference DIR mkldnn) @@ -208,6 +209,7 @@ if (WITH_MKLDNN) cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col 
batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function) diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..2e62597f2ee29 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() { + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0, 1}) + .End(); +} + +GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC( + const std::string& name_scope, const GraphWithStats& graph_with_stats, + bool fc_as_x) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::FCMKLDNN fc_pattern{pattern, name_scope}; + bool fc_has_bias = true; + auto fc_output = fc_pattern( + gpd.mutable_pattern()->NewNode("fc")->AsInput()->assert_is_op_input( + "fc", "Input"), + fc_has_bias); + + patterns::ResidualElementwise elementwise_pattern{pattern, name_scope, + fc_as_x}; + elementwise_pattern( + fc_output, pattern->NewNode(elementwise_pattern.residual_data_repr()), + "elementwise_add", fc_as_x); + fc_output->AsIntermediate(); + + int found_fc_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(residual_data, residual_data, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); + + if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) return; + if (!IsReachable(g, residual_data, fc_output)) return; + if (HasFusedActivation(fc_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "op compat for fc_elementwise_add_mkldnn_fuse_pass failed."; + return; + } + + 
fc_op->Op()->SetOutput("ResidualData", {residual_data->Name()}); + fc_op->Op()->SetOutput("Out", {elementwise_out->Name()}); + fc_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {fc_output, elementwise_op}); + + IR_NODE_LINK_TO(residual_data, fc_op); + IR_NODE_LINK_TO(fc_op, elementwise_out); + + found_fc_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + std::string fusionMode = fc_as_x ? "x" : "y"; + msg_ss << "--- Fused " << found_fc_count << " fc (as " << fusionMode + << ") + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_fc_count + graph_with_stats.second); +} + +void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true); + graph_with_stats = FuseFC(name_scope_, graph_with_stats, false); + + AddStatis(graph_with_stats.second); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass, + paddle::framework::ir::FCResidualConnectionMKLDNNFusePass); +REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("fc", 0) + .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..f92ce5bfc7044 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
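Numerically the rewrite above is an identity: the elementwise_add is folded into the fc op, whose oneDNN primitive then accumulates the residual tensor into an output buffer it shares with ResidualData (see the sum post-op and ShareDataWith in fc_mkldnn_op.cc later in this patch). A plain NumPy sketch of the equivalence, with fc modelled as x·W + b purely for illustration:

import numpy as np

x = np.random.rand(2, 4).astype('float32')
W = np.random.rand(4, 8).astype('float32')
b = np.random.rand(8).astype('float32')
residual = np.random.rand(2, 8).astype('float32')

unfused = (x @ W + b) + residual      # fc followed by a separate elementwise_add
fused = residual.copy()               # output buffer starts as the residual data
fused += x @ W + b                    # sum post-op accumulates the fc result in place
assert np.allclose(unfused, fused)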
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +using GraphWithStats = std::pair; + +class FCResidualConnectionMKLDNNFusePass : public FusePassBase { + private: + GraphWithStats FuseFC(const std::string& name_scope, + const GraphWithStats& graph_with_stats, + bool fc_as_x) const; + + public: + FCResidualConnectionMKLDNNFusePass(); + virtual ~FCResidualConnectionMKLDNNFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const; + + static bool HasFusedActivation(Node* fc_node) { + return !( + fc_node->Op()->GetAttrIfExists("activation_type").empty()); + } + + const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"}; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000..d2d27be3fce5b --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
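+// Note on the tests in this file: each one builds a small program containing an
+// fc op and an elementwise_add op, runs fc_elementwise_add_mkldnn_fuse_pass, and
+// then checks that the graph shrank by the expected number of nodes (or stayed
+// unchanged in the no-fusion case) and how many fc / elementwise_add ops remain.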
+ +#include + +#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/pass_test_util.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Nodes elementwise_add and FC_output are deleted +// FC node is removed and new version with fuse-pass is added +// In general, the graph is 2 vertices smaller (per fuse-pass) +constexpr int nodes_removed = 3; +constexpr int nodes_added = 1; + +OpDesc* Create_Op_FC(ProgramDesc* prog, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType("fc"); + op->SetAttr("use_mkldnn", true); + op->SetAttr("in_num_col_dims", 1); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + for (const auto& output : outputs) { + op->SetOutput(output.first, {output.second}); + } + + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return op; +} + +OpDesc* Create_Op_elementwise_add( + ProgramDesc* prog, const std::vector& inputs, + const std::vector& outputs, + bool use_mkldnn = true) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType("elementwise_add"); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("axis", -1); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + for (const auto& output : outputs) { + op->SetOutput(output.first, {output.second}); + } + + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return op; +} + +TEST(FCElementwiseAddMKLDNNFusePass, FCBiasAsY) { + auto prog = + test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); + + test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); + Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}}, + {{"Out", "c"}}); + Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "c"}}, {{"Out", "d"}}); + test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); + + Graph graph(prog); + + EXPECT_TRUE(test::RunPassAndAssert(&graph, + "fc_elementwise_add_mkldnn_fuse_pass", "a", + "e", nodes_removed, nodes_added)); + EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 1}, {"elementwise_add", 0}})); +} + +TEST(FCElementwiseAddMKLDNNFusePass, FCBiasAsX) { + auto prog = + test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); + + test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); + Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}}, + {{"Out", "c"}}); + + Create_Op_elementwise_add(&prog, {{"X", "c"}, {"Y", "a"}}, {{"Out", "d"}}); + test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); + + Graph graph(prog); + + EXPECT_TRUE(test::RunPassAndAssert(&graph, + "fc_elementwise_add_mkldnn_fuse_pass", "a", + "e", nodes_removed, nodes_added)); + EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 1}, {"elementwise_add", 0}})); +} + +TEST(FCElementwiseAddMKLDNNFusePass, NoFusion_NotResidualConnection) { + auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, + {"bias", "weights", "bias2", "weights2"}); + + test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); + Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}}, + {{"Out", "c"}}); + + Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias2"}, {"W", "weights2"}}, + {{"Out", "e"}}); + + Create_Op_elementwise_add(&prog, {{"X", "c"}, 
{"Y", "e"}}, {{"Out", "f"}}); + test::CreateOp(&prog, "relu", {{"X", "f"}}, {{"Out", "g"}}); + + Graph graph(prog); + + EXPECT_TRUE(test::RunPassAndAssert( + &graph, "fc_elementwise_add_mkldnn_fuse_pass", "a", "g", 0, 0)); + EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 2}, {"elementwise_add", 1}})); +} + +TEST(FCElementwiseAddMKLDNNFusePass, FC_Residual_VITOCR) { + auto prog = test::BuildProgramDesc( + {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, + {"ln_bias", "ln_scale", "bias", "weights", "bias2", "weights2"}); + + Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "b"}}, {{"Out", "c"}}); + + test::CreateOp(&prog, "layer_norm", + {{"X", "c"}, {"Bias", "ln_bias"}, {"Scale", "ln_scale"}}, + {{"Y", "d"}}); + Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias"}, {"W", "weights"}}, + {{"Out", "e"}}); + test::CreateOp(&prog, "gelu", {{"X", "e"}}, {{"Out", "f"}}); + Create_Op_FC(&prog, {{"Input", "f"}, {"Bias", "bias2"}, {"W", "weights2"}}, + {{"Out", "g"}}); + Create_Op_elementwise_add(&prog, {{"X", "g"}, {"Y", "c"}}, {{"Out", "h"}}); + test::CreateOp(&prog, "relu", {{"X", "h"}}, {{"Out", "i"}}); + + Graph graph(prog); + + EXPECT_TRUE(test::RunPassAndAssert(&graph, + "fc_elementwise_add_mkldnn_fuse_pass", "a", + "i", nodes_removed, nodes_added)); + EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 2}, {"elementwise_add", 1}})); +} + +TEST(FCElementwiseAddMKLDNNFusePass, FC_Residual_Sequence) { + auto prog = test::BuildProgramDesc( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"}, + {"ln_bias", "ln_scale", "bias", "weights", "bias2", "weights2", + "ln_bias2", "ln_scale2", "bias3", "weights3", "bias4", "weights4"}); + + Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "b"}}, {{"Out", "c"}}); + + test::CreateOp(&prog, "layer_norm", + {{"X", "c"}, {"Bias", "ln_bias"}, {"Scale", "ln_scale"}}, + {{"Y", "d"}}); + Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias"}, {"W", "weights"}}, + {{"Out", "e"}}); + test::CreateOp(&prog, "gelu", {{"X", "e"}}, {{"Out", "f"}}); + Create_Op_FC(&prog, {{"Input", "f"}, {"Bias", "bias2"}, {"W", "weights2"}}, + {{"Out", "g"}}); + Create_Op_elementwise_add(&prog, {{"X", "g"}, {"Y", "c"}}, {{"Out", "h"}}); + test::CreateOp(&prog, "layer_norm", + {{"X", "h"}, {"Bias", "ln_bias2"}, {"Scale", "ln_scale2"}}, + {{"Y", "i"}}); + Create_Op_FC(&prog, {{"Input", "i"}, {"Bias", "bias3"}, {"W", "weights3"}}, + {{"Out", "j"}}); + test::CreateOp(&prog, "gelu", {{"X", "j"}}, {{"Out", "k"}}); + Create_Op_FC(&prog, {{"Input", "k"}, {"Bias", "bias4"}, {"W", "weights4"}}, + {{"Out", "l"}}); + Create_Op_elementwise_add(&prog, {{"X", "h"}, {"Y", "l"}}, {{"Out", "m"}}); + + Graph graph(prog); + + EXPECT_TRUE(test::RunPassAndAssert(&graph, + "fc_elementwise_add_mkldnn_fuse_pass", "a", + "m", nodes_removed * 2, nodes_added * 2)); + EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 4}, {"elementwise_add", 1}})); +} + +TEST(FCElementwiseAddMKLDNNFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("fc_elementwise_add_mkldnn_fuse_pass")); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_elementwise_add_mkldnn_fuse_pass); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index ce733c53059b7..01988d5f539dc 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -20,6 +20,7 @@ #include #endif #include +#include 
#include namespace paddle { @@ -60,6 +61,12 @@ void PaddlePassBuilder::DeletePass(const std::string &pass_type) { } } +size_t PaddlePassBuilder::GetPassIndex(const std::string &pass_type) { + auto iter = std::find(std::begin(passes_), std::end(passes_), pass_type); + if (iter == std::end(passes_)) return -1; + return std::distance(std::begin(passes_), iter); +} + void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) { passes_.insert(std::begin(passes_) + idx, pass_type); } @@ -300,6 +307,7 @@ void CpuPassStrategy::EnableMKLDNN() { // Disabled due to topology-dependent speed-up // "fc_mkldnn_pass", // "fc_act_mkldnn_fuse_pass", + "fc_elementwise_add_mkldnn_fuse_pass", // "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 231ee2cb1e8e6..db6bde62ddc7c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -71,6 +71,10 @@ class PD_INFER_DECL PaddlePassBuilder { /// \param[in] idx the position to delete. void DeletePass(size_t idx); + /// \brief Get the certain position of a pass. + /// \param[in] pass_type the type of insert pass. + size_t GetPassIndex(const std::string &pass_type); + /// \brief Delete all passes that has a certain type 'pass_type'. /// \param[in] pass_type the certain pass type to be deleted. void DeletePass(const std::string &pass_type); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 06d1cd0814eb2..e9b8c0ce70f66 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -345,6 +345,19 @@ inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transfor ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}) +# VIT-OCR +set(VIT_OCR_URL "https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/ocr") +set(VIT_OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/vit_ocr") +if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz) + inference_download_and_uncompress_without_verify(${VIT_OCR_INSTALL_DIR} ${VIT_OCR_URL} vit_ocr.tgz) +endif() +if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/datavit.txt) + file(DOWNLOAD ${VIT_OCR_URL}/datavit.txt ${VIT_OCR_INSTALL_DIR}/datavit.txt) +endif() +inference_analysis_test(test_analyzer_vit_ocr SRCS analyzer_vit_ocr_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr --infer_data=${VIT_OCR_INSTALL_DIR}/datavit.txt) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 8f7e51009223a..224bbaa7aab22 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -158,6 +158,7 @@ void profile(bool use_mkldnn = false) { config.EnableMKLDNN(); config.pass_builder()->AppendPass("fc_mkldnn_pass"); config.pass_builder()->AppendPass("fc_act_mkldnn_fuse_pass"); + config.pass_builder()->AppendPass("fc_elementwise_add_mkldnn_fuse_pass"); } std::vector> outputs; diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc 
b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc new file mode 100644 index 0000000000000..029f2f0421d15 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <fstream> +#include <iostream> +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector<float> data; + std::vector<int32_t> shape; +}; + +Record ProcessALine(const std::string &line) { + std::vector<std::string> columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be <data>\t<shape>"; + + Record record; + std::vector<std::string> data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector<std::string> shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + + return record; } + +void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector<PaddleTensor> input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); + + if (use_mkldnn) { + cfg->EnableMKLDNN(); + cfg->SwitchIrOptim(); + + size_t insertingIndex = cfg->pass_builder()->GetPassIndex( + "fc_elementwise_add_mkldnn_fuse_pass"); + cfg->pass_builder()->InsertPass(insertingIndex, "fc_act_mkldnn_fuse_pass"); + cfg->pass_builder()->InsertPass(insertingIndex, "fc_mkldnn_pass"); + } +} + +// Compare results of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg, use_mkldnn); + + std::vector<std::vector<PaddleTensor>> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); +} + +TEST(Analyzer_vit_ocr, compare) { compare(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vit_ocr, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +#ifdef PADDLE_WITH_MKLDNN +// Check the fuse status +TEST(Analyzer_vit_ocr, fuse_status) { + AnalysisConfig cfg; + SetConfig(&cfg, true); + int num_ops; + auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); + auto fuse_status = GetFuseStatis( + static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); + + CHECK_EQ(fuse_status.at("fc_mkldnn_pass"), 33); + CHECK_EQ(fuse_status.at("conv_activation_mkldnn_fuse"), 2); + CHECK_EQ(fuse_status.at("fc_elementwise_add_mkldnn_fuse"), 16); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git 
a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 30db4b3be662b..4078d012fce90 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -410,19 +410,17 @@ class FCPrimitiveFactory { const ExecutionContext& ctx) { auto scale_in_data = ctx.Attr<float>("Scale_in"); auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights"); + bool has_activation = !ctx.Attr<std::string>("activation_type").empty(); + bool force_fp32_output = ctx.Attr<bool>("force_fp32_output"); // If the output will be in floats, we don't multiply by scale_out. - float activation_scale = 1.0f; - float inner_scale = 1.0f; - if (!ctx.Attr<bool>("force_fp32_output")) { - // if has activation use it's scale, otherwise use inner scale. - if (!ctx.Attr<std::string>("activation_type").empty()) { - activation_scale = ctx.Attr<float>("Scale_out"); - } else { - inner_scale = ctx.Attr<float>("Scale_out"); - } - } + float scale = (!force_fp32_output && has_activation) + ? ctx.Attr<float>("Scale_out") + : 1.0f; + float inner_scale = (force_fp32_output || has_activation) + ? 1.0f + : ctx.Attr<float>("Scale_out"); const size_t weight_scales_num = scale_weights_data.size(); std::vector<float> output_shift_scale(weight_scales_num); @@ -435,7 +433,7 @@ class FCPrimitiveFactory { inner_scale / (scale_in_data * scale_weights_data[i]); } - return make_tuple(output_shift_scale, activation_scale); + return make_tuple(output_shift_scale, scale); } // Computing MKL-DNN's scaling mask which determines along which dimension @@ -467,6 +465,12 @@ class FCPrimitiveFactory { std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); int mask = CreateMask(1, output_shift_scale.size() > 1); attributes.set_output_scales(mask, output_shift_scale); + float sum_scale = 1.0f; + + if (ctx.HasAttr("fuse_residual_connection") && + ctx.Attr<bool>("fuse_residual_connection")) { + post_operations.append_sum(sum_scale); + } if (ctx.Attr<std::string>("activation_type") == "relu") { constexpr float negative_slope = 0.0f; @@ -531,6 +535,21 @@ class FCPrimitiveFactory { dnnl::memory CreateDstMemory( const dnnl::inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx, Tensor* output) { + if (ctx.HasAttr("fuse_residual_connection") && + ctx.Attr<bool>("fuse_residual_connection")) { + auto* residual_param = ctx.Output<Tensor>("ResidualData"); + + PADDLE_ENFORCE_EQ( + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); + + output->ShareDataWith(*residual_param); + } + auto dst_desc = fc_prim_desc.dst_desc(); auto buffer_size = dst_desc.get_size(); T_out* output_data = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 808821f06cbae..c23e2eaa154be 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -141,5 +141,6 @@ if (WITH_MKLDNN) set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) endif() endif() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py new file mode 100644 index 0000000000000..22b8960497beb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestFCElementwiseAddMkldnnFusePass(PassAutoScanTest): + def sample_program_config(self, draw): + axis = draw(st.sampled_from([-1, 0, 1])) + fc_as_x = draw(st.sampled_from([True, False])) + fc_in = draw(st.sampled_from([32, 64])) + fc_wei = draw(st.sampled_from([32, 64])) + + def generate_input(): + return np.random.random([fc_in, fc_wei]).astype(np.float32) + + def generate_fc_weight(): + return np.random.random([fc_wei, fc_wei]).astype(np.float32) + + def generate_fc_bias(): + return np.random.random([fc_wei]).astype(np.float32) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data"]}, + outputs={"Out": ["relu_out"]}, + attrs={}) + + fc_op = OpConfig( + type="fc", + inputs={ + "Input": ["relu_out"], + "W": ["fc_weight"], + "Bias": ["fc_bias"] + }, + outputs={"Out": ["fc_output"]}, + attrs={ + "use_mkldnn": True, + "padding_weights": False, + "activation_type": "", + "in_num_col_dims": 1, + }) + + if fc_as_x: + inputs = {"X": ["fc_output"], "Y": ["input_data"]} + else: + inputs = {"X": ["input_data"], "Y": ["fc_output"]} + + elt_add_op = OpConfig( + type="elementwise_add", + inputs=inputs, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + + model_net = [relu_op, fc_op, elt_add_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "fc_weight": TensorConfig(data_gen=partial(generate_fc_weight)), + "fc_bias": TensorConfig(data_gen=partial(generate_fc_bias)), + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "fc"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["fc_elementwise_add_mkldnn_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() From d0f3296b2febe618b4595202a5b2cdd52e9ff585 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Thu, 14 Apr 2022 12:45:43 +0200 Subject: [PATCH 156/211] Fix to #38693 (minimal UT) (#41026) * Add UT - Added missed data_layout - Added missing 
conversions - NDHWC added - NDHWC support in data_transform - another fix - condddate change - fix u- fix - fix - fix - fix - fix - fix to hack - compilation fix - fix to automatic merge * - reduced UT * - fix * - lint * - fix to lint --- .../fluid/framework/data_layout_transform.h | 8 ++ paddle/fluid/framework/data_transform.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 3 +- paddle/fluid/platform/mkldnn_helper.h | 16 +++- paddle/phi/common/layout.h | 4 +- .../unittests/ir/inference/CMakeLists.txt | 1 + .../ir/inference/test_mkldnn_conv3d_op.py | 92 +++++++++++++++++++ 7 files changed, 118 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 182ffe65c3c7e..e1189ab34cdd9 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -59,6 +59,10 @@ inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) { return MKLDNNMemoryFormat::nhwc; case DataLayout::kNCHW: return MKLDNNMemoryFormat::nchw; + case DataLayout::kNCDHW: + return MKLDNNMemoryFormat::ncdhw; + case DataLayout::kNDHWC: + return MKLDNNMemoryFormat::ndhwc; default: PADDLE_THROW(platform::errors::InvalidArgument( "Fail to convert layout %s to MKLDNN format.", @@ -72,6 +76,10 @@ inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) { return DataLayout::kNHWC; case MKLDNNMemoryFormat::nchw: return DataLayout::kNCHW; + case MKLDNNMemoryFormat::ncdhw: + return DataLayout::kNCDHW; + case MKLDNNMemoryFormat::ndhwc: + return DataLayout::kNDHWC; default: PADDLE_THROW(platform::errors::InvalidArgument( "Fail to convert MKLDNN format to paddle layout.")); diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 5ee93d4a20822..2c795c946235f 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -63,7 +63,7 @@ void TransformData(const OpKernelType &expected_kernel_type, out.ShareDataWith(input_tensor); // For NHWC data we need reshape of tensors as MKL-DNN // is expecting NHWC dims description order - if (lin == DataLayout::kNHWC) { + if (lin == DataLayout::kNHWC || lin == DataLayout::kNDHWC) { platform::MatchShapeToLayout(&out, lin, lout); // We register only NHWC assuming that model is consistent e.g. either // NHWC or NCHW diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 77ab6bd590e55..0f26a1076a68c 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -579,7 +579,8 @@ std::vector Tensor::shape() const { // be done. Similarly for dim==1 when you have just one possible // combination. if (tensor->dims().size() < 3) return phi::vectorize(tensor->dims()); - if (out_layout == paddle::framework::DataLayout::kNHWC) { + if (out_layout == paddle::framework::DataLayout::kNHWC || + out_layout == paddle::framework::DataLayout::kNDHWC) { auto dims = phi::vectorize(tensor->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); return dims; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index d2e48c1113860..17736a87409af 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #include #include + #include "dnnl.hpp" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" @@ -102,20 +103,22 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, switch (from) { case framework::DataLayout::kMKLDNN: - if (to == framework::DataLayout::kNHWC) { + if ((to == framework::DataLayout::kNHWC) || + (to == framework::DataLayout::kNDHWC)) { auto dims = phi::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC output_shape" + VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC/kNDHWC output_shape" << print_dims(dims); } break; case framework::DataLayout::kNHWC: + case framework::DataLayout::kNDHWC: if (to == framework::DataLayout::kMKLDNN) { auto dims = phi::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: kNHWC to: kMKLDNN output_shape" + VLOG(3) << "Rotating Shape from: kNHWC/kNDHWC to: kMKLDNN output_shape" << print_dims(dims); } break; @@ -279,7 +282,12 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { return dnnl::memory::format_tag::acdeb; } } else if (inner_nblks == 1) { - if (inner_blks[0] == 8 && inner_idxs[0] == 0) { + if (inner_blks[0] == 4 && inner_idxs[0] == 1) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::aBcde4b; + } + } else if (inner_blks[0] == 8 && inner_idxs[0] == 0) { if (strides[0] >= strides[2] && strides[2] >= strides[3] && strides[3] >= strides[4] && strides[4] >= strides[1]) { return dnnl::memory::format_tag::Acdeb8a; diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index a5e4871f3d56b..6b2f657699fb1 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -26,13 +26,13 @@ enum class DataLayout { ANY = UNDEFINED, NHWC, NCHW, + NCDHW, + NDHWC, MKLDNN, SPARSE_COO, SPARSE_CSR, PSTRING_UNION, NUM_DATA_LAYOUTS, - NDHWC, - NCDHW, // See Note [ Why we need ALL in basic kernel key member? 
] ALL_LAYOUT = UNDEFINED, // Note: Unify phi DataLayout and fluid::framework::DataLayout, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index c23e2eaa154be..661fbbc7759c6 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -128,6 +128,7 @@ if (WITH_MKLDNN) set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_mkldnn_mish_op PROPERTIES TIMEOUT 300) + set_tests_properties(test_mkldnn_conv3d_op PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300) set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 250) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py new file mode 100644 index 0000000000000..f6e668ed59097 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import MkldnnAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestMkldnnConv3dOp(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(*args, **kwargs): + if kwargs["data_format"] == "NCDHW": + return np.random.random( + [kwargs["batch_size"], 48, 64, 32, 64]).astype(np.float32) + else: + return np.random.random( + [kwargs["batch_size"], 64, 32, 64, 48]).astype(np.float32) + + def generate_weight(*args, **kwargs): + return np.random.random( + [16, int(48 / kwargs["groups"]), 3, 3, 3]).astype(np.float32) + + conv3d_op = OpConfig( + type="conv3d", + inputs={"Input": ["input_data"], + "Filter": ["conv_weight"]}, + outputs={"Output": ["conv_output"]}, + attrs={ + "data_format": kwargs["data_format"], + "dilations": kwargs["dilations"], + "padding_algorithm": kwargs["padding_algorithm"], + "groups": kwargs["groups"], + "paddings": kwargs["paddings"], + "strides": kwargs["strides"], + "is_test": True + }) + + program_config = ProgramConfig( + ops=[conv3d_op], + weights={ + "conv_weight": + TensorConfig(data_gen=partial(generate_weight, *args, **kwargs)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input, *args, **kwargs)) + }, + outputs=["conv_output"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + data_format=st.sampled_from(["NCDHW", "NDHWC"]), + dilations=st.sampled_from([[1, 2, 1]]), + padding_algorithm=st.sampled_from(["EXPLICIT"]), + groups=st.sampled_from([2]), + paddings=st.sampled_from([[0, 3, 2]]), + strides=st.sampled_from([[1, 2, 1]]), + batch_size=st.integers( + min_value=1, max_value=4), ) + def test(self, *args, **kwargs): + self.run_test(*args, **kwargs) + + +if __name__ == "__main__": + unittest.main() From cbe7466ff733e917b84e65a423ced310d56ac20e Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 14 Apr 2022 19:01:32 +0800 Subject: [PATCH 157/211] executor perf statistics (#41648) * executor perf statistics * fix ut * fix ut * fix ut * add ut * add ut --- .../framework/new_executor/CMakeLists.txt | 2 + .../new_executor/executor_statistics.cc | 627 ++++++++++++++++++ .../new_executor/executor_statistics.h | 27 + .../new_executor/standalone_executor.cc | 7 + .../new_executor/workqueue/CMakeLists.txt | 2 +- .../workqueue/nonblocking_threadpool.h | 9 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 9 +- .../unittests/interpreter/CMakeLists.txt | 2 +- .../interpreter/test_standalone_executor.py | 105 +++ 10 files changed, 782 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/executor_statistics.cc create mode 100644 paddle/fluid/framework/new_executor/executor_statistics.h diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index b7b09da5ce03a..6046000739976 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ 
b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -20,6 +20,8 @@ endif() cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore) +cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info) + # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc new file mode 100644 index 0000000000000..392d6c78f9c70 --- /dev/null +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -0,0 +1,627 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/flags.h" +#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/utils.h" + +DECLARE_bool(use_stream_safe_cuda_allocator); +PADDLE_DEFINE_EXPORTED_string(static_executor_perfstat_filepath, "", + "FLAGS_static_executor_perfstat_filepath " + "enables performance statistics for the static " + "graph executor."); + +namespace paddle { +namespace framework { + +class StatisticsEngine { + public: + int Apply(const platform::NodeTrees& trees); + + void Log(const std::string& full_filename); + + private: + // type + struct EventStat { + uint64_t total_time = 0; + size_t count = 0; + uint64_t normalization_time = 0; + }; + + struct Priority { + // use a smaller number to denote higher priority + int innerthread_priority = 0; + int interthread_priority = 0; + }; + + struct StdEvent { + size_t evt_idx; + uint64_t start_ns; + uint64_t end_ns; + + StdEvent(size_t idx, uint64_t start, uint64_t end) + : evt_idx(idx), start_ns(start), end_ns(end) {} + }; + + enum class ExecutorType { EXECUTOR, PARALLEL_EXECUTOR, INTERPRETER_CORE }; + + using Filter = std::function; + + int Init(const platform::NodeTrees& trees); + + int Stat(const platform::NodeTrees& trees); + + void InitStdEvents(); + + void InitInnerthreadPriorityForStdEvents(); + + void InitInterthreadPriorityForStdEvents(); + + int InitFiltersForExecutor(); + + int InitFiltersForParallelExecutor(); + + int InitFiltersForInterpreterCore(); + + int RegisterEventFilter(const std::string& std_event, Filter filter) { + auto iter = name2idx_.find(std_event); + if (iter == name2idx_.end()) { + LOG(WARNING) << "Unsupported std_event " << std_event; + return -1; + } + auto idx = iter->second; + if (filters_[idx]) { + LOG(WARNING) << "Duplicate registration for std_event(" << std_event + << ")"; + return -1; + } + 
filters_[idx] = std::move(filter); + return 0; + } + + void MergeEvents(std::function merger, + std::vector* in_out_evts); + + int MergeInnerthreadEvents(std::vector>* all_evts); + + int MergeInterthreadEvents(std::vector>* all_evts); + + int StatNormalizationTime(const std::vector>& all_evts); + + bool inited_ = false; + ExecutorType executor_type_; + std::vector names_; + std::vector filters_; + std::vector priorities_; + std::vector statistics_; + std::unordered_map name2idx_; +}; + +int StatisticsEngine::Apply(const platform::NodeTrees& tree) { + return Init(tree) || Stat(tree); +} + +int StatisticsEngine::Init(const platform::NodeTrees& trees) { + if (inited_) { + LOG(WARNING) << "Duplicate initialization for StatisticsEngine"; + return -1; + } + if (platform::GetCurrentThreadName() != "MainThread") { + LOG(WARNING) << "StatisticsEngin must run on the main thread"; + return -1; + } + inited_ = true; + InitStdEvents(); + InitInnerthreadPriorityForStdEvents(); + InitInterthreadPriorityForStdEvents(); + // determine executor type + uint64_t main_tid = platform::GetCurrentThreadId().sys_tid; + for (const auto& kv : trees.GetNodeTrees()) { + if (kv.first != main_tid) { + continue; + } + std::queue q; + q.push(kv.second); + while (!q.empty()) { + auto cur_node = q.front(); + q.pop(); + const auto& name = cur_node->Name(); + if (name.find("Executor::") == 0) { + VLOG(10) << "type: Executor"; + executor_type_ = ExecutorType::EXECUTOR; + return InitFiltersForExecutor(); + } else if (name.find("ParallelExecutor::") == 0) { + VLOG(10) << "type: ParallelExecutor"; + executor_type_ = ExecutorType::PARALLEL_EXECUTOR; + return InitFiltersForParallelExecutor(); + } else if (name.find("StandaloneExecutor::") == 0) { + VLOG(10) << "type: InterpreterCore"; + executor_type_ = ExecutorType::INTERPRETER_CORE; + return InitFiltersForInterpreterCore(); + } + for (const auto& child : cur_node->GetChildren()) { + q.push(child); + } + } + } + LOG(WARNING) << "Unsupported Executor"; + return -1; +} + +void StatisticsEngine::InitStdEvents() { + name2idx_["Total"] = names_.size(); + names_.push_back("Total"); + name2idx_["PythonEnd"] = names_.size(); + names_.push_back("PythonEnd"); + name2idx_["CplusplusEnd"] = names_.size(); + names_.push_back("CplusplusEnd"); + name2idx_["RunOp"] = names_.size(); + names_.push_back("RunOp"); + name2idx_["LuanchKernel"] = names_.size(); + names_.push_back("LuanchKernel"); + name2idx_["OpCompute"] = names_.size(); + names_.push_back("OpCompute"); + name2idx_["OpInfershape"] = names_.size(); + names_.push_back("OpInfershape"); + name2idx_["DataTransform"] = names_.size(); + names_.push_back("DataTransform"); + name2idx_["GarbageCollect"] = names_.size(); + names_.push_back("GarbageCollect"); + name2idx_["CalcNextOp"] = names_.size(); + names_.push_back("CalcNextOp"); + name2idx_["AllocateDeviceMem"] = names_.size(); + names_.push_back("AllocateDeviceMem"); + name2idx_["FreeDeviceMem"] = names_.size(); + names_.push_back("FreeDeviceMem"); + name2idx_["ThreadpoolAddTask"] = names_.size(); + names_.push_back("ThreadpoolAddTask"); + + size_t n = names_.size(); + filters_.resize(n); + priorities_.resize(n); + statistics_.resize(n); +} + +void StatisticsEngine::InitInnerthreadPriorityForStdEvents() { + int prio = 0; + priorities_[name2idx_["AllocateDeviceMem"]].innerthread_priority = ++prio; + priorities_[name2idx_["FreeDeviceMem"]].innerthread_priority = prio; + priorities_[name2idx_["ThreadpoolAddTask"]].innerthread_priority = prio; + + 
priorities_[name2idx_["CalcNextOp"]].innerthread_priority = ++prio; + priorities_[name2idx_["GarbageCollect"]].innerthread_priority = prio; + priorities_[name2idx_["OpCompute"]].innerthread_priority = prio; + priorities_[name2idx_["OpInfershape"]].innerthread_priority = prio; + priorities_[name2idx_["DataTransform"]].innerthread_priority = prio; + + priorities_[name2idx_["RunOp"]].innerthread_priority = ++prio; + + priorities_[name2idx_["CplusplusEnd"]].innerthread_priority = ++prio; + + priorities_[name2idx_["Total"]].innerthread_priority = ++prio; +} + +void StatisticsEngine::InitInterthreadPriorityForStdEvents() { + int prio = 0; + priorities_[name2idx_["LuanchKernel"]].interthread_priority = ++prio; + priorities_[name2idx_["AllocateDeviceMem"]].interthread_priority = ++prio; + priorities_[name2idx_["FreeDeviceMem"]].interthread_priority = ++prio; + priorities_[name2idx_["ThreadpoolAddTask"]].interthread_priority = ++prio; + + priorities_[name2idx_["CalcNextOp"]].interthread_priority = ++prio; + priorities_[name2idx_["GarbageCollect"]].interthread_priority = ++prio; + priorities_[name2idx_["OpInfershape"]].interthread_priority = ++prio; + priorities_[name2idx_["DataTransform"]].interthread_priority = ++prio; + + priorities_[name2idx_["RunOp"]].interthread_priority = ++prio; + priorities_[name2idx_["CplusplusEnd"]].interthread_priority = ++prio; + priorities_[name2idx_["PythonEnd"]].interthread_priority = prio; +} + +const char* alloc_device_mem = FLAGS_use_stream_safe_cuda_allocator + ? "StreamSafeCUDAAllocator::Allocate" + : "AutoGrowthBestFitAllocator::Allocate"; +const char* free_device_mem = FLAGS_use_stream_safe_cuda_allocator + ? "StreamSafeCUDAAllocator::Free" + : "AutoGrowthBestFitAllocator::Free"; + +int StatisticsEngine::InitFiltersForExecutor() { + return RegisterEventFilter("Total", + [](const platform::HostTraceEventNode& evt) { + return evt.Name().find("ProfileStep") == 0; + }) || + RegisterEventFilter("CplusplusEnd", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == + "Executor::RunPartialPreparedContext"; + }) || + RegisterEventFilter("RunOp", + [](const platform::HostTraceEventNode& evt) { + return evt.Type() == + platform::TracerEventType::Operator; + }) || + RegisterEventFilter( + "OpCompute", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "compute" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter( + "OpInfershape", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "infer_shape" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter("GarbageCollect", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "CheckGC"; + }) || + RegisterEventFilter("AllocateDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == alloc_device_mem; + }) || + RegisterEventFilter("FreeDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == free_device_mem; + }) || + RegisterEventFilter( + "DataTransform", [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "prepare_data" && + evt.Type() == platform::TracerEventType::OperatorInner; + }); +} + +int StatisticsEngine::InitFiltersForParallelExecutor() { + return RegisterEventFilter("Total", + [](const platform::HostTraceEventNode& evt) { + return evt.Name().find("ProfileStep") == 0; + }) || + RegisterEventFilter("CplusplusEnd", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == 
"ParallelExecutor::Run"; + }) || + RegisterEventFilter("RunOp", + [](const platform::HostTraceEventNode& evt) { + return evt.Type() == + platform::TracerEventType::Operator; + }) || + RegisterEventFilter( + "OpCompute", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "compute" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter( + "OpInfershape", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "infer_shape" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter("GarbageCollect", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "eager_deletion" || + evt.Name() == "CheckGC"; + }) || + RegisterEventFilter("AllocateDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == alloc_device_mem; + }) || + RegisterEventFilter("FreeDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == free_device_mem; + }) || + RegisterEventFilter( + "DataTransform", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "prepare_data" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter("ThreadpoolAddTask", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "WorkQueue::AddTask"; + }); +} + +int StatisticsEngine::InitFiltersForInterpreterCore() { + return RegisterEventFilter("Total", + [](const platform::HostTraceEventNode& evt) { + return evt.Name().find("ProfileStep") == 0; + }) || + RegisterEventFilter("CplusplusEnd", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "StandaloneExecutor::run"; + }) || + RegisterEventFilter("RunOp", + [](const platform::HostTraceEventNode& evt) { + return evt.Type() == + platform::TracerEventType::Operator; + }) || + RegisterEventFilter( + "OpCompute", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "compute" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter( + "OpInfershape", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "infer_shape" && + evt.Type() == platform::TracerEventType::OperatorInner; + }) || + RegisterEventFilter("GarbageCollect", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "CheckGC" || + evt.Name() == "RecordStreamForGC"; + }) || + RegisterEventFilter("AllocateDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == alloc_device_mem; + }) || + RegisterEventFilter("FreeDeviceMem", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == free_device_mem; + }) || + RegisterEventFilter("CalcNextOp", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "RunNextInstructions"; + }) || + RegisterEventFilter("ThreadpoolAddTask", + [](const platform::HostTraceEventNode& evt) { + return evt.Name() == "WorkQueue::AddTask"; + }); +} + +int StatisticsEngine::Stat(const platform::NodeTrees& trees) { + // Convert StdEvent + std::vector> all_evts; + for (const auto& tree : trees.GetNodeTrees()) { + std::vector thr_evts; + std::queue q; + q.push(tree.second); + std::unordered_set removed; + while (!q.empty()) { + auto cur_node = q.front(); + q.pop(); + for (const auto& child : cur_node->GetChildren()) { + // Remove duplicate operator records. + // See InterpreterCore::RunInstruction for details. 
+ if (child->Type() == platform::TracerEventType::Operator && + cur_node->Name() == "compute") { + removed.insert(child); + } + q.push(child); + } + if (removed.count(cur_node) > 0) { + VLOG(10) << "Remove duplicate operator record: " << cur_node->Name(); + continue; + } + for (size_t idx = 0; idx < filters_.size(); ++idx) { + if (!filters_[idx]) { + continue; + } + if (filters_[idx](*cur_node)) { + thr_evts.emplace_back(idx, cur_node->StartNs(), cur_node->EndNs()); + break; + } + } + } + if (thr_evts.size() == 0) { + continue; + } + std::sort(thr_evts.begin(), thr_evts.end(), + [](const StdEvent& e1, const StdEvent& e2) { + return e1.start_ns < e2.start_ns; + }); + all_evts.push_back(std::move(thr_evts)); + } + if (all_evts.size() == 0) { + LOG(WARNING) << "No profiler events"; + return -1; + } + + // statistic total_time/count + for (const auto& thr_evts : all_evts) { + for (const auto& evt : thr_evts) { + auto& evt_stat = statistics_[evt.evt_idx]; + evt_stat.total_time += evt.end_ns - evt.start_ns; + evt_stat.count += 1; + } + } + auto& python_end = statistics_[name2idx_["PythonEnd"]]; + const auto& totol = statistics_[name2idx_["Total"]]; + const auto& cplusplus_end = statistics_[name2idx_["CplusplusEnd"]]; + python_end.total_time = totol.total_time - cplusplus_end.total_time; + python_end.count = cplusplus_end.count + 1; + + auto& luanch_kernel = statistics_[name2idx_["LuanchKernel"]]; + const auto& op_compute = statistics_[name2idx_["OpCompute"]]; + const auto& allocate = statistics_[name2idx_["AllocateDeviceMem"]]; + luanch_kernel.total_time = op_compute.total_time - allocate.total_time; + luanch_kernel.count = op_compute.count; + + if (executor_type_ != ExecutorType::EXECUTOR && + statistics_[name2idx_["ThreadpoolAddTask"]].count == 0) { + LOG(WARNING) << "Check your env variable FLAGS_host_trace_level, make sure " + "FLAGS_host_trace_level >= 10."; + return -1; + } + + // statistic normalization_time + return MergeInnerthreadEvents(&all_evts) || + MergeInterthreadEvents(&all_evts) || StatNormalizationTime(all_evts); +} + +void StatisticsEngine::MergeEvents(std::function merger, + std::vector* in_out_evts) { + auto evts = *in_out_evts; + std::sort(evts.begin(), evts.end(), + [](const StdEvent& e1, const StdEvent& e2) { + return e1.start_ns < e2.start_ns; + }); + + std::list merged; + auto iter = merged.begin(); + for (size_t i = 0; i < evts.size();) { + if (iter == merged.end()) { + iter = merged.insert(iter, evts[i]); + ++i; + } else if (iter->end_ns <= evts[i].start_ns) { + ++iter; + } else if (iter->evt_idx == evts[i].evt_idx) { + iter->end_ns = std::max(iter->end_ns, evts[i].end_ns); + ++i; + } else { + auto merged_type = merger(iter->evt_idx, evts[i].evt_idx); + if (merged_type == iter->evt_idx) { + if (evts[i].end_ns > iter->end_ns) { + evts[i].start_ns = iter->end_ns; + ++iter; + } else { + ++i; + } + } else { + StdEvent back = *iter; + if (back.start_ns != evts[i].start_ns) { + merged.insert(iter, {back.evt_idx, back.start_ns, evts[i].start_ns}); + } + *iter = evts[i]; + if (back.end_ns > evts[i].end_ns) { + auto pos = iter; + merged.insert(++pos, {back.evt_idx, evts[i].end_ns, back.end_ns}); + } + ++i; + } + } + } + in_out_evts->assign(merged.begin(), merged.end()); +} + +int StatisticsEngine::MergeInnerthreadEvents( + std::vector>* all_evts) { + auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + return priorities[idx1].innerthread_priority <= + priorities[idx2].innerthread_priority + ? 
idx1 + : idx2; + }; + for (auto& thr_evts : *all_evts) { + MergeEvents(merger, &thr_evts); + for (auto& evt : thr_evts) { + if (names_[evt.evt_idx] == "Total") { + evt.evt_idx = name2idx_["PythonEnd"]; + } else if (names_[evt.evt_idx] == "OpCompute") { + evt.evt_idx = name2idx_["LuanchKernel"]; + } + } + } + return 0; +} + +int StatisticsEngine::MergeInterthreadEvents( + std::vector>* all_evts) { + auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + return priorities[idx1].interthread_priority <= + priorities[idx2].interthread_priority + ? idx1 + : idx2; + }; + // K-way merge, just simplest impl + std::vector base_list; + base_list.swap(all_evts->at(0)); + for (size_t i = 1; i < all_evts->size(); ++i) { + auto& cur_list = all_evts->at(i); + base_list.reserve(base_list.size() + cur_list.size()); + base_list.insert(base_list.end(), cur_list.begin(), cur_list.end()); + MergeEvents(merger, &base_list); + } + all_evts->resize(1); + (*all_evts)[0].swap(base_list); + return 0; +} + +int StatisticsEngine::StatNormalizationTime( + const std::vector>& all_evts) { + if (all_evts.size() != 1) { + LOG(WARNING) << "Invalid argument"; + return -1; + } + for (const auto& evt : all_evts[0]) { + statistics_[evt.evt_idx].normalization_time += evt.end_ns - evt.start_ns; + } + // verify + uint64_t total = statistics_[name2idx_["Total"]].total_time; + uint64_t normalization_sum = 0; + for (size_t idx = 0; idx < statistics_.size(); ++idx) { + normalization_sum += statistics_[idx].normalization_time; + } + if (total - normalization_sum != 0) { + LOG(WARNING) << "total: " << total + << "is greater than normalization_sum:" << normalization_sum; + return -1; + } + return 0; +} + +void StatisticsEngine::Log(const std::string& filepath) { + std::ofstream ofs; + ofs.open(filepath, std::ofstream::out | std::ofstream::trunc); + if (!ofs) { + LOG(WARNING) << "Unable to open file " << filepath << " for writing data."; + return; + } + ofs << "["; + for (size_t idx = 0; idx < statistics_.size(); ++idx) { + const auto& evt_stat = statistics_[idx]; + ofs << platform::string_format(std::string(R"JSON( + { + "statistical item" : "%s", + "total time(ns)" : %llu, + "total number of times" : %llu, + "normalization time(ns)" : %llu + },)JSON"), + names_[idx].c_str(), evt_stat.total_time, + evt_stat.count, evt_stat.normalization_time); + } + ofs.seekp(-1, std::ios_base::end); + ofs << "]"; + if (ofs) { + LOG(INFO) << "writing the executor performance statistics to " << filepath; + } + ofs.close(); +} + +void StaticGraphExecutorPerfStatistics( + std::shared_ptr profiling_data) { + if (FLAGS_static_executor_perfstat_filepath.size() == 0) { + VLOG(5) << "StaticGraphExecutorPerfStatistics is disabled"; + return; + } + StatisticsEngine engine; + if (engine.Apply(*profiling_data) == 0) { + engine.Log(FLAGS_static_executor_perfstat_filepath); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/executor_statistics.h b/paddle/fluid/framework/new_executor/executor_statistics.h new file mode 100644 index 0000000000000..530e9455968a8 --- /dev/null +++ b/paddle/fluid/framework/new_executor/executor_statistics.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/platform/profiler/event_node.h" + +namespace paddle { +namespace framework { + +void StaticGraphExecutorPerfStatistics( + std::shared_ptr profiling_data); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index a22502314759f..4d4f7c74cd37e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -59,6 +60,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector& feed_names, const std::vector& feed_tensors, const std::vector& fetch_names) { + platform::RecordEvent record_event("StandaloneExecutor::run", + platform::TracerEventType::UserDefined, 1); + auto core = GetInterpreterCore(feed_names, fetch_names, true); return core->Run(feed_names, feed_tensors); @@ -67,6 +71,9 @@ paddle::framework::FetchList StandaloneExecutor::Run( paddle::framework::FetchList StandaloneExecutor::Run( const std::vector& feed_names, const std::vector& fetch_names) { + platform::RecordEvent record_event("StandaloneExecutor::run", + platform::TracerEventType::UserDefined, 1); + auto core = GetInterpreterCore(feed_names, fetch_names, false); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt index f47a274aaa4e5..2690b29e01b9d 100644 --- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt @@ -1,3 +1,3 @@ cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog) -cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog) +cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog os_info) cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 44953fa192e27..a599bc41f678e 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -129,6 +129,7 @@ class ThreadPoolTempl { // this. We expect that such scenario is prevented by program, that is, // this is kept alive while any threads can potentially be in Schedule. if (!t.f) { + // Allow 'false positive' which makes a redundant notification. 
if (num_tasks > num_threads_ - blocked_) { VLOG(6) << "Add task, Notify"; ec_.Notify(false); @@ -379,9 +380,8 @@ class ThreadPoolTempl { return false; } - // Number of blocked threads is used as termination condition. - // If we are shutting down and all worker threads blocked without work, - // that's we are done. + // Number of blocked threads is used as notification condition. + // We must increase the counter before the emptiness check. blocked_++; // Now do a reliable emptiness check. @@ -393,6 +393,9 @@ class ThreadPoolTempl { return true; } + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // that's we are done. if (done_ && blocked_ == static_cast(num_threads_)) { ec_.CancelWait(); // Almost done, but need to re-check queues. diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f8e7081de01bd..42eb79d75f857 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -350,7 +350,7 @@ if(WITH_PYTHON) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() - list(APPEND PYBIND_DEPS interpretercore standalone_executor) + list(APPEND PYBIND_DEPS interpretercore standalone_executor staticgraph_executor_statistics) cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) list(APPEND PYBIND_DEPS op_function_common) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0427fcece0b8b..7b63fdd6dd4cb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -46,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" @@ -2903,9 +2904,6 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](StandaloneExecutor &self, std::vector feed_names, std::vector fetch_names) { - platform::RecordEvent record_event( - "StandaloneExecutor::run", - platform::TracerEventType::UserDefined, 1); paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; @@ -3380,7 +3378,10 @@ All parameter, weight, gradient are variables in Paddle. 
.def("stop", [](paddle::platform::Profiler *profiler) { platform::DisableHostEventRecorder(); - return profiler->Stop(); + auto result = profiler->Stop(); + framework::StaticGraphExecutorPerfStatistics( + result->GetNodeTrees()); + return result; }, py::return_value_policy::automatic_reference); diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index c1a2c36d8a344..09cc6ed5b5fce 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -2,7 +2,7 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") foreach(target ${TEST_INTERP_CASES}) - py_test_modules(${target} MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) + py_test_modules(${target} MODULES ${target} ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index c07d4cc15bee0..a4dad5f53f14b 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -15,10 +15,13 @@ import os os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true" import sys +import shutil import unittest import paddle +import json from paddle.fluid import core from paddle.fluid.core import StandaloneExecutor +from paddle.profiler import profiler import numpy as np @@ -116,6 +119,107 @@ def build_program(): return main_program, startup_program, [mean] +class ExecutorStatisticsTestCase(unittest.TestCase): + def setUp(self): + self.iter_n = 3 + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + + def test_standalone_executor_statistics(self): + if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: + return + + paddle.seed(2020) + main_program, startup_program, fetch_list = build_program() + fetch_list = [x.name for x in fetch_list] + + p = core.Place() + p.set_place(self.place) + executor = StandaloneExecutor(p, startup_program.desc, + main_program.desc, core.Scope()) + + helper_profiler = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) + helper_profiler.start() + for i in range(self.iter_n): + executor.run({}, fetch_list) + helper_profiler.step() + helper_profiler.stop() + + perfstat_filepath = 
os.environ[ + 'FLAGS_static_executor_perfstat_filepath'] + self.assertTrue(os.path.exists(perfstat_filepath)) + with open(perfstat_filepath, 'r') as load_f: + stat_res = json.load(load_f) + self.assertTrue(len(stat_res) > 0) + + os.remove(perfstat_filepath) + shutil.rmtree('./profiler_log') + + def test_parallel_executor_statistics(self): + if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: + return + + paddle.seed(2020) + main_program, startup_program, fetch_list = build_program() + fetch_list = [x.name for x in fetch_list] + + main_program = paddle.fluid.compiler.CompiledProgram(main_program) + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' + executor = paddle.static.Executor(self.place) + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + executor.run(startup_program) + + helper_profiler = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) + helper_profiler.start() + for i in range(self.iter_n): + executor.run(main_program, fetch_list=fetch_list) + helper_profiler.step() + helper_profiler.stop() + + perfstat_filepath = os.environ[ + 'FLAGS_static_executor_perfstat_filepath'] + self.assertTrue(os.path.exists(perfstat_filepath)) + with open(perfstat_filepath, 'r') as load_f: + stat_res = json.load(load_f) + self.assertTrue(len(stat_res) > 0) + + os.remove(perfstat_filepath) + shutil.rmtree('./profiler_log') + + def test_executor_statistics(self): + if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: + return + + paddle.seed(2020) + main_program, startup_program, fetch_list = build_program() + fetch_list = [x.name for x in fetch_list] + + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' + executor = paddle.static.Executor(self.place) + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + executor.run(startup_program) + + helper_profiler = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) + helper_profiler.start() + for i in range(self.iter_n): + executor.run(main_program, fetch_list=fetch_list) + helper_profiler.step() + helper_profiler.stop() + + perfstat_filepath = os.environ[ + 'FLAGS_static_executor_perfstat_filepath'] + self.assertTrue(os.path.exists(perfstat_filepath)) + with open(perfstat_filepath, 'r') as load_f: + stat_res = json.load(load_f) + self.assertTrue(len(stat_res) > 0) + + os.remove(perfstat_filepath) + shutil.rmtree('./profiler_log') + + class MultiStreamModelTestCase(unittest.TestCase): def setUp(self): self.iter_n = 2 @@ -155,6 +259,7 @@ def run_new_executor(self): p.set_place(self.place) inter_core = StandaloneExecutor(p, startup_program.desc, main_program.desc, core.Scope()) + outs = [] for i in range(self.iter_n): outs.append( From 64237c3f89086fa7ce8163c2f2283163365d76db Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Thu, 14 Apr 2022 19:32:38 +0800 Subject: [PATCH 158/211] Supplementary documents (#41700) --- .../fluid/dygraph/varbase_patch_methods.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index b2441e90fc9fb..03045579e7198 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -878,6 +878,28 @@ def pin_memory(self): @framework.dygraph_only def values(self): + """ + **Notes**: + **This API is ONLY available in Dygraph mode** + Get the values of current SparseTensor(COO or CSR). + + Returns: + Tensor: A DenseTensor + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + with _test_eager_guard(): + indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + values = [1, 2, 3, 4, 5] + dense_shape = [3, 4] + sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + print(sparse_x.values()) + #[1, 2, 3, 4, 5] + """ + if self.is_sparse_coo(): return _C_ops.final_state_sparse_coo_values(self) elif self.is_sparse_csr(): @@ -888,6 +910,30 @@ def values(self): @framework.dygraph_only def to_dense(self): + """ + **Notes**: + **This API is ONLY available in Dygraph mode** + Convert the current SparseTensor(COO or CSR) to DenseTensor. + + Returns: + Tensor: A DenseTensor + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + with _test_eager_guard(): + indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + values = [1, 2, 3, 4, 5] + dense_shape = [3, 4] + sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + dense_x = sparse_x.to_dense() + #[[0., 1., 0., 2.], + # [0., 0., 3., 0.], + # [4., 5., 0., 0.]] + """ + if self.is_sparse_coo(): return _C_ops.final_state_sparse_coo_to_dense(self) elif self.is_sparse_csr(): @@ -897,6 +943,28 @@ def to_dense(self): @framework.dygraph_only def to_sparse_coo(self, sparse_dim): + """ + **Notes**: + **This API is ONLY available in Dygraph mode** + Convert the current DenseTensor to SparseTensor in COO format. + + Returns: + Tensor: A SparseCooTensor + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + with _test_eager_guard(): + dense_x = [[0, 1, 0, 2], [0, 0, 3, 4]] + dense_x = paddle.to_tensor(dense_x, dtype='float32') + sparse_x = dense_x.to_sparse_coo(sparse_dim=2) + #indices=[[0, 0, 1, 1], + # [1, 3, 2, 3]], + #values=[1., 2., 3., 4.] + """ + if self.is_sparse_csr(): return _C_ops.final_state_sparse_to_sparse_coo(self, sparse_dim) elif self.is_sparse_coo(): From 3ce879dba700ef20415e95722de1c5845deab403 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 14 Apr 2022 19:57:40 +0800 Subject: [PATCH 159/211] Optimize the finding of max workspace size. 
(#41741) --- paddle/fluid/operators/conv_cudnn_helper.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3c29c60b21565..1311f812be118 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -276,11 +276,12 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), static_cast(algo), &workspace_size); - if (status == CUDNN_STATUS_SUCCESS) { + if (status == CUDNN_STATUS_SUCCESS && + workspace_size <= workspace_size_limit) { max_workspace_size = std::max(workspace_size, max_workspace_size); } } - return std::min(max_workspace_size, workspace_size_limit); + return max_workspace_size; } else { return workspace_size_limit; } @@ -425,11 +426,12 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), static_cast(algo), &workspace_size); - if (status == CUDNN_STATUS_SUCCESS) { + if (status == CUDNN_STATUS_SUCCESS && + workspace_size <= workspace_size_limit) { max_workspace_size = std::max(workspace_size, max_workspace_size); } } - return std::min(max_workspace_size, workspace_size_limit); + return max_workspace_size; } else { return workspace_size_limit; } @@ -588,11 +590,12 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), static_cast(algo), &workspace_size); - if (status == CUDNN_STATUS_SUCCESS) { + if (status == CUDNN_STATUS_SUCCESS && + workspace_size <= workspace_size_limit) { max_workspace_size = std::max(workspace_size, max_workspace_size); } } - return std::min(max_workspace_size, workspace_size_limit); + return max_workspace_size; } else { return workspace_size_limit; } From 54ccc308dba5d79ed6fa74a386773db4ef3e7604 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 14 Apr 2022 19:59:39 +0800 Subject: [PATCH 160/211] [Phi] Support construct Scalar by using Non-CPU Tensor (#41765) * support construct scalar using non-cpu tensor * fix bugs when run unittest * fix compile bugs * fix bugs when run ci * fix compile bugs * fix bugs when move copy * perfect unit test * perfect unittest * update according to comment * add target dependency * deal with conflict * fix bugs when run unit test * fix unit test bugs --- paddle/fluid/platform/CMakeLists.txt | 6 +- paddle/phi/CMakeLists.txt | 2 +- paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/lib/api_custom_impl.cc | 30 +--- paddle/phi/api/lib/scalar.cc | 49 ++++++ paddle/phi/api/lib/tensor_copy.cc | 57 +++++++ paddle/phi/api/lib/tensor_copy.h | 25 +++ paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/common/CMakeLists.txt | 2 +- paddle/phi/common/scalar.cc | 23 ++- paddle/phi/common/scalar.h | 90 +++++------ paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/selected_rows.cc | 26 +++ paddle/phi/core/selected_rows.h | 5 +- paddle/phi/core/utils/type_registry.h | 2 +- paddle/phi/tests/api/CMakeLists.txt | 4 +- paddle/phi/tests/api/test_fill_api.cc | 1 + paddle/phi/tests/api/test_scale_api.cc | 1 + paddle/phi/tests/common/CMakeLists.txt | 6 + paddle/phi/tests/common/test_scalar.cu | 205 ++++++++++++++++++++++++ paddle/phi/tests/core/CMakeLists.txt | 2 +- 21 files changed, 449 insertions(+), 95 deletions(-) create mode 100644 paddle/phi/api/lib/scalar.cc create mode 100644 paddle/phi/api/lib/tensor_copy.cc create mode 100644 paddle/phi/api/lib/tensor_copy.h create mode 100644 paddle/phi/core/selected_rows.cc create mode 100644 paddle/phi/tests/common/test_scalar.cu diff --git 
a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 46059100b3802..f29546c5210d9 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -192,13 +192,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 724b1ba556d4b..d43e327393f25 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 9cc5d620280bc..e10ae8254a79e 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -164,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_conte cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) @@ -173,3 +173,5 @@ cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p 
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) +cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) +cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 2b80094a39e31..033ec569de811 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -424,35 +425,8 @@ std::vector> conv2d_grad_impl( } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = - kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); - - VLOG(6) << "copy API kernel key: " << kernel_key; - VLOG(6) << "copy API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - - auto dense_x = TensorToDenseTensor(x); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); - phi::MetaTensor meta_out(kernel_out); - phi::UnchangedInferMeta(*dense_x, &meta_out); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - phi::Place, - bool, - phi::DenseTensor*); - - auto* kernel_fn = kernel.GetVariadicKernelFn(); - - (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); - + copy(x, place, blocking, &out); return out; } diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc new file mode 100644 index 0000000000000..c31338de09f1e --- /dev/null +++ b/paddle/phi/api/lib/scalar.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +template <> +ScalarBase::ScalarBase(const Tensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + tensor_in.numel())); + auto tensor_in_place = tensor_in.place().GetType(); + if (tensor_in_place == phi::AllocationType::GPU) { + Tensor dst_tensor; + copy(tensor_in, phi::CPUPlace(), true, &dst_tensor); + GetDataFromTensor(dst_tensor); + } else if (tensor_in_place == phi::AllocationType::CPU) { + GetDataFromTensor(tensor_in); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Now, it is not supported to construct Scalar using tensor that its " + "Place is (%s)", + tensor_in.place())); + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc new file mode 100644 index 0000000000000..57e3c28d8cb1f --- /dev/null +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace experimental { + +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); + + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto dense_x = TensorToDenseTensor(src); + + auto kernel_out = SetKernelOutput(kernel_key.backend(), dst); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h new file mode 100644 index 0000000000000..3ce45853319ec --- /dev/null +++ b/paddle/phi/api/lib/tensor_copy.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/api/include/tensor.h" + +namespace paddle { +namespace experimental { + +void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 94a16da2b7720..de97e7516f619 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 9bf692703860f..aa839eab587cb 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 5cd55c1e88bed..41f1c9541823d 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,21 +14,32 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace experimental { -// NOTE(xiongkun): why we put definition here? -// test_custom_op can't include enforce.h, because enforce.h includes gflags. -// so we decouple the include dependence of enforce.h by link. -void ThrowTensorConvertError(int num) { - PADDLE_ENFORCE_EQ(num, +// The Tensor must have one dim +template <> +ScalarBase::ScalarBase(const phi::DenseTensor& tensor_in) + : dtype_(tensor_in.dtype()) { // NOLINT + PADDLE_ENFORCE_EQ(tensor_in.numel(), 1, phi::errors::InvalidArgument( "The Scalar only supports Tensor with 1 element, but " "now Tensor has `%d` elements", - num)); + tensor_in.numel())); + auto cpu_place = phi::CPUPlace(); + if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) { + phi::DenseTensor tensor; + framework::TensorCopySync(tensor_in, cpu_place, &tensor); + GetDataFromTensor(tensor); + } else { + GetDataFromTensor(tensor_in); + } } } // namespace experimental diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 5134f4eb72639..c28f6185a556a 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { -void ThrowTensorConvertError(int); - template class ScalarBase { public: @@ -105,50 +103,7 @@ class ScalarBase { } // The Tensor must have one dim - ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT - is_from_tensor_ = true; - ThrowTensorConvertError(tensor.numel()); - switch (dtype_) { - case DataType::FLOAT32: - data_.f32 = tensor.template data()[0]; - break; - case DataType::FLOAT64: - data_.f64 = tensor.template data()[0]; - break; - case DataType::FLOAT16: - data_.f16 = tensor.template data()[0]; - break; - case DataType::BFLOAT16: - data_.bf16 = tensor.template data()[0]; - break; - case DataType::INT32: - data_.i32 = tensor.template data()[0]; - break; - case DataType::INT64: - data_.i64 = tensor.template data()[0]; - break; - case DataType::INT16: - data_.i16 = tensor.template data()[0]; - break; - case DataType::INT8: - data_.i8 = tensor.template data()[0]; - break; - case DataType::UINT8: - data_.ui8 = tensor.template data()[0]; - break; - case DataType::BOOL: - data_.b = tensor.template data()[0]; - break; - case DataType::COMPLEX64: - data_.c64 = tensor.template data()[0]; - break; - case DataType::COMPLEX128: - data_.c128 = tensor.template data()[0]; - break; - default: - PD_THROW("Invalid tensor data type `", dtype_, "`."); - } - } + ScalarBase(const T& tensor_in); // NOLINT template ScalarBase(const ScalarBase& other) { @@ -200,6 +155,49 @@ class ScalarBase { private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); + void GetDataFromTensor(const T& tensor) { + is_from_tensor_ = true; + switch (dtype_) { + case DataType::FLOAT32: + data_.f32 = tensor.template data()[0]; + break; + case DataType::FLOAT64: + data_.f64 = tensor.template data()[0]; + break; + case DataType::FLOAT16: + data_.f16 = tensor.template data()[0]; + break; + case DataType::BFLOAT16: + data_.bf16 = tensor.template data()[0]; + break; + case DataType::INT32: + data_.i32 = tensor.template data()[0]; + break; + case DataType::INT64: + data_.i64 = tensor.template data()[0]; + break; + case DataType::INT16: + data_.i16 = tensor.template data()[0]; + break; + case DataType::INT8: + data_.i8 = tensor.template data()[0]; + break; + case DataType::UINT8: + data_.ui8 = tensor.template data()[0]; + break; + case DataType::BOOL: + data_.b = tensor.template data()[0]; + break; + case DataType::COMPLEX64: + data_.c64 = tensor.template data()[0]; + break; + case DataType::COMPLEX128: + data_.c128 = tensor.template data()[0]; + break; + default: + PD_THROW("Invalid tensor data type `", dtype_, "`."); + } + } private: bool is_from_tensor_{false}; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index b42b4388c2ce1..23574e98fbf17 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc new 
file mode 100644 index 0000000000000..dcf9c4182157a --- /dev/null +++ b/paddle/phi/core/selected_rows.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +SelectedRows::SelectedRows(const std::vector& rows, + const int64_t& height) + : impl_(std::make_shared(rows, height)) {} + +SelectedRows::SelectedRows() + : impl_(std::make_shared()) {} + +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 7ee475b4d5d9e..a71c0471cc431 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,10 +42,9 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height) - : impl_(std::make_shared(rows, height)) {} + SelectedRows(const std::vector& rows, const int64_t& height); - SelectedRows() : impl_(std::make_shared()) {} + SelectedRows(); const DenseTensor& value() const { return impl_->value(); } diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index 8d9f9167242c8..f27c3db2275c3 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -51,7 +51,7 @@ TypeInfo TypeRegistry::RegisterType(const std::string& type) { std::lock_guard guard(mutex_); assert(name_to_id_.find(type) == name_to_id_.end()); assert(names_.size() < std::numeric_limits::max()); - int8_t id = names_.size(); + int8_t id = static_cast(names_.size()); names_.emplace_back(type); name_to_id_[type] = id; return TypeInfo(id); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 94378aceff58c..dd4b7e62ec52f 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS 
${COMMON_API_TEST_DEPS}) cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/test_fill_api.cc b/paddle/phi/tests/api/test_fill_api.cc index bf57574d39093..523fa895d147e 100644 --- a/paddle/phi/tests/api/test_fill_api.cc +++ b/paddle/phi/tests/api/test_fill_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace paddle { namespace tests { diff --git a/paddle/phi/tests/api/test_scale_api.cc b/paddle/phi/tests/api/test_scale_api.cc index a40ecc8485e4a..5f1e118946675 100644 --- a/paddle/phi/tests/api/test_scale_api.cc +++ b/paddle/phi/tests/api/test_scale_api.cc @@ -25,6 +25,7 @@ limitations under the License. */ PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(scale_sr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace paddle { namespace tests { diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index 710ea3c066472..ca6d20045d171 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,3 +2,9 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) +if (WITH_GPU) + nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() +if(WITH_ROCM) + hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) +endif() diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu new file mode 100644 index 0000000000000..6b0caa175dc04 --- /dev/null +++ b/paddle/phi/tests/common/test_scalar.cu @@ -0,0 +1,205 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); + +namespace phi { +namespace tests { + +using DDim = phi::DDim; +using float16 = phi::dtype::float16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +__global__ void FillTensor(float* data) { data[0] = 1; } + +TEST(Scalar, ConstructFromDenseTensor1) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromDenseTensor2) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor3) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(1, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor4) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = true; + phi::Scalar scalar_test(dense_x); + ASSERT_EQ(true, scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor5) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX64, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex64 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor6) { + // 1. 
create tensor + const auto alloc = + std::make_unique(phi::CPUPlace()); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::COMPLEX128, + phi::make_ddim({1}), + phi::DataLayout::NCHW)); + phi::CPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + dense_x_data[0] = 1; + phi::Scalar scalar_test(dense_x); + complex128 expected_value(1, 0); + EXPECT_TRUE(expected_value == scalar_test.to()); +} + +TEST(Scalar, ConstructFromDenseTensor7) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + phi::DenseTensor dense_x( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + + auto* dense_x_data = dev_ctx.Alloc(&dense_x); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + phi::Scalar scalar_test(dense_x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +TEST(Scalar, ConstructFromTensor) { + // 1. create tensor + const auto alloc = + std::make_unique(phi::GPUPlace()); + auto dense_x = std::make_shared( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + + phi::GPUContext dev_ctx; + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace()) + .get()); + dev_ctx.Init(); + auto* dense_x_data = dev_ctx.Alloc(dense_x.get()); + FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); + dev_ctx.Wait(); + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Scalar scalar_test(x); + ASSERT_NEAR(1, scalar_test.to(), 1e-6); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 824d188457815..7d2fd90e6bb7b 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) From 4733fe601a46c4f1d1fa4c77c8eeb60638aabb50 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 14 Apr 2022 20:37:56 +0800 Subject: [PATCH 161/211] remove all is initialized using (#41766) --- paddle/fluid/distributed/collective/reducer.cc | 4 ++-- paddle/fluid/eager/grad_node_info.cc | 4 ++-- paddle/fluid/eager/to_static/run_program_op_node.h | 2 +- paddle/fluid/eager/utils.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 4 ++-- paddle/phi/api/lib/tensor_method.cc | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 63e92444b32cb..75153df936b1c 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -398,7 +398,7 @@ void EagerReducer::InitializeDenseGroups( "GRAD is SelectedRows", tensor_name)); - PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + PADDLE_ENFORCE_EQ(tensor.initialized(), true, platform::errors::PreconditionNotMet( 
"Tensor %s is not initialized.", tensor_name)); const auto size = tensor.numel(); @@ -710,7 +710,7 @@ void EagerReducer::MarkGroupReady(size_t group_index) { bool EagerReducer::HasGrad(size_t var_index) { auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); - if (grad && grad->is_initialized()) { + if (grad && grad->initialized()) { return true; } else { return false; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 72b84b9db3210..5b4921320f6b0 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -125,7 +125,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, auto& meta = metas[0]; meta.SetStopGradient(fwd_out_meta->StopGradient()); - if (!fwd_out.is_initialized()) { + if (!fwd_out.initialized()) { VLOG(6) << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; return; @@ -192,7 +192,7 @@ void GradNodeBase::SetGradInMeta( meta.SetStopGradient(fwd_out_meta->StopGradient()); } - if (!fwd_out_tensor.is_initialized()) { + if (!fwd_out_tensor.initialized()) { VLOG(6) << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; return; diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 46f48778a9656..9347a76fd48f0 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -114,7 +114,7 @@ static void ShareTensorsIntoScope(const std::vector &tensors, paddle::framework::Scope *scope) { for (size_t i = 0; i < tensors.size(); ++i) { auto name = tensors[i].name(); - if (name == "Fake_var" || !tensors[i].is_initialized()) { + if (name == "Fake_var" || !tensors[i].initialized()) { continue; } auto *var = scope->Var(name); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 756563df4dfe7..66d877f06e21d 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -446,7 +446,7 @@ void EagerUtils::FillZeroForEmptyGradInputs( for (size_t i = 0; i < in_grads->size(); i++) { for (size_t j = 0; j < (*in_grads)[i].size(); j++) { paddle::experimental::Tensor& grad = (*in_grads)[i][j]; - if (!grad.is_initialized()) { + if (!grad.initialized()) { const GradSlotMeta& grad_in_meta = grad_in_metas[i][j]; PADDLE_ENFORCE( grad_in_meta.HasTensorMeta(), diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4610196726e75..8304db13c468e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -617,7 +617,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, // if index is a list, list_select_flag will be true bool list_select_flag = false; PADDLE_ENFORCE_EQ( - self->tensor.is_initialized(), true, + self->tensor.initialized(), true, platform::errors::InvalidArgument( "tensor %s has not been initialized, we can only slice initialized " "tensor please init it first with numpy or other tensor.", @@ -1146,7 +1146,7 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY auto src = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); - if (self->tensor.is_initialized()) { + if (self->tensor.initialized()) { PADDLE_ENFORCE_EQ(self->tensor.dtype(), src.dtype(), platform::errors::PreconditionNotMet( "Tensor %s has different data type with Tensor %s", diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 
79519f67d2ad3..51d4ec28200cb 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -73,7 +73,7 @@ Tensor::copy_to(const Place &target_place) const; void Tensor::copy_(const Tensor &src, const phi::Place &target_place, bool blocking) { - if (!src.is_initialized()) { + if (!src.initialized()) { VLOG(8) << "Src is empty, skip copy"; return; } @@ -81,7 +81,7 @@ void Tensor::copy_(const Tensor &src, auto kernel_key_set = ParseKernelKeyByInputArgs(src); KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (is_initialized()) { + if (initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), phi::errors::PreconditionNotMet( From fbe2c31162d40b36bec0aabe12cd4780c10959ef Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Thu, 14 Apr 2022 20:41:35 +0800 Subject: [PATCH 162/211] [KP] Add registry for elementwise_add/max/min/sub/div/mul/floordiv on XPU2 with KP lib (#41494) * regist elementwise_xxx --- .../new_executor/standalone_executor_test.cc | 4 ++ .../elementwise/elementwise_add_op.kps | 2 +- .../platform/device/xpu/xpu_op_kpfirst_list.h | 12 ++++++ .../phi/kernels/funcs/elementwise_functor.h | 4 ++ .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../{gpu => kps}/elementwise_kernel.cu | 41 +++++++++++++------ 6 files changed, 51 insertions(+), 14 deletions(-) rename paddle/phi/kernels/{gpu => kps}/elementwise_kernel.cu (84%) diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index fbcbb2ca23bcb..cbbb802b67d76 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -69,7 +69,11 @@ PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_XPU_KP PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT); +#else +PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +#endif PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index 3b7457d72e15d..ecd52a310acdb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -58,4 +58,4 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel>, ops::ElementwiseAddKernel>); -#endif \ No newline at end of file +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f1742f8eb5a1f..9afde00a98be8 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,18 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_sub", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_max", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_mul", + 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_floordiv", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, // activation op {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 0ea5ff0e82e76..8d9dd65786705 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -542,7 +542,9 @@ struct InverseModuloFunctor< template struct FloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { +#ifndef PADDLE_WITH_XPU_KP PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); +#endif return static_cast(std::trunc(a / b)); } }; @@ -550,7 +552,9 @@ struct FloorDivideFunctor { template struct InverseFloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { +#ifndef PADDLE_WITH_XPU_KP PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO); +#endif return static_cast(std::trunc(b / a)); } }; diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index 0e69d00110ead..d5c2c559b2c06 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -17,7 +17,7 @@ #include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/broadcast_function.h" #endif diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu similarity index 84% rename from paddle/phi/kernels/gpu/elementwise_kernel.cu rename to paddle/phi/kernels/kps/elementwise_kernel.cu index 73964d31a34ee..01a34c0f85eda 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" +#endif #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" @@ -40,7 +42,6 @@ namespace phi { /** * Kernels */ - // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -62,19 +63,34 @@ DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} +PD_REGISTER_KERNEL( + subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} +PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} +PD_REGISTER_KERNEL( + multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} +PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { +} +PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { +} +PD_REGISTER_KERNEL( + floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {} + +#else using float16 = phi::dtype::float16; using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL( - fmax, GPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {} + fmax, KPS, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( - fmin, GPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} + fmin, KPS, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL(add_raw, - GPU, + KPS, ALL_LAYOUT, phi::AddRawKernel, float, @@ -87,7 +103,7 @@ PD_REGISTER_KERNEL(add_raw, complex64, complex128) {} PD_REGISTER_KERNEL(subtract_raw, - GPU, + KPS, ALL_LAYOUT, phi::SubtractRawKernel, float, @@ -100,7 +116,7 @@ PD_REGISTER_KERNEL(subtract_raw, complex64, complex128) {} PD_REGISTER_KERNEL(divide_raw, - GPU, + KPS, ALL_LAYOUT, phi::DivideRawKernel, float, @@ -112,7 +128,7 @@ PD_REGISTER_KERNEL(divide_raw, complex64, complex128) {} PD_REGISTER_KERNEL(multiply_raw, - GPU, + KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float, @@ -125,7 +141,7 @@ PD_REGISTER_KERNEL(multiply_raw, complex128, bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, - GPU, + KPS, ALL_LAYOUT, phi::MaximumRawKernel, float, @@ -135,7 +151,7 @@ PD_REGISTER_KERNEL(maximum_raw, float16, bfloat16) {} PD_REGISTER_KERNEL(minimum_raw, - GPU, + KPS, ALL_LAYOUT, phi::MinimumRawKernel, float, @@ -145,7 +161,7 @@ PD_REGISTER_KERNEL(minimum_raw, float16, bfloat16) {} PD_REGISTER_KERNEL(modulo_raw, - GPU, + KPS, ALL_LAYOUT, phi::ModuloRawKernel, float, @@ -153,16 +169,17 @@ PD_REGISTER_KERNEL(modulo_raw, int, int64_t) {} PD_REGISTER_KERNEL(floor_divide_raw, - GPU, + KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int, int64_t) {} PD_REGISTER_KERNEL(elementwise_pow_raw, - GPU, + KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float, double, int, int64_t) {} +#endif From 5087fe20c6984261308d5895522c524cefbe64f8 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 14 Apr 2022 20:46:04 +0800 Subject: [PATCH 163/211] fix xpu cmake lib name. 
test=kunlun (#41786) --- cmake/external/xpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d453e9d2a2acd..eb2fea91a6290 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -52,7 +52,7 @@ ELSEIF(WITH_CENTOS) SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSE() SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # default: use output by XDNN API team SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) From a4f3c0e94bcc0db0a39abfec6481c6921e2c1245 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 14 Apr 2022 21:08:39 +0800 Subject: [PATCH 164/211] fix divide zero error when cpu only (#41794) --- python/paddle/profiler/profiler_statistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index e4d4ff8c183bc..7465a8e80ff8e 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -869,7 +869,7 @@ def format_ratio(ratio, indent=0): '{} / - / - / - / {}'.format( format_time( other_gpu_time, unit=time_unit), - format_ratio(float(other_gpu_time) / gpu_total_time)) + format_ratio(float(other_gpu_time) / total_time)) ] all_row_values.append(row_values) # Calculate the column width From e7f0aa38aba0c831657c8de324b87d4ac927dad9 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Thu, 14 Apr 2022 21:47:37 +0800 Subject: [PATCH 165/211] fix dtype bug (#41802) --- python/paddle/fluid/layers/tensor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 3a8dfdc858079..5fa110e4db0fa 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -678,8 +678,7 @@ def assign(input, output=None): raise ValueError("The size of input is too big. 
Please consider " "saving it to file and 'load_op' to load it") if output is None: - output = helper.create_variable_for_type_inference( - dtype=input.dtype) + output = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( type='assign_value', outputs={'Out': [output]}, From 42abcc08c6f3186d23ab1f477e45c5b58067d0b5 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 15 Apr 2022 09:56:37 +0800 Subject: [PATCH 166/211] fix batch norm memory issue (#41717) * try to fix batch norm memory issue * fix batch norm memroy alloc bug * polish some code --- paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 7 ++++--- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 09bce3c9895b3..e15b4cc10d97e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -570,7 +570,8 @@ void BatchNormGradRawKernel(const Context &ctx, /*sizeInBytes=*/&workspace_size)); workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = ctx.template Alloc(&workspace_tensor); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( @@ -603,8 +604,8 @@ void BatchNormGradRawKernel(const Context &ctx, /*activationDesc=*/nullptr, /*workspace=*/workspace_ptr, /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), + /*reserveSpace=*/const_cast( + reserve_space->template data()), /*reserveSpaceSizeInBytes=*/reserve_space_size)); #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 74a523f4ecf94..361e62e566035 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -498,9 +498,11 @@ void BatchNormKernel(const Context &ctx, /*sizeInBytes=*/&reserve_space_size)); reserve_space->Resize({static_cast(reserve_space_size)}); - reserve_space_ptr = ctx.template Alloc(reserve_space); + reserve_space_ptr = + static_cast(ctx.template Alloc(reserve_space)); workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = ctx.template Alloc(&workspace_tensor); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, From 605552a97d126925dba74fc6bf11a591221486a0 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:23:51 +0800 Subject: [PATCH 167/211] [Auto Parallel]update cluster (#41722) * update cluster --- .../distributed/auto_parallel/cluster.py | 249 +- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../unittests/auto_parallel/test_cluster.py | 2022 +++++++++++++++++ 3 files changed, 2271 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index ef05ff7c7460e..3685729cb6c29 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,6 +43,8 @@ class LinkType(IntEnum): class Device: + NON_ACCELERATOR_TYPE = [DeviceType.CPU, DeviceType.NIC, DeviceType.UNKNOWN] + def __init__(self, global_id, local_id, machine): self._global_id = global_id self._local_id = local_id @@ -134,6 +136,10 @@ def __repr__(self): class Link: + + default_hop = 1 + default_nic_bandwith = 24 + def __init__(self, source, target): self._src = source self._tgt = target @@ -142,6 +148,7 @@ def __init__(self, source, target): self._bandwidth = None # latency is stored by millisecond self._latency = None + self._hop = None @property def source(self): @@ -183,6 +190,14 @@ def latency(self): def latency(self, value): self._latency = value + @property + def hop(self): + return self._hop + + @hop.setter + def hop(self, value): + self._hop = value + def __str__(self): str = "" str += "source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}".format( @@ -202,6 +217,8 @@ def __init__(self, id): self._port = None self._devices = {} self._links = {} + self._accelerators = {} + self._non_accelerator_cumulative_count = 0 @property def id(self): @@ -243,14 +260,23 @@ def devices(self): def links(self): return self._links + @property + def accelerators(self): + return self._accelerators + def add_device(self, device): # Use the device global_id as the key self._devices[device.global_id] = device + if device.type not in Device.NON_ACCELERATOR_TYPE: + self._accelerators[device.global_id] = device def add_link(self, link): # Use the source device global_id and target device global_id as the key self._links[(link.source.global_id, link.target.global_id)] = link + def get_link(self, source_global_id, target_global_id): + return self._links.get((source_global_id, target_global_id), None) + def __str__(self): str = "" for device in self.devices.values(): @@ -263,6 +289,109 @@ def __repr__(self): return self.__str__() +class AlphaLatency: + def __init__(self, alpha_latency): + assert isinstance(alpha_latency, dict) + self._base = alpha_latency.get("base", None) + self._inter = alpha_latency.get("inter", None) + self._intra = alpha_latency.get("intra", None) + self._switch = alpha_latency.get("switch", None) + if self._switch is not None: + try: + self._switch = float(self._switch) + except: + raise TypeError("The switch latency must be float") + self._base_ring = self._base.get( + "ring", None) if self._base is not None else None + self._base_tree = self._base.get( + "tree", None) if self._base is not None else None + self._base_inter = self._base.get( + "inter", None) if self._base is not None else None + if self._base_ring is not None: + try: + self._base_ring = float(self._base_ring) + except: + raise TypeError("The base ring latency must be float.") + if self._base_tree is not None: + try: + self._base_tree = float(self._base_tree) + except: + raise TypeError("The base ring latency must be float.") + + self._inter_ring = self._inter.get("ring", None) + self._inter_tree = self._inter.get("tree", None) + self._intra_ring = self._intra.get("ring", None) + self._intra_tree = self._intra.get("tree", None) + + if self._inter_ring is not None: + if isinstance(self._inter_ring, str): + assert self._inter_ring in ["NET"] + self._inter_ring = LinkType[self._inter_ring] + else: + try: + self._inter_ring = float(self._inter_ring) + except: + raise 
TypeError("The inter ring latency must be float.") + + if self._inter_tree is not None: + if isinstance(self._inter_tree, str): + assert self._inter_tree in ["NET"] + self._inter_tree = LinkType[self._inter_tree] + else: + try: + self._inter_tree = float(self._inter_tree) + except: + raise TypeError("The inter tree latency must be float.") + + if self._intra_ring is not None: + if isinstance(self._intra_ring, str): + assert self._intra_ring in ["NVL", "PHB"] + self._intra_ring = LinkType[self._intra_ring] + else: + try: + self._intra_ring = float(self._intra_ring) + except: + raise TypeError("The intra ring latency must be float.") + + if self._intra_tree is not None: + if isinstance(self._intra_tree, str): + assert self._intra_tree in ["NVL", "PHB"] + self._intra_tree = LinkType[self._intra_tree] + else: + try: + self._intra_tree = float(self._intra_tree) + except: + raise TypeError("The intra tree latency must be float.") + + @property + def base_ring(self): + return self._base_ring + + @property + def base_tree(self): + return self._base_tree + + @property + def switch(self): + return self._switch + + @property + def inter_ring(self): + return self._inter_ring + + @property + def inter_tree(self): + return self._inter_tree + + @property + def intra_ring(self): + return self._intra_ring + + @property + def intra_tree(self): + return self._intra_tree + + class Cluster: """ The cluster is an abstract of the hardware resource for training, which contains the cluster topology and @@ -276,6 +405,18 @@ def __init__(self): self._machines = {} # Cluster graph topology self._topology = None + # Latency for communication cost model + self._alpha_latency = None + self._rank_to_device_id = {} + self._device_id_to_rank = {} + + @property + def rank_to_device_id(self): + return self._rank_to_device_id + + @property + def device_id_to_rank(self): + return self._device_id_to_rank @property def machines(self): @@ -285,6 +426,35 @@ def add_machine(self, machine): assert isinstance(machine, Machine) self._machines[machine.id] = machine + # map rank to device id and map device id to rank + if machine.id != 0: + prev_machine = self._machines[machine.id - 1] + offset = prev_machine._non_accelerator_cumulative_count + for global_id in machine.devices: + if machine.devices[ + global_id].type not in Device.NON_ACCELERATOR_TYPE: + rank_id = global_id - offset + self._rank_to_device_id[rank_id] = global_id + self._device_id_to_rank[global_id] = rank_id + machine._non_accelerator_cumulative_count = len( + machine.devices) - len( + machine.accelerators + ) + prev_machine._non_accelerator_cumulative_count + else: + for global_id in machine.devices: + if machine.devices[ + global_id].type not in Device.NON_ACCELERATOR_TYPE: + rank_id = global_id + self._rank_to_device_id[rank_id] = global_id + self._device_id_to_rank[global_id] = rank_id + machine.accelerators[global_id] = machine.devices[global_id] + machine._non_accelerator_cumulative_count = len( + machine.devices) - len(machine.accelerators) + + @property + def alpha_latency(self): + return self._alpha_latency + def add_device(self, device): assert isinstance(device, Device) device.machine.add_device(device) @@ -344,8 +514,23 @@ def build_from_file(self, json_file_path): link.type = link_type link.bandwidth = float(link_info.get("bandwidth", 0)) link.latency = float(link_info.get("latency", 0)) + link.hop = link_info.get("hop", None) + if link.hop is None: + # Set the default of hop: If in the same machine, hop is 0. And if in the different machine, hop is 1. 
+ source_machine = source.machine + target_machine = target.machine + if source_machine.id == target_machine.id: + link.hop = 0 + else: + link.hop = Link.default_hop self.add_link(link) + if "alpha_latency" in cluster_info: + self._alpha_latency = AlphaLatency( + cluster_info.get("alpha_latency")) + else: + self._alpha_latecy = None + def _generate_machine_id(self): cur_machine_id = self._num_machines self._num_machines += 1 @@ -359,6 +544,68 @@ def get_all_devices(self, device_type): devices.append(device) return devices + def get_beta(self, source_device_id, target_device_id): + # beta means the time transferring a byte, us/B + beta = None + convert_base = 1000 + device = self.get_device(source_device_id) + machine = device.machine + link = machine.get_link(source_device_id, target_device_id) + bandwidth = None + # None means the source and target are not connected directly, set NIC in default + if link is None: + bandwidth = Link.default_nic_bandwith + else: + bandwidth = link.bandwidth + + if bandwidth == 0.: + beta = 0 + else: + beta = 1 / (bandwidth * (convert_base**3 / 10**6)) + + return beta + + def get_hop(self, source_device_id, target_device_id): + beta = None + hop = None + device = self.get_device(source_device_id) + machine = device.machine + link = machine.get_link(source_device_id, target_device_id) + if link is not None: + hop = link.hop + else: + hop = Link.default_hop + return hop + + def cross_machine(self, device_ids): + machine_ids = set() + for device_id in device_ids: + device = self.get_device(device_id) + machine_id = device.machine.id + machine_ids.add(machine_id) + if len(machine_ids) == 1: + return False + else: + return True + + def convert_rank_to_device_id(self, group_ranks): + # group_ranks is global id of the rank in paddle + # task will use all of machine in this cluster with accelerators in default + device_ids = [] + for rank in group_ranks: + device_ids.append(self.rank_to_device_id[rank]) + return device_ids + + def get_involved_machine_count(self, device_ids): + machine_ids = set() + for device_id in device_ids: + device = self.get_device(device_id) + machine_id = device.machine.id + machine_ids.add(machine_id) + count = len(machine_ids) + assert count > 0 + return count + def __str__(self): str = "" for machine in self.machines.values(): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index c16936db5a334..87031fe09e5a8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -18,4 +18,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS}) + py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py new file mode 100644 index 0000000000000..dc22263b52040 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -0,0 +1,2022 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster + +cluster_json = """ +{ + "alpha_latency": {"inter": {"ring": "NET", "tree": "NET"}, + "intra": {"ring": "NVL", "tree": "PHB"}, + "base": {"ring": 8.4, "tree": 0}, + "switch": 10.0}, + "machines": [ + { + "hostname": "yq01-sys-hic-v100-box-a225-0266", + "addr": "10.127.9.147", + "port": "60009", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 2, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 3, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 4, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 5, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 6, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 7, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 8, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", + "memory": "502", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.9.147" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 0, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + 
"source_global_id": 1, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 2, + "type": "NVB", + 
"bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 3, + 
"type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + } + ] + } + ] +} +""" + +multi_cluster_json = """{ + "machines": [ + { + "hostname": "yq01-sys-hic-v100-box-a225-0266", + "addr": "10.127.9.147", + "port": "60009", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 2, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 3, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 4, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 5, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 6, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 7, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 8, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", + "memory": "502", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.9.147" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 0, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 0, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 4, + "type": 
"NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 1, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 1, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 7, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 2, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 2, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 4, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 5, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 6, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 3, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 3, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 4, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + 
"target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 5, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 3, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 6, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 0, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 1, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 2, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 4, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 7, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 1, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 2, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 3, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 24.0 + }, + { + 
"source_global_id": 9, + "target_global_id": 5, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 6, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 7, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 8, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 9, + "target_global_id": 19, + "type": "NET", + "bandwidth": 24.0 + } + ] + }, + { + "hostname": "yq01-sys-hic-k8s-v100-box-a225-0751", + "addr": "10.127.43.24", + "port": "60009", + "devices": [ + { + "global_id": 10, + "local_id": 0, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 11, + "local_id": 1, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 12, + "local_id": 2, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 13, + "local_id": 3, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 14, + "local_id": 4, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 15, + "local_id": 5, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 16, + "local_id": 6, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 17, + "local_id": 7, + "type": "GPU", + "model": " Tesla V100-SXM2-32GB", + "memory": "32", + "sp_gflops": "15700", + "dp_gflops": "7800" + }, + { + "global_id": 18, + "local_id": 0, + "type": "CPU", + "arch": "x86_64", + "vendor": "GenuineIntel", + "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", + "memory": "503", + "sp_gflops": "150", + "dp_gflops": "75" + }, + { + "global_id": 19, + "local_id": 0, + "type": "NIC", + "width": 12.5, + "ip": "10.127.43.24" + } + ], + "links": [ + { + "source_global_id": 10, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 10, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 10, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 11, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, 
+ "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 11, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 11, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 12, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 17, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 12, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 12, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 13, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 14, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 15, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 16, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 13, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 13, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 14, + "target_global_id": 10, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 14, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 14, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 15, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 11, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 
15, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 15, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 15, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 16, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 12, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 13, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 17, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 16, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 16, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 17, + "target_global_id": 10, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 11, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 12, + "type": "NVB", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 13, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 14, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 15, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 16, + "type": "NVL", + "bandwidth": 235.0 + }, + { + "source_global_id": 17, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 17, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 10, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 11, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 12, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 13, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 14, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 15, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 16, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 17, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 18, + "target_global_id": 19, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 10, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 11, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 12, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 13, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + 
"target_global_id": 14, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 15, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 16, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 17, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 18, + "type": "PHB", + "bandwidth": 24.0 + }, + { + "source_global_id": 19, + "target_global_id": 9, + "type": "NET", + "bandwidth": 24.0 + } + ] + } + ] +} +""" + + +class TestCluster(unittest.TestCase): + def test_single_machine(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + beta = cluster.get_beta(0, 1) + hop = cluster.get_hop(0, 1) + cross_machine = cluster.cross_machine([0, 1]) + devices = cluster.convert_rank_to_device_id([0, 1, 2, 3]) + involved_machine_count = cluster.get_involved_machine_count(devices) + self.assertTrue(beta > 0) + self.assertTrue(hop == 0) + self.assertTrue(not cross_machine) + self.assertTrue(devices == [0, 1, 2, 3]) + self.assertTrue(involved_machine_count == 1) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_multi_machine(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(multi_cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + beta = cluster.get_beta(0, 11) + hop = cluster.get_hop(0, 11) + cross_machine = cluster.cross_machine([0, 11]) + devices = cluster.convert_rank_to_device_id([5, 6, 7, 8]) + involved_machine_count = cluster.get_involved_machine_count(devices) + self.assertTrue(beta > 0) + self.assertTrue(hop >= 0) + self.assertTrue(cross_machine) + self.assertTrue(devices == [5, 6, 7, 10]) + self.assertTrue(involved_machine_count == 2) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() From fc208b7efe7307b0d286410aa9e7ca7c5ca410bd Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:38:56 +0800 Subject: [PATCH 168/211] [MLU] add mlu new profiler (#41138) * [MLU] add mlu new profiler * fix format --- paddle/fluid/platform/device/mlu/mlu_info.h | 4 +- paddle/fluid/platform/profiler/CMakeLists.txt | 3 +- .../platform/profiler/chrometracing_logger.cc | 24 +- .../platform/profiler/mlu/CMakeLists.txt | 5 + .../profiler/mlu/cnpapi_data_process.cc | 263 ++++++++++++++++++ .../profiler/mlu/cnpapi_data_process.h | 35 +++ .../fluid/platform/profiler/mlu/mlu_tracer.cc | 154 ++++++++++ .../fluid/platform/profiler/mlu/mlu_tracer.h | 60 ++++ paddle/fluid/platform/profiler/profiler.cc | 12 + paddle/fluid/platform/profiler/profiler.h | 5 +- paddle/fluid/platform/profiler/trace_event.h | 2 + .../platform/profiler/trace_event_collector.h | 7 + paddle/fluid/pybind/pybind.cc | 2 + python/paddle/profiler/profiler.py | 11 +- 14 files 
changed, 574 insertions(+), 13 deletions(-) create mode 100644 paddle/fluid/platform/profiler/mlu/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc create mode 100644 paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h create mode 100644 paddle/fluid/platform/profiler/mlu/mlu_tracer.cc create mode 100644 paddle/fluid/platform/profiler/mlu/mlu_tracer.h diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h index fcf06cb4f1c40..12c206ef2c445 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.h +++ b/paddle/fluid/platform/device/mlu/mlu_info.h @@ -16,7 +16,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_MLU #include +#include #include +#include #include #ifdef PADDLE_WITH_CNCL #include @@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t; #endif using mluStream = cnrtQueue_t; using mluCnnlHandle = cnnlHandle_t; -using mluEventHandle = CNnotifier; +using mluEventHandle = cnrtNotifier_t; using mluDeviceHandle = CNdev; namespace platform { diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index c903a52530ccb..084bc44dbc78b 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,12 +1,13 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +add_subdirectory(mlu) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) add_subdirectory(dump) cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index d7879e7be517e..4ee95a530fb43 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -38,10 +38,12 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "Operator", "Dataloader", "ProfileStep", "CudaRuntime", - "Kernel", "Memcpy", "Memset", "UserDefined", - "OperatorInner", "Forward", "Backward", "Optimization", - "Communication", "PythonOp", "PythonUserDefined"}; + "Operator", "Dataloader", "ProfileStep", + "CudaRuntime", "Kernel", "Memcpy", + "Memset", "UserDefined", "OperatorInner", + "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined", + "MluRuntime"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, @@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName( (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); } +#ifdef PADDLE_WITH_MLU + static std::string 
device_type("MLU"); +#else + static std::string device_type("GPU"); +#endif + for (auto it = deviceid_streamid_set_.begin(); it != deviceid_streamid_set_.end(); ++it) { output_file_stream_ << string_format( @@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName( "name": "process_name", "pid": %lld, "tid": %lld, "ph": "M", "args": { - "name": "Deivce %lld (GPU)" + "name": "Deivce %lld (%s)" } }, { @@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName( } }, )JSON"), - (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, - (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, - (*it).first, (*it).second, (*it).second); + (*it).first, (*it).second, (*it).first, device_type.c_str(), + (*it).first, (*it).second, (*it).second, (*it).first, (*it).second, + (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second); } } diff --git a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt new file mode 100644 index 0000000000000..01b3757ea6912 --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_MLU) + set(MLU_INFO mlu_info) +endif() + +cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO}) diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc new file mode 100644 index 0000000000000..eceb5fabe8dba --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/os_info.h" + +#ifdef PADDLE_WITH_MLU +namespace paddle { +namespace platform { + +namespace { + +inline uint64_t GetTimeGap() { + static uint64_t time_gap = []() -> uint64_t { + uint64_t cpu_time = PosixInNsec(); + uint64_t mlu_time = cnpapiGetTimestamp(); + return (cpu_time - mlu_time); + }(); + return time_gap; +} + +void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns, + TraceEventCollector* collector) { + static uint64_t time_gap = GetTimeGap(); + if (kernel->start + time_gap < start_ns) { + return; + } + DeviceTraceEvent event; + event.name = demangle(kernel->name); + event.type = TracerEventType::Kernel; + event.start_ns = kernel->start + time_gap; + event.end_ns = kernel->end + time_gap; + event.device_id = kernel->device_id; + event.context_id = kernel->context_id; + event.stream_id = kernel->queue_id; + event.correlation_id = kernel->correlation_id; + event.kernel_info.block_x = kernel->dimx; + event.kernel_info.block_y = kernel->dimy; + event.kernel_info.block_z = kernel->dimz; + event.kernel_info.grid_x = kernel->kernel_type; + event.kernel_info.grid_y = 0; + event.kernel_info.grid_z = 0; + event.kernel_info.queued = kernel->queued; + event.kernel_info.submitted = kernel->submitted; + event.kernel_info.completed = kernel->received; + collector->AddDeviceEvent(std::move(event)); +} + +const char* MemcpyKind(cnpapiActivityMemcpyType kind) { + switch (kind) { + case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD: + return "MEMCPY_HtoD"; + case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH: + return "MEMCPY_DtoH"; + case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD: + return "MEMCPY_DtoD"; + case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH: + return "MEMCPY_HtoH"; + case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP: + return "MEMCPY_PtoP"; + default: + break; + } + return "MEMCPY"; +} + +void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns, + TraceEventCollector* collector) { + static uint64_t time_gap = GetTimeGap(); + if (memcpy->start + time_gap < start_ns) { + return; + } + DeviceTraceEvent event; + event.name = MemcpyKind(memcpy->copy_type); + event.type = TracerEventType::Memcpy; + event.start_ns = memcpy->start + time_gap; + event.end_ns = memcpy->end + time_gap; + event.device_id = memcpy->device_id; + event.context_id = memcpy->context_id; + event.stream_id = memcpy->queue_id; + event.correlation_id = memcpy->correlation_id; + event.memcpy_info.num_bytes = memcpy->bytes; + snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", + MemcpyKind(memcpy->copy_type)); + collector->AddDeviceEvent(std::move(event)); +} + +void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2, + uint64_t start_ns, TraceEventCollector* collector) { + static uint64_t time_gap = GetTimeGap(); + if (memcpy2->start + time_gap < start_ns) { + return; + } + DeviceTraceEvent event; + event.name = MemcpyKind(memcpy2->copy_type); + event.type = TracerEventType::Memcpy; + event.start_ns = memcpy2->start + time_gap; + event.end_ns = memcpy2->end + time_gap; + event.device_id = memcpy2->device_id; + event.context_id = memcpy2->context_id; + event.stream_id = memcpy2->queue_id; + event.correlation_id = memcpy2->correlation_id; + event.memcpy_info.num_bytes = memcpy2->bytes; + snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", + MemcpyKind(memcpy2->copy_type)); + collector->AddDeviceEvent(std::move(event)); +} + +void AddMemsetRecord(const 
cnpapiActivityMemset* memset, uint64_t start_ns, + TraceEventCollector* collector) { + static uint64_t time_gap = GetTimeGap(); + if (memset->start + time_gap < start_ns) { + return; + } + DeviceTraceEvent event; + event.name = "MEMSET"; + event.type = TracerEventType::Memset; + event.start_ns = memset->start + time_gap; + event.end_ns = memset->end + time_gap; + event.device_id = memset->device_id; + event.context_id = memset->context_id; + event.stream_id = memset->queue_id; + event.correlation_id = memset->correlation_id; + event.memset_info.num_bytes = memset->bytes; + event.memset_info.value = memset->value; + collector->AddDeviceEvent(std::move(event)); +} + +class CnpapiRuntimeCbidStr { + public: + static const CnpapiRuntimeCbidStr& GetInstance() { + static CnpapiRuntimeCbidStr inst; + return inst; + } + + std::string RuntimeKind(cnpapi_CallbackId cbid) const { + auto iter = cbid_str_.find(cbid); + if (iter == cbid_str_.end()) { + return "MLU Runtime API " + std::to_string(cbid); + } + return iter->second; + } + + private: + CnpapiRuntimeCbidStr(); + + std::unordered_map cbid_str_; +}; + +CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() { +#define REGISTER_RUNTIME_CBID_STR(cbid) \ + cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid + + REGISTER_RUNTIME_CBID_STR(cnMalloc); + REGISTER_RUNTIME_CBID_STR(cnMallocHost); + REGISTER_RUNTIME_CBID_STR(cnFree); + REGISTER_RUNTIME_CBID_STR(cnFreeHost); + REGISTER_RUNTIME_CBID_STR(cnMemcpy); + REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer); + REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD); + REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync); + REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D); + REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D); + REGISTER_RUNTIME_CBID_STR(cnMemcpy2D); + REGISTER_RUNTIME_CBID_STR(cnMemcpy3D); + REGISTER_RUNTIME_CBID_STR(cnMemsetD8); + REGISTER_RUNTIME_CBID_STR(cnMemsetD16); + REGISTER_RUNTIME_CBID_STR(cnMemsetD32); + REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async); + REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async); + REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async); + REGISTER_RUNTIME_CBID_STR(cnInvokeKernel); + REGISTER_RUNTIME_CBID_STR(cnCreateQueue); + REGISTER_RUNTIME_CBID_STR(cnDestroyQueue); + REGISTER_RUNTIME_CBID_STR(cnQueueSync); + REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier); + REGISTER_RUNTIME_CBID_STR(cnWaitNotifier); + REGISTER_RUNTIME_CBID_STR(cnCreateNotifier); + REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier); + REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier); + REGISTER_RUNTIME_CBID_STR(cnCtxCreate); + REGISTER_RUNTIME_CBID_STR(cnCtxDestroy); + REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent); + REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent); + REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice); + REGISTER_RUNTIME_CBID_STR(cnCtxSync); +#undef REGISTER_RUNTIME_CBID_STR +} + +void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns, + TraceEventCollector* collector) { + static uint64_t time_gap = GetTimeGap(); + if (api->start + time_gap < start_ns) { + return; + } + RuntimeTraceEvent event; + event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid); + event.start_ns = api->start + time_gap; + event.end_ns = api->end + time_gap; + event.process_id = api->process_id; + event.thread_id = api->thread_id; + event.correlation_id = api->correlation_id; + event.callback_id = api->cbid; + event.type = TracerEventType::MluRuntime; + 
collector->AddRuntimeEvent(std::move(event)); +} + +} // namespace + +namespace details { + +void ProcessCnpapiActivityRecord(const cnpapiActivity* record, + uint64_t start_ns, + TraceEventCollector* collector) { + switch (record->type) { + case CNPAPI_ACTIVITY_TYPE_KERNEL: + AddKernelRecord(reinterpret_cast(record), + start_ns, collector); + break; + case CNPAPI_ACTIVITY_TYPE_MEMCPY: + AddMemcpyRecord(reinterpret_cast(record), + start_ns, collector); + break; + case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP: + AddMemcpy2Record( + reinterpret_cast(record), start_ns, + collector); + break; + case CNPAPI_ACTIVITY_TYPE_MEMSET: + AddMemsetRecord(reinterpret_cast(record), + start_ns, collector); + break; + case CNPAPI_ACTIVITY_TYPE_CNDRV_API: + AddApiRecord(reinterpret_cast(record), start_ns, + collector); + break; + default: + break; + } +} + +} // namespace details +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h new file mode 100644 index 0000000000000..1f00b46d2c2ae --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif +#include "paddle/fluid/platform/profiler/trace_event_collector.h" + +namespace paddle { +namespace platform { +namespace details { + +#ifdef PADDLE_WITH_MLU +void ProcessCnpapiActivityRecord(const cnpapiActivity* record, + uint64_t start_ns, + TraceEventCollector* collector); +#endif + +} // namespace details +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc new file mode 100644 index 0000000000000..2d719a8bbfdcb --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
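
Note on cnpapi_data_process.cc above: the records delivered by cnpapi carry device-side timestamps, so the converter captures a one-time host/device clock offset (GetTimeGap) and adds it to every start/end before comparing against tracing_start_ns. Below is a minimal sketch of that clock-alignment idea under stated assumptions: HostNowNs, DeviceNowNs and DeviceToHostNs are stand-ins, and the device clock is simulated; the real code uses PosixInNsec() and cnpapiGetTimestamp().

    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    static uint64_t HostNowNs() {
      return std::chrono::duration_cast<std::chrono::nanoseconds>(
                 std::chrono::steady_clock::now().time_since_epoch())
          .count();
    }

    // Hypothetical device clock that runs on its own epoch.
    static uint64_t DeviceNowNs() { return HostNowNs() - 5'000'000'000ULL; }

    static uint64_t DeviceToHostNs(uint64_t device_ts) {
      // Captured once and reused, so host and device events share one timeline.
      static const uint64_t time_gap = HostNowNs() - DeviceNowNs();
      return device_ts + time_gap;
    }

    int main() {
      uint64_t device_event_start = DeviceNowNs();
      std::printf("aligned start: %llu\n",
                  static_cast<unsigned long long>(
                      DeviceToHostNs(device_event_start)));
      return 0;
    }
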
+ +#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" +#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h" + +#define CNPAPI_CALL(call) \ + do { \ + cnpapiResult _status = call; \ + if (_status != CNPAPI_SUCCESS) { \ + const char* errstr; \ + cnpapiGetResultString(_status, &errstr); \ + LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \ + } \ + } while (0) + +namespace paddle { +namespace platform { + +namespace { + +void BufferRequestedCallback(uint64_t** buffer, size_t* size, + size_t* max_num_records) { + constexpr size_t kBufferSize = 1 << 23; // 8 MB + constexpr size_t kBufferAlignSize = 8; + *buffer = reinterpret_cast( + paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize)); + *size = kBufferSize; + *max_num_records = 0; +} + +void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) { + if (buffer == nullptr || valid_size == 0) { + return; + } + auto mlu_tracer = &MluTracer::GetInstance(); + mlu_tracer->ProcessCnpapiActivity(buffer, valid_size); + + paddle::framework::AlignedFree(buffer); +} + +} // namespace + +MluTracer::MluTracer() { +#ifdef PADDLE_WITH_MLU + CNPAPI_CALL(cnpapiInit()); + CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback, + BufferCompletedCallback)); +#endif +} + +void MluTracer::PrepareTracing() { + PADDLE_ENFORCE_EQ( + state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true, + platform::errors::PreconditionNotMet("MluTracer must be UNINITED")); + EnableCnpapiActivity(); + state_ = TracerState::READY; +} + +void MluTracer::StartTracing() { + PADDLE_ENFORCE_EQ(state_ == TracerState::READY, true, + platform::errors::PreconditionNotMet( + "MluTracer must be READY or STOPPED")); + tracing_start_ns_ = PosixInNsec(); + state_ = TracerState::STARTED; +} + +void MluTracer::StopTracing() { + PADDLE_ENFORCE_EQ( + state_, TracerState::STARTED, + platform::errors::PreconditionNotMet("MluTracer must be STARTED")); + DisableCnpapiActivity(); + state_ = TracerState::STOPED; +} + +void MluTracer::CollectTraceData(TraceEventCollector* collector) { + PADDLE_ENFORCE_EQ( + state_, TracerState::STOPED, + platform::errors::PreconditionNotMet("MluTracer must be STOPED")); + for (auto he : collector_.HostEvents()) { + collector->AddHostEvent(std::move(he)); + } + for (auto rte : collector_.RuntimeEvents()) { + collector->AddRuntimeEvent(std::move(rte)); + } + for (auto de : collector_.DeviceEvents()) { + collector->AddDeviceEvent(std::move(de)); + } + for (auto tn : collector_.ThreadNames()) { + collector->AddThreadName(tn.first, tn.second); + } + collector_.ClearAll(); +} + +void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) { +#ifdef PADDLE_WITH_MLU + cnpapiActivity* record = nullptr; + while (true) { + cnpapiResult status = + cnpapiActivityGetNextRecord(buffer, valid_size, &record); + if (status == CNPAPI_SUCCESS) { + details::ProcessCnpapiActivityRecord(record, tracing_start_ns_, + &collector_); + } else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY || + status == CNPAPI_ERROR_MAX_LIMIT_REACHED) { + break; + } else { + CNPAPI_CALL(status); + } + } +#endif +} + +void MluTracer::EnableCnpapiActivity() { +#ifdef PADDLE_WITH_MLU + CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL)); + CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY)); + 
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP)); + CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET)); + CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API)); + VLOG(3) << "enable cnpapi activity"; +#endif +} + +void MluTracer::DisableCnpapiActivity() { +#ifdef PADDLE_WITH_MLU + CNPAPI_CALL(cnpapiActivityFlushAll()); + CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL)); + CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY)); + CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP)); + CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET)); + CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API)); + VLOG(3) << "disable cnpapi activity"; +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.h b/paddle/fluid/platform/profiler/mlu/mlu_tracer.h new file mode 100644 index 0000000000000..43c712b13ae2c --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/tracer_base.h" + +namespace paddle { +namespace platform { + +class MluTracer : public TracerBase { + public: + static MluTracer& GetInstance() { + static MluTracer instance; + return instance; + } + + void PrepareTracing() override; + + void StartTracing() override; + + void StopTracing() override; + + void CollectTraceData(TraceEventCollector* collector) override; + + void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size); + + private: + MluTracer(); + + DISABLE_COPY_AND_ASSIGN(MluTracer); + + void EnableCnpapiActivity(); + + void DisableCnpapiActivity(); + + uint64_t tracing_start_ns_ = UINT64_MAX; + + TraceEventCollector collector_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index ac46fbed10a20..a417eda1509e5 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" +#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/utils.h" @@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() { return supported; } +bool Profiler::IsCnpapiSupported() { + bool supported = false; +#ifdef PADDLE_WITH_MLU + supported = true; +#endif + return supported; +} + Profiler::Profiler(const ProfilerOptions& options) { options_ = options; std::bitset<32> trace_switch(options_.trace_switch); 
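The profiler.cc hunk above adds the IsCnpapiSupported() query, and the constructor hunk below registers MluTracer when bit 2 of ProfilerOptions::trace_switch is set (kProfileMLUOptionBit, defined in the profiler.h change further down), mirroring the 1 << 2 the Python layer sets. A minimal C++ sketch of requesting CPU + MLU tracing this way, assuming Profiler::Create takes the ProfilerOptions shown here (it is the factory exposed through the pybind "create" binding further down):

#include "paddle/fluid/platform/profiler/profiler.h"

void MakeCpuMluProfiler() {
  paddle::platform::ProfilerOptions opts;
  // bit 0: cpu, bit 1: gpu, bit 2: mlu -- the GPU bit is deliberately left unset here.
  opts.trace_switch = (1u << paddle::platform::kProfileCPUOptionBit) |
                      (1u << paddle::platform::kProfileMLUOptionBit);
  auto profiler = paddle::platform::Profiler::Create(opts);
  // With bit 2 set, the constructor below emplaces MluTracer::GetInstance() into tracers_.
}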
@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) { if (trace_switch.test(kProfileGPUOptionBit)) { tracers_.emplace_back(&CudaTracer::GetInstance(), false); } + if (trace_switch.test(kProfileMLUOptionBit)) { + tracers_.emplace_back(&MluTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index d24ee504bc640..ea346a4fb748d 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -33,9 +33,10 @@ namespace platform { static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileGPUOptionBit = 1; +static constexpr uint32_t kProfileMLUOptionBit = 2; struct ProfilerOptions { - uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -45,6 +46,8 @@ class Profiler { static bool IsCuptiSupported(); + static bool IsCnpapiSupported(); + void Prepare(); void Start(); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 16ef62fb51555..6d398a26eda10 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -50,6 +50,8 @@ enum class TracerEventType { PythonOp = 13, // Used to mark python level userdefined PythonUserDefined = 14, + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15, // A flag to denote the number of current types NumTypes }; diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index cc85a178d14e5..5f2bc9dc90db9 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -52,6 +52,13 @@ class TraceEventCollector { return thread_names_; } + void ClearAll() { + thread_names_.clear(); + host_events_.clear(); + runtime_events_.clear(); + device_events_.clear(); + } + private: std::unordered_map thread_names_; std::list host_events_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7b63fdd6dd4cb..982bf7646125b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle. .def("create", &paddle::platform::Profiler::Create, py::return_value_policy::take_ownership) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) + .def("is_cnpapi_supported", + &paddle::platform::Profiler::IsCnpapiSupported) .def("prepare", [](paddle::platform::Profiler *profiler) { platform::EnableHostEventRecorder(); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 2fae583397a8e..3e60a82f1214a 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -52,16 +52,19 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU, GPU and MLU are supported currently. The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + + - **ProfilerTarget.MLU** : Profile events on MLU. 
""" CPU = 0 GPU = 1 + MLU = 2 def make_scheduler(*, @@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: """ if _Profiler.is_cupti_supported(): return [ProfilerTarget.CPU, ProfilerTarget.GPU] + if _Profiler.is_cnpapi_supported(): + return [ProfilerTarget.CPU, ProfilerTarget.MLU] return [ProfilerTarget.CPU] @@ -266,7 +271,7 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` , :ref:`ProfilerTarget.GPU ` and :ref:`ProfilerTarget.MLU ` . scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). @@ -407,6 +412,8 @@ def __init__( profileoption.trace_switch |= 1 if ProfilerTarget.GPU in self.targets: profileoption.trace_switch |= (1 << 1) + if ProfilerTarget.MLU in self.targets: + profileoption.trace_switch |= (1 << 2) wrap_optimizers() self.profiler = _Profiler.create(profileoption) if callable(scheduler): From 10114859e2a5cfcc480fcae88db03a739aed98b0 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:39:21 +0800 Subject: [PATCH 169/211] [MLU] add mlu activation kernels (#41751) --- paddle/fluid/operators/activation_op_mlu.cc | 138 ++++++++++++--- .../fluid/operators/fill_constant_op_mlu.cc | 16 +- paddle/fluid/operators/mean_op_mlu.cc | 3 +- .../operators/metrics/accuracy_op_mlu.cc | 8 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 23 ++- paddle/fluid/operators/mlu/mlu_baseop.h | 6 +- .../optimizers/merged_momentum_op_mlu.cc | 5 +- .../operators/optimizers/momentum_op_mlu.cc | 3 +- .../reduce_ops/reduce_mean_op_mlu.cc | 4 +- paddle/fluid/operators/scale_op_mlu.cc | 10 +- .../tests/unittests/mlu/test_gelu_op_mlu.py | 151 ++++++++++++++++ .../unittests/mlu/test_leaky_relu_op_mlu.py | 143 +++++++++++++++ .../tests/unittests/mlu/test_relu6_op_mlu.py | 164 ++++++++++++++++++ .../unittests/mlu/test_sigmoid_op_mlu.py | 65 +++++++ .../tests/unittests/mlu/test_tanh_op_mlu.py | 147 ++++++++++++++++ 15 files changed, 831 insertions(+), 55 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 43d662830c0c8..f66b75fd1f319 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -15,12 +15,8 @@ limitations under the Licnse. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/platform/device/mlu/device_context.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { @@ -38,20 +34,39 @@ class ActivationMLUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); MLUCnnlActivationDesc act_desc(act_mode, alpha); - MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(input->dtype())); - MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(output->dtype())); - - MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), - reinterpret_cast(input->data()), - output_desc.get(), - reinterpret_cast(output->data())); + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input), + output_desc.get(), GetBasePtr(output)); } }; +// For gelu, leaky_relu template -class ActivationGradMLUKernel : public framework::OpKernel { +class ActivationGradMLUKernelV1 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlActivationDesc act_desc(act_mode, alpha); + MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr, + dout_desc.get(), GetBasePtr(dout), x_desc.get(), + GetBasePtr(x), dx_desc.get(), GetBasePtr(dx)); + } +}; + +// For tanh, sigmoid +template +class ActivationGradMLUKernelV2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* out = ctx.Input("Out"); @@ -61,18 +76,35 @@ class ActivationGradMLUKernel : public framework::OpKernel { dx->mutable_data(ctx.GetPlace()); - MLUCnnlTensorDesc dout_desc(*dout, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(dout->dtype())); - MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(out->dtype())); - MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(dx->dtype())); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); MLUCnnlActivationDesc act_desc(act_mode, alpha); - MLUCnnl::ActiveGrad( - ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr, - dout_desc.get(), reinterpret_cast(dout->data()), - out_desc.get(), reinterpret_cast(out->data()), - dx_desc.get(), reinterpret_cast(dx->data())); + MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, out_desc.get(), + GetBasePtr(out), dout_desc.get(), GetBasePtr(dout), + nullptr, nullptr, dx_desc.get(), GetBasePtr(dx)); + } +}; + +// For relu, relu6 +template +class ActivationGradMLUKernelV3 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + float alpha = ctx.HasAttr("alpha") ? 
ctx.Attr("alpha") : 1.0f; + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlActivationDesc act_desc(act_mode, alpha); + MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr, + dout_desc.get(), GetBasePtr(dout), out_desc.get(), + GetBasePtr(out), dx_desc.get(), GetBasePtr(dx)); } }; @@ -81,10 +113,60 @@ class ActivationGradMLUKernel : public framework::OpKernel { namespace ops = paddle::operators; +// relu REGISTER_OP_MLU_KERNEL( relu, ops::ActivationMLUKernel, ops::ActivationMLUKernel); REGISTER_OP_MLU_KERNEL( - relu_grad, ops::ActivationGradMLUKernel, - ops::ActivationGradMLUKernel); + relu_grad, ops::ActivationGradMLUKernelV3, + ops::ActivationGradMLUKernelV3); + +// relu6 +REGISTER_OP_MLU_KERNEL( + relu6, ops::ActivationMLUKernel, + ops::ActivationMLUKernel); +REGISTER_OP_MLU_KERNEL( + relu6_grad, ops::ActivationGradMLUKernelV3, + ops::ActivationGradMLUKernelV3); + +// sigmoid +REGISTER_OP_MLU_KERNEL(sigmoid, + ops::ActivationMLUKernel, + ops::ActivationMLUKernel); +REGISTER_OP_MLU_KERNEL( + sigmoid_grad, + ops::ActivationGradMLUKernelV2, + ops::ActivationGradMLUKernelV2); + +// tanh +REGISTER_OP_MLU_KERNEL( + tanh, ops::ActivationMLUKernel, + ops::ActivationMLUKernel); +REGISTER_OP_MLU_KERNEL( + tanh_grad, ops::ActivationGradMLUKernelV2, + ops::ActivationGradMLUKernelV2); + +// gelu +REGISTER_OP_MLU_KERNEL( + gelu, ops::ActivationMLUKernel, + ops::ActivationMLUKernel); +REGISTER_OP_MLU_KERNEL( + gelu_grad, ops::ActivationGradMLUKernelV1, + ops::ActivationGradMLUKernelV1); + +// leaky_relu +REGISTER_OP_MLU_KERNEL( + leaky_relu, ops::ActivationMLUKernel, + ops::ActivationMLUKernel); +REGISTER_OP_MLU_KERNEL( + leaky_relu_grad, + ops::ActivationGradMLUKernelV1, + ops::ActivationGradMLUKernelV1); diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc index 10e7c72d158e6..f7463c5dd8821 100644 --- a/paddle/fluid/operators/fill_constant_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -51,6 +51,8 @@ class FillConstantMLUKernel : public framework::OpKernel { } } } + const T *value_data = &value; + cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST; if (ctx.HasInput("ValueTensor")) { auto *value_tensor = ctx.Input("ValueTensor"); PADDLE_ENFORCE_EQ( @@ -59,22 +61,18 @@ class FillConstantMLUKernel : public framework::OpKernel { "When use Tensor as value to set Tensor value in fill_cosntant, " "value input(ValueTensor) size must be 1, but get %d", value_tensor->numel())); - const T *tensor_data = value_tensor->data(); - framework::Tensor mlu_tensor; + value_data = value_tensor->data(); auto tmp_place = value_tensor->place(); if (platform::is_mlu_place(tmp_place)) { - framework::TensorCopySync(*value_tensor, platform::CPUPlace(), - &mlu_tensor); - tensor_data = mlu_tensor.data(); + pointer_mode = CNNL_POINTER_MODE_DEVICE; } - value = tensor_data[0]; } auto shape = GetShape(ctx); out_var->mutable_data(shape, ctx.GetPlace()); - MLUCnnlTensorDesc output_desc(*out_var, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(out_var->dtype())); - MLUCnnl::Fill(ctx, value, output_desc.get(), GetBasePtr(out_var)); + MLUCnnlTensorDesc output_desc(*out_var); + MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(), + GetBasePtr(out_var)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc index 1fed01194c1a6..1456e749b1343 100644 --- 
a/paddle/fluid/operators/mean_op_mlu.cc +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -95,7 +95,8 @@ class MeanMLUGradKernel : public framework::OpKernel { MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype())); auto value = static_cast(1.0 / static_cast(input_grad->numel())); - MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var)); + MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value, mean_var_desc.get(), + GetBasePtr(&mean_var)); // means mul output_grad MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY, diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 1ce02ff4525c9..26c31d82e36eb 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -136,15 +136,17 @@ class AccuracyMLUKernel : public framework::OpKernel { // [total] total->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_desc(*total); - MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples, total_desc.get(), + GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); - MLUCnnl::Fill(ctx, static_cast(num_samples), total_fp32_desc.get(), - GetBasePtr(&total_fp32)); + float num_samples_fp32 = static_cast(num_samples); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples_fp32, + total_fp32_desc.get(), GetBasePtr(&total_fp32)); // [accuracy] accuracy->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 1fdaa153e3c27..df091a7dc7535 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -208,8 +208,20 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() { MLUCnnlActivationDesc::MLUCnnlActivationDesc( const cnnlActivationMode_t act_mode, const float ceof) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor( - active_desc_, act_mode, CNNL_NOT_PROPAGATE_NAN, ceof)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4( + active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, ceof, 1.0f /*sliced_dim*/, + 1.67326319217681884765625 /*selu_alpha*/, + 1.05070102214813232421875 /*selu_lambda*/)); +} + +MLUCnnlActivationDesc::MLUCnnlActivationDesc( + const cnnlActivationMode_t act_mode, const float ceof, + const float sliced_dim, const float selu_alpha, const float selu_lambda) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4( + active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, ceof, sliced_dim, selu_alpha, selu_lambda)); } const cnnlActivationDescriptor_t MLUCnnlActivationDesc::get() const { @@ -541,12 +553,15 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } -/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value, +/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, + const cnnlPointerMode_t pointer_mode, + const void* value_ptr, const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - 
PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFill_v3(handle, pointer_mode, value_ptr, output_desc, output)); } /* static */ void MLUCnnl::QuantifyOffline( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index b55b10686e92e..64a99b2a6d273 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -218,6 +218,9 @@ class MLUCnnlActivationDesc { MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete; MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete; MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof); + MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof, + const float sliced_dim, const float selu_alpha, + const float selu_lambda); const cnnlActivationDescriptor_t get() const; ~MLUCnnlActivationDesc(); @@ -418,7 +421,8 @@ class MLUCnnl { const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output); - static void Fill(const ExecutionContext& ctx, float value, + static void Fill(const ExecutionContext& ctx, + const cnnlPointerMode_t pointer_mode, const void* value_ptr, const cnnlTensorDescriptor_t output_desc, void* output); static void LRN(const ExecutionContext& ctx, const int local_size, diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index e5399ee36ba7f..b84a2bc579d3e 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -69,7 +69,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { "the same Tensors.")); } - auto mu = ctx.Attr("mu"); + auto mu = static_cast(ctx.Attr("mu")); auto lrs = ctx.MultiInput("LearningRate"); if (lrs.size() != 1) { PADDLE_ENFORCE_EQ( @@ -114,7 +114,8 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); - MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(), + GetBasePtr(&mu_tensor)); for (size_t idx = 0; idx < n; ++idx) { RegularizationType regularization_flag = diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index 91e8aa643b981..71af14fd91c8c 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -52,7 +52,8 @@ class MLUMomentumOpKernel : public framework::OpKernel { Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); - MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(), + GetBasePtr(&mu_tensor)); Tensor regularized_grad; MLUCnnlTensorDesc param_desc(*param); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc index 45f4e43378f44..89e578dbdb6b7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -103,8 +103,8 @@ class ReduceMeanGradMLUKernel : public framework::OpKernel { ToCnnlDataType(input_grad->dtype())); auto value = static_cast(1.0 / 
static_cast(reduce_numel)); - MLUCnnl::Fill(context, value, input_grad_desc.get(), - GetBasePtr(input_grad)); + MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value, + input_grad_desc.get(), GetBasePtr(input_grad)); MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 5237e70e319ad..f9e313e64b1e1 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -27,7 +27,7 @@ class ScaleMLUKernel : public framework::OpKernel { auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); // cnnl require input, scale, bias with same type. And all in device side. - auto& scale = ctx.Attr("scale"); + auto scale = static_cast(ctx.Attr("scale")); framework::Tensor scale_tensor; if (ctx.HasInput("ScaleTensor")) { framework::Tensor float_scale_tensor = @@ -49,14 +49,16 @@ class ScaleMLUKernel : public framework::OpKernel { } else { scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc scale_desc(scale_tensor); - MLUCnnl::Fill(ctx, scale, scale_desc.get(), GetBasePtr(&scale_tensor)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(), + GetBasePtr(&scale_tensor)); } - auto& bias = ctx.Attr("bias"); + auto bias = static_cast(ctx.Attr("bias")); framework::Tensor bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc bias_desc(bias_tensor); - MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &bias, bias_desc.get(), + GetBasePtr(&bias_tensor)); auto* out_var = ctx.OutputVar("Out"); if (in_var->IsType() && in_var != out_var) { diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py new file mode 100644 index 0000000000000..c62d30d43c089 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
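The operator changes above all migrate MLUCnnl::Fill to the pointer-mode signature added in mlu_baseop. A condensed sketch of the two modes; the function, its arguments, and value_tensor are hypothetical stand-ins rather than code from this patch:

#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

// Sketch: the same Fill entry point serves host scalars and device-resident scalars.
void FillSketch(const framework::ExecutionContext& ctx, framework::Tensor* out,
                const framework::Tensor& value_tensor /* 1-element tensor on the MLU */) {
  MLUCnnlTensorDesc out_desc(*out);

  // Host-side scalar: pass its address with CNNL_POINTER_MODE_HOST.
  float value = 1.0f;
  MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));

  // Scalar already on the MLU (the fill_constant ValueTensor case): pass the device
  // pointer with CNNL_POINTER_MODE_DEVICE instead of copying the value back to host.
  MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_DEVICE, GetBasePtr(&value_tensor), out_desc.get(),
                GetBasePtr(out));
}

}  // namespace operators
}  // namespace paddle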
+ +from __future__ import print_function + +import numpy as np +from scipy import special +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +def np_gelu(x): + y = 0.5 * x * (1 + special.erf(x / np.sqrt(2))) + return y + + +class TestGelu(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "gelu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np_gelu(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.007) + + +class TestGeluFp16(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "gelu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np_gelu(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestGeluNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + + fc_1 = fluid.layers.fc(input=c, size=128) + fc_1_gelu = fluid.layers.gelu(fc_1) + prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + self.assertTrue(np.allclose(mlu_pred, cpu_pred, atol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, atol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py 
b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py new file mode 100644 index 0000000000000..ec2150fceb133 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +from test_activation_op import ref_leaky_relu +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestLeadyRelu(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "leaky_relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + + self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_inputs(self): + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + + def set_attrs(self): + self.attrs = {} + + def set_outputs(self): + alpha = 0.02 if 'alpha' not in self.attrs else self.attrs['alpha'] + out = ref_leaky_relu(self.inputs['X'], alpha) + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestLeadyReluFP16(TestLeadyRelu): + def init_dtype(self): + self.dtype = np.float16 + + +class TestLeadyRelu2(TestLeadyRelu): + def set_attrs(self): + self.attrs = {'alpha': 0.5} + + +class TestLeadyRelu3(TestLeadyRelu): + def set_attrs(self): + self.attrs = {'alpha': -0.5} + + +class TestLeakyReluNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + x_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + y = paddle.nn.functional.leaky_relu(x) + + fc_1 = fluid.layers.fc(input=y, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = 
exe.run(main_prog, + feed={"x": x_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py new file mode 100644 index 0000000000000..54b1afd036331 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py @@ -0,0 +1,164 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle.fluid as fluid +import paddle +from op_test import OpTest + +import numpy as np +import unittest +import sys +sys.path.append("..") + +paddle.enable_static() +SEED = 2021 + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class TestRelu6(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu6" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_dtype(self): + self.dtype = np.float32 + + +class TestRelu6Float16(TestRelu6): + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def set_attrs(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNeg(TestRelu6): + def setUp(self): + self.set_mlu() + self.op_type = "relu6" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-10, -1, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestRelu6Net(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = 
SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu6(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py new file mode 100644 index 0000000000000..f4c5612377e1c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +from paddle.fluid.tests.unittests.op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestMLUSigmoid(OpTest): + def setUp(self): + self.op_type = "sigmoid" + self.set_mlu() + self.init_dtype() + + np.random.seed(SEED) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = 1 / (1 + np.exp(-x)) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.01) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestMLUSigmoidFp16(TestMLUSigmoid): + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py new file mode 100644 index 0000000000000..a5aeeac0ffb9e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestTanh(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "tanh" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.tanh(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + + +class TestTanhFp16(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "tanh" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.tanh(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestTanhNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + d = paddle.tanh(c) + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() From 35acfeda36caada80464051043a3f86ae2b76779 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:57:42 +0800 Subject: [PATCH 170/211] Change cuDNN Conv 
kernel for auto tune feature (#41313) * change cudnn helper for auto-tune * Add FLAGS_use_autotune to set the global status of autotune and change the order of choosing algorithm. * Fix the bug in calculating and printing current step cache hit rate. * Improve the autotune cache and fix unittest. * Change the key from AlgorithmType to int64_t. * Fix unittest for cpu-only env. * change ChooseAlgoByWorkspace for heuristic mode Co-authored-by: Liu Yiqun --- paddle/fluid/eager/CMakeLists.txt | 2 +- paddle/fluid/imperative/CMakeLists.txt | 4 +- paddle/fluid/operators/conv_base_helper.h | 32 +- paddle/fluid/operators/conv_cudnn_helper.h | 727 ++++++++++-------- paddle/fluid/platform/flags.cc | 9 + paddle/fluid/pybind/pybind.cc | 6 +- paddle/phi/kernels/CMakeLists.txt | 14 +- paddle/phi/kernels/autotune/CMakeLists.txt | 9 +- paddle/phi/kernels/autotune/cache.cc | 37 + paddle/phi/kernels/autotune/cache.h | 96 ++- paddle/phi/kernels/autotune/cache_test.cc | 2 +- .../phi/kernels/autotune/switch_autotune.cc | 74 ++ paddle/phi/kernels/autotune/switch_autotune.h | 94 +-- .../tests/unittests/test_switch_autotune.py | 127 ++- 14 files changed, 731 insertions(+), 502 deletions(-) create mode 100644 paddle/phi/kernels/autotune/switch_autotune.cc diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index da326ff7d76d7..53ac895bfbccb 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(pylayer) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) add_dependencies(grad_tensor_holder eager_final_state_codegen) - cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) + cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) endif() cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 3d8a5ab21f00f..69cd45222cef4 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) -cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) -cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) +cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune) +cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune) cc_library(imperative_profiler SRCS profiler.cc DEPS flags) if(NOT WIN32) if(WITH_NCCL OR WITH_RCCL) diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index c664d1935fe2e..9e1a323fc9f3d 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/autotune/cache.h" namespace paddle { namespace operators { @@ -41,12 +42,22 @@ struct SearchAlgorithm {}; // As the container of searchAlgorithm::Find() result. template struct SearchResult { - public: + SearchResult() {} + explicit SearchResult(AlgoT a) : algo(a) {} + AlgoT algo = static_cast(0); float time = -1.f; size_t workspace_size = 0; }; +template +static std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + // As the container of conv relevant descriptors. template struct ConvArgsBase { @@ -68,6 +79,17 @@ struct ConvArgsBase { const framework::Tensor* o, const std::vector s, const std::vector p, const std::vector d, DataT dtype) : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} + + template + size_t GetCacheKey() const { + auto x_shape = phi::vectorize(x->dims()); + auto w_shape = phi::vectorize(w->dims()); + VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape + << ", strides=" << s << ", paddings=" << p << ", dilations=" << d; + return phi::autotune::ConvKey( + x_shape, w_shape, p, s, d, + paddle::experimental::CppTypeToDataType::Type()); + } }; static inline void GetNCDHW(const framework::DDim& dims, @@ -87,13 +109,5 @@ static inline void GetNCDHW(const framework::DDim& dims, } } -template -static std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 1311f812be118..419fb8a4ca703 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -17,6 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { @@ -67,20 +69,16 @@ static inline bool UseFixedWorkspace() { return FLAGS_conv_workspace_size_limit >= 0; } -static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) { - if (!UseFixedWorkspace()) { +static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { + if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); - int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) - << " MB, available_to_alloc=" << ToMegaBytes(availble) - << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size) - << " MB."; - return std::max(std::max(availble, cur_workspace_size), - reserved - allocated); + << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; + return std::max(availble, reserved - allocated); } else { return FLAGS_conv_workspace_size_limit * 1024 * 1024; } @@ -104,26 +102,44 @@ std::string GetPerfResultString(std::string prefix, return out.str(); } +// Choose an algorithm which has the minimize time cost and less memory. +// NOTE: perf_results is ordered by time. template void ChooseAlgoByWorkspace(const std::vector& perf_results, size_t workspace_limit, - SearchResult* algo_result) { + SearchResult* search_result) { + int best_algo_idx = -1; for (size_t i = 0; i < perf_results.size(); ++i) { auto result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && result.memory < workspace_limit) { - algo_result->algo = result.algo; - algo_result->time = result.time; - algo_result->workspace_size = result.memory; - VLOG(3) << " algo=" << result.algo << ", time=" << result.time - << " ms, memory=" << ToMegaBytes(result.memory) - << " MB (limit=" << ToMegaBytes(workspace_limit) - << " MB), status=" << result.status; - return; + if (best_algo_idx == -1) { + // The algorithm which has minimize time cost and need a workspace_size + // fitting the workspace_limit constraint. + best_algo_idx = i; + // Each perf_results[i].time is set to be -1 in heuristic search. + if (perf_results[best_algo_idx].time < 0) { + break; + } + } else { + float best_algo_time = perf_results[best_algo_idx].time; + if ((result.time - best_algo_time) / best_algo_time < 0.01) { + best_algo_idx = (result.memory < perf_results[best_algo_idx].memory) + ? 
i + : best_algo_idx; + break; + } + } } } - VLOG(3) << "Can not find an algorithm that requires memory < " - << ToMegaBytes(workspace_limit) << " MB"; + if (best_algo_idx != -1) { + search_result->algo = perf_results[best_algo_idx].algo; + search_result->time = perf_results[best_algo_idx].time; + search_result->workspace_size = perf_results[best_algo_idx].memory; + } else { + VLOG(3) << "Can not find an algorithm that requires memory < " + << ToMegaBytes(workspace_limit) << " MB"; + } } static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, @@ -151,6 +167,10 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, #endif } +// cuDNN convolution forward algorithm searcher, consisted of three searching +// modes, namely: deterministic, heuristic and exhaustive_search mode. +// As well as one workspace size acquirsition function with respect to +// the chosen alogrithm. template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionFwdAlgoPerf_t; @@ -162,90 +182,30 @@ struct SearchAlgorithm { const phi::GPUContext& ctx) { SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_FWD_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - result.workspace_size = perf_results[best_algo_idx].memory; - - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(3) << "Fallback to non-v7 method to find conv algorithm " - "becasue the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = static_cast(1); + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_FWD_ALGS); - size_t 
max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) - << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.odesc.desc(), const_cast(args.o->data()), - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(4) << GetPerfResultString( - "[Exhaustive Search] FwdAlgo Perf result", perf_results, - returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); + } else { + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } + } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search << ", deterministic=" << deterministic @@ -265,8 +225,95 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(static_cast(1)); + } + + // Heuristic search mode, calling the cudnnGetXxxAlgorithm. 
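For reference, the selection rule in ChooseAlgoByWorkspace can be reproduced as a small standalone sketch. The tuples below are hypothetical stand-ins for the cuDNN perf-result structs, and the list is assumed to be ordered by time, as the NOTE above states: the first candidate that fits the workspace limit wins, and a candidate that is less than 1% slower is preferred only when it needs a smaller workspace.

# Sketch only, not Paddle code. Each entry is (algo, time_ms, memory_bytes, ok),
# sorted by time_ms; heuristic results may carry time_ms == -1.
def choose_algo_by_workspace(perf_results, workspace_limit):
    best_idx = -1
    for i, (algo, time_ms, memory, ok) in enumerate(perf_results):
        if not ok or memory >= workspace_limit:
            continue
        if best_idx == -1:
            best_idx = i
            if time_ms < 0:  # no timing available, keep the first fit
                break
        else:
            best_time = perf_results[best_idx][1]
            # Nearly the same speed (< 1% slower): prefer the smaller workspace.
            if (time_ms - best_time) / best_time < 0.01:
                if memory < perf_results[best_idx][2]:
                    best_idx = i
                break
    return None if best_idx == -1 else perf_results[best_idx][0]

print(choose_algo_by_workspace(
    [("winograd", 1.000, 900, True), ("implicit_gemm", 1.005, 300, True)],
    workspace_limit=1000))  # -> implicit_gemm

Because the list is already sorted by time, the first fitting entry is the fastest feasible algorithm; the extra comparison only trades a sub-1% slowdown for a noticeably smaller workspace.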
+ static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; + + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(3) << "Fallback to non-v7 method to find conv algorithm " + "becasue the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &(result.algo))); +#endif + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + int returned_algo_count; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_results.data(), + workspace_ptr, max_workspace_size)); + }; + + auto workspace_handle = ctx.cudnn_workspace_handle(); + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] FwdAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + + return result; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { @@ -288,6 +335,12 @@ struct SearchAlgorithm { } }; +// cuDNN convolution backward data-algorithm searcher, consisting of three +// searching modes, namely: deterministic, heuristic, and exhaustive_search +// mode. Specially, there are 2 pattens of exhaustive search mode, one for +// HALF precision only, one for the rest. 
+// As well as one workspace size acquirsition function with +// respect to the chosen alogrithm. template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; @@ -299,102 +352,31 @@ struct SearchAlgorithm { const phi::GPUContext& ctx) { SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - -#if CUDNN_VERSION < 7500 - int stride_dim = args.x->dims().size() - 2; - bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, - [=](int n) { return n != 1; }); - if (blacklist && (perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - } -#endif - result.workspace_size = GetWorkspaceSize(args, result.algo); - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); - size_t max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) - << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardDataAlgorithmEx( - args.handle, 
args.wdesc.desc(), args.w->data(), - args.odesc.desc(), args.o->data(), - args.cdesc.desc(), args.idesc.desc(), - const_cast(args.x->data()), - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, - returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = + phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); + } else { + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } + } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search << ", deterministic=" << deterministic @@ -414,8 +396,106 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + } + + static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + +#if CUDNN_VERSION < 7500 + int stride_dim = args.x->dims().size() - 2; + bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, + [=](int n) { return n != 1; }); + if (blacklist && (perf_results[best_algo_idx].algo == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || + perf_results[best_algo_idx].algo == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } +#endif + result.workspace_size = GetWorkspaceSize(args, result.algo); + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + 
workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + int returned_algo_count; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, args.wdesc.desc(), args.w->data(), + args.odesc.desc(), args.o->data(), args.cdesc.desc(), + args.idesc.desc(), const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + perf_results.data(), workspace_ptr, max_workspace_size)); + }; + + auto workspace_handle = ctx.cudnn_workspace_handle(); + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + + return result; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { @@ -438,6 +518,10 @@ struct SearchAlgorithm { } }; +// cuDNN convution backward filter-algorithm searcher, consisted of three +// algorithm searching modes, namely: deterministic, heuristic, and +// exhaustive_search mode. As well as one workspace size acquirsition function +// with respect to the chosen alogrithm. 
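All three searchers share the same workspace bookkeeping: a byte limit taken either from the remaining GPU memory (dynamic workspace) or from FLAGS_conv_workspace_size_limit (fixed workspace), and a per-search maximum taken over the algorithms whose workspace still fits that limit. A rough sketch of both helpers follows; query_workspace_bytes is a hypothetical stand-in for the per-algorithm cudnnGet*WorkspaceSize call.

# Sketch of CalcWorkspaceLimitInBytes / GetMaxWorkspaceSize, not Paddle code.
def calc_workspace_limit_bytes(use_fixed_workspace, flag_limit_mb,
                               available, reserved, allocated):
    if not use_fixed_workspace:
        # Dynamic workspace: whatever can still be allocated or re-used.
        return max(available, reserved - allocated)
    return flag_limit_mb * 1024 * 1024

def get_max_workspace_size(algos, query_workspace_bytes, workspace_limit,
                           use_fixed_workspace):
    if use_fixed_workspace:
        return workspace_limit
    max_ws = 0
    for algo in algos:
        ws = query_workspace_bytes(algo)
        if ws is not None and ws <= workspace_limit:
            max_ws = max(max_ws, ws)
    return max_ws

print(calc_workspace_limit_bytes(True, 512, 0, 0, 0))            # 536870912
print(get_max_workspace_size([0, 1, 2], lambda a: (a + 1) * 1000,
                             2500, False))                       # 2000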
template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; @@ -450,113 +534,30 @@ struct SearchAlgorithm { platform::CUDAGraphCaptureModeGuard guard; SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - result.workspace_size = perf_results[best_algo_idx].memory; - - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - if (dtype != CUDNN_DATA_HALF) { - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - size_t max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(3) << "max_workspace_size=" - << ToMegaBytes(max_workspace_size) << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.odesc.desc(), args.o->data(), - args.cdesc.desc(), args.wdesc.desc(), - const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, - 
returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = + phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); } else { - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - SearchResult algo_result; - int actual_algos = 0; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - perf_results.size(), &actual_algos, - perf_results.data())); - perf_results.resize(actual_algos); - ChooseAlgo(perf_results, workspace_size_limit, &algo_result); - result.time = algo_result.time; - return algo_result.algo; - }); + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search @@ -578,8 +579,126 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + } + + static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; + + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + 
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + int returned_algo_count = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + auto workspace_handle = ctx.cudnn_workspace_handle(); + if (platform::CudnnDataType::type != CUDNN_DATA_HALF) { + size_t max_workspace_size = + GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.odesc.desc(), args.o->data(), args.cdesc.desc(), + args.wdesc.desc(), const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + perf_results.data(), workspace_ptr, max_workspace_size)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + } else { + int max_algos = GetAlgorithmMaxCount(args.handle); + std::vector perf_results(max_algos); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), perf_results.size(), + &returned_algo_count, perf_results.data())); + perf_results.resize(returned_algo_count); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + perf_results.size(), workspace_size_limit); + ChooseAlgo(perf_results, workspace_size_limit, &result); + } + + return result; + } + + static int GetAlgorithmMaxCount(cudnnHandle_t handle) { +#if CUDNN_VERSION_MIN(7, 0, 1) + int max_algos = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + handle, &max_algos); + if (status == gpuSuccess) { + VLOG(5) << "[BackwardFilter] max_algos: predefined=" + << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos; + return max_algos; + } +#endif + return kNUM_CUDNN_BWD_FILTER_ALGS; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { @@ -604,10 +723,6 @@ struct SearchAlgorithm { static void ChooseAlgo(const std::vector& perf_results, size_t workspace_limit, SearchResult* algo_result) { - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, - perf_results.size(), workspace_limit); - for (size_t i = 0; i != perf_results.size(); ++i) { const auto& result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 18b53563cd64e..a43eaa41cfe83 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -774,3 +774,12 @@ DEFINE_bool(enable_ins_parser_file, false, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl 
blocking wait"); #endif + +/** + * Autotune related FLAG + * Name: FLAGS_use_autotune + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + */ +PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 982bf7646125b..45fcd2fad98a8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4469,7 +4469,7 @@ All parameter, weight, gradient are variables in Paddle. return phi::autotune::AutoTuneStatus::Instance().DisableAutoTune(); }); - m.def("autotune_range", [](int64_t start, int64_t stop) { + m.def("set_autotune_range", [](int64_t start, int64_t stop) { return phi::autotune::AutoTuneStatus::Instance().SetAutoTuneRange(start, stop); }); @@ -4478,10 +4478,8 @@ All parameter, weight, gradient are variables in Paddle. [] { return phi::autotune::AutoTuneStatus::Instance().Update(); }); m.def("autotune_status", [] { - phi::autotune::AutoTuneCache::Instance().UpdateStatus(); py::dict res; - res["use_autotune"] = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + phi::autotune::AutoTuneCache::Instance().UpdateStatus(); res["step_id"] = phi::autotune::AutoTuneStatus::Instance().StepID(); res["cache_size"] = phi::autotune::AutoTuneCache::Instance().Size(); res["cache_hit_rate"] = diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 937024d450a36..eec83a1ed8130 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h # phi functors and functions called by kernels add_subdirectory(funcs) +# kernel autotune +add_subdirectory(autotune) + # phi depends all phi kernel targets set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor ) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,12 +30,16 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel +set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel) +set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) +foreach(src ${AUTOTUNE_KERNELS}) + kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) +endforeach() kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) @@ -75,6 +82,3 @@ add_subdirectory(selected_rows) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) # For strings kernels add_subdirectory(strings) - -# 5. kernel autotune -add_subdirectory(autotune) diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index f1702d883b9f0..63dc22459446f 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,11 +1,12 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) - hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) endif() cc_library(cache SRCS cache.cc DEPS boost) +cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags) cc_test(cache_test SRCS cache_test.cc DEPS gtest cache) diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index bf68e2010151b..ef2cbe633d496 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "paddle/phi/kernels/autotune/cache.h" +#include +#include "glog/logging.h" namespace phi { namespace autotune { @@ -32,5 +34,40 @@ size_t ConvKey(const std::vector& x_dims, static_cast(dtype)); } +std::string AlgorithmTypeString(int64_t algo_type) { + if (algo_type == static_cast(AlgorithmType::kConvForward)) { + return "conv_forward"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardData)) { + return "conv_backward_data"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardFilter)) { + return "conv_backward_filter"; + } + return std::to_string(algo_type); +} + +void AutoTuneCache::UpdateStatus() { + int64_t size = 0; + int64_t cache_hits = 0; + int64_t cache_misses = 0; + int name_width = 24; + std::cout.setf(std::ios::left); + for (auto& v : auto_tune_map_) { + VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } + total_size_ = size; + total_cache_hits_ = cache_hits; + total_cache_misses_ = cache_misses; +} + } // namespace autotune } // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index d492e7c151f91..37c5d134e8a61 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -13,11 +13,12 @@ // limitations under the License. #pragma once + #include #include +#include #include #include -#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -92,6 +93,13 @@ class AlgorithmsCache { return ret; } + void Clean() { + std::lock_guard lock(*cache_mutex_); + hash_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + void Set(size_t key, AlgorithmT algo) { std::lock_guard lock(*cache_mutex_); hash_[key] = algo; @@ -116,15 +124,22 @@ class AlgorithmsCache { private: std::unordered_map hash_; std::shared_ptr cache_mutex_; - int64_t cache_hits_ = 0; - int64_t cache_misses_ = 0; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; + +enum class AlgorithmType { + kConvForward = 1, + kConvBackwardData = 2, + kConvBackwardFilter = 3, + kAlgorithmCount = 4 }; // AlgorithmsConfigKey -> AlgorithmsID -using AlgorithmsConfigKeyMap = AlgorithmsCache; -// AlgorithmsType -> AlgorithmsCache -using AlgorithmsTypeMap = - std::unordered_map; +using AlgorithmsCacheMap = AlgorithmsCache; +// AlgorithmType -> AlgorithmsCache +using AlgorithmsTypeMap = std::unordered_map; class AutoTuneCache { public: @@ -133,42 +148,30 @@ class AutoTuneCache { return autotune_cache; } - AlgorithmsConfigKeyMap& RegisterOrGet(const std::string& algo_type) { - std::lock_guard lock(*autotune_cache_mutex_); - if (auto_tune_map_.find(algo_type) == auto_tune_map_.end()) { - AlgorithmsConfigKeyMap cache; - auto_tune_map_[algo_type] = cache; - } - return auto_tune_map_[algo_type]; + AlgorithmsCacheMap& Get(const AlgorithmType& algo_type) { + return auto_tune_map_[static_cast(algo_type)]; } - void Clean(float miss_rate) { - std::lock_guard lock(*autotune_cache_mutex_); - // Set a small tolerance to avoid performance degradation - // due to large cache size under dynamic shape. 
- if (miss_rate > 0.01) { - auto_tune_map_.clear(); - } + AlgorithmsCacheMap& GetConvForward() { + return Get(AlgorithmType::kConvForward); + } + + AlgorithmsCacheMap& GetConvBackwardData() { + return Get(AlgorithmType::kConvBackwardData); + } + + AlgorithmsCacheMap& GetConvBackwardFilter() { + return Get(AlgorithmType::kConvBackwardFilter); } - void UpdateStatus() { - int64_t size = 0; - int64_t cache_hits = 0; - int64_t cache_misses = 0; + void Clean() { for (auto& v : auto_tune_map_) { - VLOG(4) << "AlgoType: " << v.first << " Cache Size: " << v.second.Size() - << " Hits: " << v.second.CacheHits() - << " Misses: " << v.second.CacheMisses() - << " Hit Rate: " << v.second.CacheHitRate(); - size += v.second.Size(); - cache_hits += v.second.CacheHits(); - cache_misses += v.second.CacheMisses(); + v.second.Clean(); } - total_size_ = size; - total_cache_hits_ = cache_hits; - total_cache_misses_ = cache_misses; } + void UpdateStatus(); + // The number of total config cached int64_t Size() const { return total_size_; } @@ -183,17 +186,30 @@ class AutoTuneCache { total_cache_hit_rate = static_cast(total_cache_hits_) / static_cast(total_num_accesses); } - return total_cache_hit_rate; } private: - AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {} + AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) { + for (int i = 1; i < static_cast(AlgorithmType::kAlgorithmCount); ++i) { + Register(static_cast(i)); + } + } + + void Register(const AlgorithmType& algo_type) { + std::lock_guard lock(*autotune_cache_mutex_); + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + AlgorithmsCacheMap cache; + auto_tune_map_[key] = cache; + } + } + AlgorithmsTypeMap auto_tune_map_; std::shared_ptr autotune_cache_mutex_; - int64_t total_cache_hits_ = 0; - int64_t total_cache_misses_ = 0; - int64_t total_size_ = 0; + int64_t total_cache_hits_{0}; + int64_t total_cache_misses_{0}; + int64_t total_size_{0}; }; } // namespace autotune diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 92ba411624fc0..f99f8bfc8b821 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.RegisterOrGet("conv_fw"); + auto& cache = autotune_cache.GetConvForward(); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc new file mode 100644 index 0000000000000..6fda24ef3c860 --- /dev/null +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
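The cache layout used by the searchers is one AlgorithmsCache per AlgorithmType, keyed by the hashed convolution configuration and counting hits and misses so that UpdateStatus() can aggregate a hit rate. A simplified model follows; where exactly the counters are bumped inside Find is an assumption of this sketch.

# Simplified model of AutoTuneCache, not the real implementation (no locking,
# plain keys instead of hashed conv configurations).
class AlgoCache:
    def __init__(self):
        self.table, self.hits, self.misses = {}, 0, 0

    def find(self, key):
        found = key in self.table
        self.hits += int(found)
        self.misses += int(not found)
        return found

    def get(self, key):
        return self.table[key]

    def set(self, key, algo):
        self.table[key] = algo

caches = {"conv_forward": AlgoCache(),
          "conv_backward_data": AlgoCache(),
          "conv_backward_filter": AlgoCache()}

def hit_rate(cache):
    total = cache.hits + cache.misses
    return cache.hits / total if total else 0.0

fwd = caches["conv_forward"]
if not fwd.find(12345):              # first step: miss, search, then cache
    fwd.set(12345, "implicit_gemm")
print(fwd.find(12345), hit_rate(fwd))   # True 0.5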
+ +#include "paddle/phi/kernels/autotune/switch_autotune.h" + +#include "gflags/gflags.h" +#include "glog/logging.h" + +DECLARE_bool(use_autotune); + +namespace phi { +namespace autotune { + +void AutoTuneStatus::EnableAutoTune() { + FLAGS_use_autotune = true; + Init(); +} + +void AutoTuneStatus::DisableAutoTune() { + FLAGS_use_autotune = false; + Init(); +} + +void AutoTuneStatus::Update() { + current_steps_id_ += 1; + if (!FLAGS_use_autotune) { + return; + } + + // This fuction is called when each iter finished. + if (current_steps_id_ + 1 < start_step_id_) { + use_autotune_ = false; + } else if (current_steps_id_ + 1 >= start_step_id_ && + current_steps_id_ + 1 < stop_step_id_) { + use_autotune_ = true; + AutoTuneCache::Instance().UpdateStatus(); + step_hit_rates_.push_back(StepHitRate()); + VLOG(3) << "Step ID: " << current_steps_id_ + << ", Accumulative Cache Hit Rate: " + << static_cast(AutoTuneCache::Instance().CacheHitRate() * 100) + << "%, Cache Size: " << AutoTuneCache::Instance().Size() + << ", Current Step Hit Rate: " + << static_cast(StepHitRate() * 100) << "%"; + } else { + use_autotune_ = false; + // Set a small tolerance to avoid performance degradation + // due to large cache size under dynamic shape. + // TODO(limingshu): Currently works for conv op only, this + // method shall be opimized when more ops involved in. + // float miss_rate = static_cast(1) - RecentHitRate(); + // if (current_steps_id_ == stop_step_id_) { + // AutoTuneCache::Instance().Clean(miss_rate); + // } + if (VLOG_IS_ON(4)) { + AutoTuneCache::Instance().UpdateStatus(); + VLOG(4) << "Step ID: " << current_steps_id_ << ", Current Step Hit Rate: " + << static_cast(StepHitRate() * 100) << "%"; + } + } +} + +} // namespace autotune +} // namespace phi diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h index 2f9621ed2079e..1793940542d47 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.h +++ b/paddle/phi/kernels/autotune/switch_autotune.h @@ -13,10 +13,8 @@ // limitations under the License. #pragma once + #include -#include -#include -#include "glog/logging.h" #include "paddle/phi/kernels/autotune/cache.h" namespace phi { @@ -31,45 +29,11 @@ class AutoTuneStatus { bool UseAutoTune() { return use_autotune_; } - // EnableAutoTune and DisableAutoTune Should be used for debug only. - void EnableAutoTune() { - use_autotune_ = true; - Init(); - } - - void DisableAutoTune() { - use_autotune_ = false; - Init(); - } + // EnableAutoTune and DisableAutoTune should be used for debug only. 
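The gating in Update() reduces to a step window: tuning is active only while the 1-based step index lies in [start_step_id, stop_step_id), and outside that window the cached algorithms are simply reused. A compact sketch with the defaults declared in switch_autotune.h:

# Sketch of the step-window gating in AutoTuneStatus::Update(); the counter
# mirrors the C++ one, starting at -1 and incremented once per iteration.
class AutoTuneWindow:
    def __init__(self, start=1, stop=10):
        self.start, self.stop = start, stop
        self.current_step = -1
        self.use_autotune = False

    def update(self, flag_use_autotune=True):
        self.current_step += 1
        if not flag_use_autotune:
            return
        next_step = self.current_step + 1
        self.use_autotune = self.start <= next_step < self.stop

w = AutoTuneWindow(start=1, stop=2)   # same range as set_autotune_range(1, 2)
for _ in range(3):
    w.update()
    print(w.current_step, w.use_autotune)   # 0 True, 1 False, 2 False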
+ void EnableAutoTune(); + void DisableAutoTune(); - void Update() { - current_steps_id_ += 1; - - if (!use_autotune_ && !update_use_autotune_) { - return; - } - - if (current_steps_id_ < start_step_id_) { - use_autotune_ = false; - } else if (current_steps_id_ >= start_step_id_ && - current_steps_id_ < stop_step_id_) { - use_autotune_ = true; - AutoTuneCache::Instance().UpdateStatus(); - step_hit_rates_.push_back(StepHitRate()); - VLOG(3) << "Step ID " << current_steps_id_ - << ", Accumulative Cache Hit Rate: " - << AutoTuneCache::Instance().CacheHitRate() - << ", Cache Size: " << AutoTuneCache::Instance().Size() - << ", Current Step Hit Rate: " << StepHitRate(); - } else if (current_steps_id_ == stop_step_id_) { - use_autotune_ = false; - update_use_autotune_ = false; - // clean cache according miss rate - float miss_rate = static_cast(1) - RecentHitRate(); - AutoTuneCache::Instance().Clean(miss_rate); - VLOG(3) << "Recent Miss Rate: " << miss_rate; - } - } + void Update(); int64_t StepID() { return current_steps_id_; } @@ -84,19 +48,25 @@ class AutoTuneStatus { // Hit Rate of Current Step float StepHitRate() { - int64_t current_hits = AutoTuneCache::Instance().CacheHits(); - int64_t current_misses = AutoTuneCache::Instance().CacheMisses(); - int64_t step_hits_ = current_hits - previous_hits_; - int64_t step_misses_ = current_misses - previous_misses_; - float step_hit_rate = 0.; - int64_t step_num_accesses = step_hits_ + step_misses_; - if (step_num_accesses != 0) { - step_hit_rate = static_cast(step_hits_) / - static_cast(step_num_accesses); + static int64_t last_step_id = -2; + + if (last_step_id != current_steps_id_) { + int64_t current_hits = AutoTuneCache::Instance().CacheHits(); + int64_t current_misses = AutoTuneCache::Instance().CacheMisses(); + int64_t step_hits_ = current_hits - previous_hits_; + int64_t step_misses_ = current_misses - previous_misses_; + float step_hit_rate = 0.; + int64_t step_num_accesses = step_hits_ + step_misses_; + if (step_num_accesses != 0) { + step_hit_rate = static_cast(step_hits_) / + static_cast(step_num_accesses); + } + previous_hits_ = current_hits; + previous_misses_ = current_misses; + current_step_hit_rate_ = step_hit_rate; + last_step_id = current_steps_id_; } - previous_hits_ = current_hits; - previous_misses_ = current_misses; - return step_hit_rate; + return current_step_hit_rate_; } void SetAutoTuneRange(int64_t start, int64_t stop) { @@ -108,21 +78,21 @@ class AutoTuneStatus { AutoTuneStatus() = default; void Init() { - update_use_autotune_ = use_autotune_; + use_autotune_ = false; current_steps_id_ = -1; previous_hits_ = 0; previous_misses_ = 0; step_hit_rates_.clear(); - AutoTuneCache::Instance().Clean(1.0); + AutoTuneCache::Instance().Clean(); } - int64_t start_step_id_ = 0; - int64_t stop_step_id_ = 10; - int64_t current_steps_id_ = -1; - bool use_autotune_ = false; - bool update_use_autotune_ = false; - int64_t previous_hits_ = 0; - int64_t previous_misses_ = 0; + bool use_autotune_{false}; + int64_t start_step_id_{1}; + int64_t stop_step_id_{10}; + int64_t current_steps_id_{-1}; + int64_t previous_hits_{0}; + int64_t previous_misses_{0}; + float current_step_hit_rate_{0.f}; std::vector step_hit_rates_; }; diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 1c08811d4b95c..1775272aac69d 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -14,7 +14,7 @@ 
import paddle import unittest -import numpy +import numpy as np class SimpleNet(paddle.nn.Layer): @@ -27,6 +27,7 @@ def forward(self, image, label=None): def train_dygraph(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam(parameters=net.parameters()) @@ -36,6 +37,7 @@ def train_dygraph(net, data): def static_program(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam() @@ -43,60 +45,64 @@ def static_program(net, data): return loss -def set_flags(enable_autotune): - if paddle.is_compiled_with_cuda(): - if enable_autotune: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) - else: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) - - class TestAutoTune(unittest.TestCase): + def set_flags(self, enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + + def get_flags(self, name): + res = paddle.get_flags(name) + return res[name] + + def get_expected_res(self, step_id, enable_autotune): + expected_res = { + "step_id": step_id, + "cache_size": 0, + "cache_hit_rate": 0 + } + if paddle.is_compiled_with_cuda(): + # Total 3 * num_iters cache accesses, only iter 2 hits the cache. + if enable_autotune and step_id >= 1: + expected_res["cache_size"] = 3 + if enable_autotune and step_id == 2: + expected_res["cache_hit_rate"] = np.round( + float(3) / float(9), 5) + return expected_res + def test_autotune(self): paddle.fluid.core.disable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], False) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), False) paddle.fluid.core.enable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], True) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), True) def check_status(self, expected_res): status = paddle.fluid.core.autotune_status() for key in status.keys(): - self.assertEqual(status[key], expected_res[key]) + if key == "cache_hit_rate": + v = np.round(status[key], 5) + else: + v = status[key] + self.assertEqual(v, expected_res[key]) class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): - set_flags(enable_autotune) + self.set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) + paddle.fluid.core.set_autotune_range(1, 2) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) 
net = SimpleNet() for i in range(3): train_dygraph(net, x_var) - if i >= 1 and i < 2: - expected_res = { - "step_id": i, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) def func_enable_autotune(self): self.run_program(enable_autotune=True) @@ -118,60 +124,45 @@ def test_disable_autotune(self): class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() - set_flags(enable_autotune) - if enable_autotune: - paddle.fluid.core.enable_autotune() - else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) data_shape = [1, 1, 8, 8] - data = paddle.static.data(name='X', shape=data_shape, dtype='float32') - net = SimpleNet() - loss = static_program(net, data) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=data_shape, dtype='float32') + net = SimpleNet() + loss = static_program(net, data) place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda( ) else paddle.CPUPlace() exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - x = numpy.random.random(size=data_shape).astype('float32') + exe.run(startup_program) + x = np.random.random(size=data_shape).astype('float32') + + self.set_flags(enable_autotune) + if enable_autotune: + paddle.fluid.core.enable_autotune() + else: + paddle.fluid.core.disable_autotune() + paddle.fluid.core.set_autotune_range(1, 2) for i in range(3): - exe.run(feed={'X': x}, fetch_list=[loss]) + exe.run(program=main_program, feed={'X': x}, fetch_list=[loss]) status = paddle.fluid.core.autotune_status() - # In static mode, the startup_program will run at first. - # The expected step_id will be increased by 1. 
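The expected values produced by get_expected_res above follow from the test's own accounting: each iteration of the single-conv network issues three cache lookups (forward, backward data, backward filter), the tuned step fills three cache entries, and by step 2 only the three lookups of that step have hit out of nine total accesses. A quick check of that arithmetic under those assumptions:

import numpy as np

lookups_per_step = 3     # conv forward + backward data + backward filter
steps = 3                # the test runs three iterations
hits = 3                 # only the lookups of the last step find cached algos
total_accesses = lookups_per_step * steps
print(np.round(hits / total_accesses, 5))   # 0.33333, i.e. the 3/9 in the test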
- if i >= 0 and i < 1: - expected_res = { - "step_id": i + 1, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i + 1, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) paddle.disable_static() def func_enable_autotune(self): self.run_program(enable_autotune=True) def test_enable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_enable_autotune() self.func_enable_autotune() def func_disable_autotune(self): self.run_program(enable_autotune=False) def test_disable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_disable_autotune() self.func_disable_autotune() From 2eac4db81970d04d8448a4d1796a9266440f0ed2 Mon Sep 17 00:00:00 2001 From: zhangxiaoci Date: Fri, 15 Apr 2022 11:12:51 +0800 Subject: [PATCH 171/211] support KL2 multi-card training, refactor KL2 unittest, *test=kunlun (#41543) --- .../unittests/xpu/test_squeeze2_op_xpu.py | 123 +++--- .../unittests/xpu/test_unsqueeze2_op_xpu.py | 390 +++++++++--------- 2 files changed, 274 insertions(+), 239 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py index a6269f43daa89..705e7c4cb0fef 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py @@ -21,67 +21,86 @@ from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle paddle.enable_static() -# Correct: General. -class TestSqueezeOp(XPUOpTest): - def setUp(self): - self.op_type = "squeeze2" - self.use_xpu = True - self.use_mkldnn = False - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float32") - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): +class XPUTestSqueeze2Op(XPUOpTestWrapper): + def __init__(self): + self.op_name = "squeeze2" + self.use_dynamic_create_class = False + + class TestSqueeze2Op(XPUOpTest): + def setUp(self): + self.op_type = "squeeze2" + self.use_mkldnn = False + self.init_dtype() + self.init_test_case() + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype) + } + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype(self.dtype) + } + self.init_attrs() + + def init_dtype(self): + self.dtype = self.in_type + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, 2) + self.new_shape = (3, 40) + + def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place, no_check_set=['XShape']) - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): + def test_check_grad(self): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: There is mins axis. 
-class TestSqueezeOp1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = (0, -2) - self.new_shape = (20, 5) - - -# Correct: No axes input. -class TestSqueezeOp2(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = () - self.new_shape = (20, 5) - - -# Correct: Just part of axes be squeezed. -class TestSqueezeOp3(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (6, 5, 1, 4) - + if self.dtype in [np.float32, np.float64]: + self.check_grad_with_place(place, ['X'], 'Out') + elif self.dtype == np.bool_: + return + else: + user_defined_grad_outputs = np.random.random( + self.new_shape).astype(self.dtype) + self.check_grad_with_place( + place, ['X'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + # Correct: There is mins axis. + class TestSqueeze2Op1(TestSqueeze2Op): + def init_test_case(self): + self.ori_shape = (1, 20, 1, 5) + self.axes = (0, -2) + self.new_shape = (20, 5) + + # Correct: No axes input. + class TestSqueeze2Op2(TestSqueeze2Op): + def init_test_case(self): + self.ori_shape = (1, 20, 1, 5) + self.axes = () + self.new_shape = (20, 5) + + # Correct: Just part of axes be squeezed. + class TestSqueeze2Op3(TestSqueeze2Op): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, -1) + self.new_shape = (6, 5, 1, 4) + + +support_types = get_xpu_op_support_types("squeeze2") +for stype in support_types: + create_test_class(globals(), XPUTestSqueeze2Op, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py index 606053832eaba..f6c540d6c2c0a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py @@ -23,209 +23,225 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -# Correct: General. 
-class TestUnsqueezeOp(XPUOpTest): - def setUp(self): - self.init_test_case() - self.use_xpu = True - self.use_mkldnn = False - self.op_type = "unsqueeze2" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float32") - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): +class XPUTestUnsqueeze2Op(XPUOpTestWrapper): + def __init__(self): + self.op_name = "unsqueeze2" + self.use_dynamic_create_class = False + + class TestUnsqueeze2Op(XPUOpTest): + def setUp(self): + self.op_type = "unsqueeze2" + self.use_mkldnn = False + self.init_dtype() + self.init_test_case() + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype) + } + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype(self.dtype) + } + self.init_attrs() + + def init_dtype(self): + self.dtype = self.in_type + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place, no_check_set=['XShape']) - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): + def test_check_grad(self): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) - - def init_attrs(self): - self.attrs = {"axes": self.axes} - - -# Correct: Single input index. -class TestUnsqueezeOp1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1, ) - self.new_shape = (20, 5, 1) - - -# Correct: Mixed input axis. -class TestUnsqueezeOp2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - -# Correct: There is duplicated axis. -class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - -# Correct: Reversed axes. -class TestUnsqueezeOp4(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - - -# axes is a list(with tensor) -class TestUnsqueezeOp_AxesTensorList(XPUOpTest): - def setUp(self): - self.init_test_case() - self.use_xpu = True - self.use_mkldnn = False - self.op_type = "unsqueeze2" - - axes_tensor_list = [] - for index, ele in enumerate(self.axes): - axes_tensor_list.append(("axes" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - "X": np.random.random(self.ori_shape).astype("float32"), - "AxesTensorList": axes_tensor_list - } - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float32") - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): + if self.dtype in [np.float32, np.float64]: + self.check_grad_with_place(place, ['X'], 'Out') + elif self.dtype == np.bool_: + return + else: + user_defined_grad_outputs = np.random.random( + self.new_shape).astype(self.dtype) + self.check_grad_with_place( + place, ['X'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + # Correct: Single input index. 
+ class TestUnsqueeze2Op1(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + # Correct: Mixed input axis. + class TestUnsqueeze2Op2(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + # Correct: There is duplicated axis. + class TestUnsqueeze2Op3(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + # Correct: Reversed axes. + class TestUnsqueeze2Op4(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + # axes is a list(with tensor) + class TestUnsqueeze2Op_AxesTensorList(XPUOpTest): + def setUp(self): + self.op_type = "unsqueeze2" + self.use_mkldnn = False + self.init_dtype() + self.init_test_case() + + axes_tensor_list = [] + for index, ele in enumerate(self.axes): + axes_tensor_list.append(("axes" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype), + "AxesTensorList": axes_tensor_list + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype(self.dtype) + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place, no_check_set=['XShape']) - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): + def test_check_grad(self): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (1, 2) - self.new_shape = (20, 1, 1, 5) - - def init_attrs(self): - self.attrs = {} - - -class TestUnsqueezeOp1_AxesTensorList(TestUnsqueezeOp_AxesTensorList): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1, ) - self.new_shape = (20, 5, 1) - - -class TestUnsqueezeOp2_AxesTensorList(TestUnsqueezeOp_AxesTensorList): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - -class TestUnsqueezeOp3_AxesTensorList(TestUnsqueezeOp_AxesTensorList): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - -class TestUnsqueezeOp4_AxesTensorList(TestUnsqueezeOp_AxesTensorList): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - - -# axes is a Tensor -class TestUnsqueezeOp_AxesTensor(XPUOpTest): - def setUp(self): - self.init_test_case() - self.use_xpu = True - self.use_mkldnn = False - self.op_type = "unsqueeze2" - - self.inputs = { - "X": np.random.random(self.ori_shape).astype("float32"), - "AxesTensor": np.array(self.axes).astype("int32") - } - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float32") - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): + if self.dtype in [np.float32, np.float64]: + self.check_grad_with_place(place, ['X'], 'Out') + else: + return + + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (1, 2) + self.new_shape = (20, 1, 1, 5) + + def init_attrs(self): + self.attrs = {} + + class TestUnsqueeze2Op1_AxesTensorList(TestUnsqueeze2Op_AxesTensorList): + def 
init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + class TestUnsqueeze2Op2_AxesTensorList(TestUnsqueeze2Op_AxesTensorList): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + class TestUnsqueeze2Op3_AxesTensorList(TestUnsqueeze2Op_AxesTensorList): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + class TestUnsqueeze2Op4_AxesTensorList(TestUnsqueeze2Op_AxesTensorList): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + # axes is a Tensor + class TestUnsqueeze2Op_AxesTensor(XPUOpTest): + def setUp(self): + self.op_type = "unsqueeze2" + self.use_mkldnn = False + self.init_test_case() + self.init_dtype() + + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype), + "AxesTensor": np.array(self.axes).astype("int32") + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype(self.dtype) + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place, no_check_set=['XShape']) - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): + def test_check_grad(self): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (1, 2) - self.new_shape = (20, 1, 1, 5) - - def init_attrs(self): - self.attrs = {} - - -class TestUnsqueezeOp1_AxesTensor(TestUnsqueezeOp_AxesTensor): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1, ) - self.new_shape = (20, 5, 1) - - -class TestUnsqueezeOp2_AxesTensor(TestUnsqueezeOp_AxesTensor): - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) - - -class TestUnsqueezeOp3_AxesTensor(TestUnsqueezeOp_AxesTensor): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) - - -class TestUnsqueezeOp4_AxesTensor(TestUnsqueezeOp_AxesTensor): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) - + if self.dtype in [np.float32, np.float64]: + self.check_grad_with_place(place, ['X'], 'Out') + else: + return + + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (1, 2) + self.new_shape = (20, 1, 1, 5) + + def init_attrs(self): + self.attrs = {} + + class TestUnsqueeze2Op1_AxesTensor(TestUnsqueeze2Op_AxesTensor): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + class TestUnsqueeze2Op2_AxesTensor(TestUnsqueeze2Op_AxesTensor): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + class TestUnsqueeze2Op3_AxesTensor(TestUnsqueeze2Op_AxesTensor): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + class TestUnsqueeze2Op4_AxesTensor(TestUnsqueeze2Op_AxesTensor): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +support_types = get_xpu_op_support_types("unsqueeze2") +for stype in support_types: + create_test_class(globals(), XPUTestUnsqueeze2Op, stype) if 
__name__ == "__main__": unittest.main() From b9ee6a29358f59b35b81396f9f50113c123bc8d9 Mon Sep 17 00:00:00 2001 From: Asthestarsfalll <72954905+Asthestarsfalll@users.noreply.github.com> Date: Fri, 15 Apr 2022 11:13:08 +0800 Subject: [PATCH 172/211] =?UTF-8?q?=E3=80=90Hackathon=20No.25=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20nanquantile=20?= =?UTF-8?q?=E6=95=B0=E5=AD=A6=E8=AE=A1=E7=AE=97API=20(#41343)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/__init__.py | 2 + .../fluid/tests/unittests/test_quantile.py | 236 --------------- .../test_quantile_and_nanquantile.py | 268 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/stat.py | 251 +++++++++++----- 5 files changed, 455 insertions(+), 304 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_quantile.py create mode 100644 python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 63f16c4eb78f1..3578b9a1aaeea 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -329,6 +329,7 @@ from .tensor.stat import numel # noqa: F401 from .tensor.stat import median # noqa: F401 from .tensor.stat import quantile # noqa: F401 +from .tensor.stat import nanquantile # noqa: F401 from .device import get_cudnn_version # noqa: F401 from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 @@ -495,6 +496,7 @@ 'numel', 'median', 'quantile', + 'nanquantile', 'no_grad', 'set_grad_enabled', 'is_grad_enabled', diff --git a/python/paddle/fluid/tests/unittests/test_quantile.py b/python/paddle/fluid/tests/unittests/test_quantile.py deleted file mode 100644 index 936d1d3be3a19..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_quantile.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -import paddle - - -class TestQuantile(unittest.TestCase): - """ - This class is used for numerical precision testing. If there is - a corresponding numpy API, the precision comparison can be performed directly. - Otherwise, it needs to be verified by numpy implementated function. - """ - - def setUp(self): - np.random.seed(678) - self.input_data = np.random.rand(6, 7, 8, 9, 10) - - # Test correctness when q and axis are set. - def test_quantile_single_q(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0.5, axis=2) - np_res = np.quantile(self.input_data, q=0.5, axis=2) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness for default axis. 
- def test_quantile_with_no_axis(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0.35) - np_res = np.quantile(self.input_data, q=0.35) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness for multiple axis. - def test_quantile_with_multi_axis(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0.75, axis=[0, 2, 3]) - np_res = np.quantile(self.input_data, q=0.75, axis=[0, 2, 3]) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness when keepdim is set. - def test_quantile_with_keepdim(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0.35, axis=4, keepdim=True) - np_res = np.quantile(self.input_data, q=0.35, axis=4, keepdims=True) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness when all parameters are set. - def test_quantile_with_keepdim_and_multiple_axis(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0.1, axis=[1, 4], keepdim=True) - np_res = np.quantile(self.input_data, q=0.1, axis=[1, 4], keepdims=True) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness when q = 0. - def test_quantile_with_boundary_q(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=0, axis=3) - np_res = np.quantile(self.input_data, q=0, axis=3) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - # Test correctness when input includes NaN. - def test_quantile_include_NaN(self): - input_data = np.random.randn(2, 3, 4) - input_data[0, 1, 1] = np.nan - x = paddle.to_tensor(input_data) - paddle_res = paddle.quantile(x, q=0.35, axis=0) - self.assertTrue(paddle.isnan(paddle_res[1, 1])) - - -class TestQuantileMuitlpleQ(unittest.TestCase): - """ - This class is used to test multiple input of q. - """ - - def setUp(self): - np.random.seed(678) - self.input_data = np.random.rand(10, 3, 4, 5, 4) - - def test_quantile(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=[0.3, 0.44], axis=-2) - np_res = np.quantile(self.input_data, q=[0.3, 0.44], axis=-2) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - def test_quantile_multiple_axis(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile(x, q=[0.2, 0.67], axis=[1, -1]) - np_res = np.quantile(self.input_data, q=[0.2, 0.67], axis=[1, -1]) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - def test_quantile_multiple_axis_keepdim(self): - x = paddle.to_tensor(self.input_data) - paddle_res = paddle.quantile( - x, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdim=True) - np_res = np.quantile( - self.input_data, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdims=True) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - -class TestQuantileError(unittest.TestCase): - """ - This class is used to test that exceptions are thrown correctly. - Validity of all parameter values and types should be considered. 
- """ - - def setUp(self): - self.x = paddle.randn((2, 3, 4)) - - def test_errors(self): - # Test error when q > 1 - def test_q_range_error_1(): - paddle_res = paddle.quantile(self.x, q=1.5) - - self.assertRaises(ValueError, test_q_range_error_1) - - # Test error when q < 0 - def test_q_range_error_2(): - paddle_res = paddle.quantile(self.x, q=[0.2, -0.3]) - - self.assertRaises(ValueError, test_q_range_error_2) - - # Test error with no valid q - def test_q_range_error_3(): - paddle_res = paddle.quantile(self.x, q=[]) - - self.assertRaises(ValueError, test_q_range_error_3) - - # Test error when x is not Tensor - def test_x_type_error(): - x = [1, 3, 4] - paddle_res = paddle.quantile(x, q=0.9) - - self.assertRaises(TypeError, test_x_type_error) - - # Test error when scalar axis is not int - def test_axis_type_error_1(): - paddle_res = paddle.quantile(self.x, q=0.4, axis=0.4) - - self.assertRaises(ValueError, test_axis_type_error_1) - - # Test error when axis in List is not int - def test_axis_type_error_2(): - paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, 0.4]) - - self.assertRaises(ValueError, test_axis_type_error_2) - - # Test error when axis not in [-D, D) - def test_axis_value_error_1(): - paddle_res = paddle.quantile(self.x, q=0.4, axis=10) - - self.assertRaises(ValueError, test_axis_value_error_1) - - # Test error when axis not in [-D, D) - def test_axis_value_error_2(): - paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, -10]) - - self.assertRaises(ValueError, test_axis_value_error_2) - - # Test error with no valid axis - def test_axis_value_error_3(): - paddle_res = paddle.quantile(self.x, q=0.4, axis=[]) - - self.assertRaises(ValueError, test_axis_value_error_3) - - -class TestQuantileRuntime(unittest.TestCase): - """ - This class is used to test the API could run correctly with - different devices, different data types, and dygraph/static mode. 
- """ - - def setUp(self): - np.random.seed(678) - self.input_data = np.random.rand(6, 7, 8, 9, 10) - self.dtypes = ['float32', 'float64'] - self.devices = ['cpu'] - if paddle.device.is_compiled_with_cuda(): - self.devices.append('gpu') - - def test_dygraph(self): - paddle.disable_static() - for device in self.devices: - # Check different devices - paddle.set_device(device) - for dtype in self.dtypes: - # Check different dtypes - np_input_data = self.input_data.astype(dtype) - x = paddle.to_tensor(np_input_data, dtype=dtype) - paddle_res = paddle.quantile(x, q=0.5, axis=2) - np_res = np.quantile(np_input_data, q=0.5, axis=2) - self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) - - def test_static(self): - paddle.enable_static() - for device in self.devices: - x = paddle.static.data( - name="x", shape=self.input_data.shape, dtype=paddle.float32) - x_fp64 = paddle.static.data( - name="x_fp64", - shape=self.input_data.shape, - dtype=paddle.float64) - - results = paddle.quantile(x, q=0.5, axis=2) - np_input_data = self.input_data.astype('float32') - results_fp64 = paddle.quantile(x_fp64, q=0.5, axis=2) - np_input_data_fp64 = self.input_data.astype('float64') - - exe = paddle.static.Executor(device) - paddle_res, paddle_res_fp64 = exe.run( - paddle.static.default_main_program(), - feed={"x": np_input_data, - "x_fp64": np_input_data_fp64}, - fetch_list=[results, results_fp64]) - np_res = np.quantile(np_input_data, q=0.5, axis=2) - np_res_fp64 = np.quantile(np_input_data_fp64, q=0.5, axis=2) - self.assertTrue( - np.allclose(paddle_res, np_res) and np.allclose(paddle_res_fp64, - np_res_fp64)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py new file mode 100644 index 0000000000000..f0368cd2bc34f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle + +API_list = [(paddle.quantile, np.quantile), + (paddle.nanquantile, np.nanquantile)] + + +class TestQuantileAndNanquantile(unittest.TestCase): + """ + This class is used for numerical precision testing. If there is + a corresponding numpy API, the precision comparison can be performed directly. + Otherwise, it needs to be verified by numpy implementated function. + """ + + def setUp(self): + self.input_data = np.random.rand(4, 7, 6) + + # Test correctness when q and axis are set. + def test_single_q(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0.5, axis=2) + np_res = res_func(inp, q=0.5, axis=2) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 1, 2] = np.nan + + # Test correctness for default axis. 
+ def test_with_no_axis(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0.35) + np_res = res_func(inp, q=0.35) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 2, 1] = np.nan + inp[0, 1, 2] = np.nan + + # Test correctness for multiple axis. + def test_with_multi_axis(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0.75, axis=[0, 2]) + np_res = res_func(inp, q=0.75, axis=[0, 2]) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 5, 3] = np.nan + inp[0, 6, 2] = np.nan + + # Test correctness when keepdim is set. + def test_with_keepdim(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0.35, axis=2, keepdim=True) + np_res = res_func(inp, q=0.35, axis=2, keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 3, 4] = np.nan + + # Test correctness when all parameters are set. + def test_with_keepdim_and_multiple_axis(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0.1, axis=[1, 2], keepdim=True) + np_res = res_func(inp, q=0.1, axis=[1, 2], keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 6, 3] = np.nan + + # Test correctness when q = 0. + def test_with_boundary_q(self): + inp = self.input_data + for (func, res_func) in API_list: + x = paddle.to_tensor(inp) + paddle_res = func(x, q=0, axis=1) + np_res = res_func(inp, q=0, axis=1) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + inp[0, 2, 5] = np.nan + + # Test correctness when input includes NaN. + def test_quantile_include_NaN(self): + input_data = np.random.randn(2, 3, 4) + input_data[0, 1, 1] = np.nan + x = paddle.to_tensor(input_data) + paddle_res = paddle.quantile(x, q=0.35, axis=0) + np_res = np.quantile(input_data, q=0.35, axis=0) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res, equal_nan=True)) + + # Test correctness when input filled with NaN. + def test_nanquantile_all_NaN(self): + input_data = np.full(shape=[2, 3], fill_value=np.nan) + input_data[0, 2] = 0 + x = paddle.to_tensor(input_data) + paddle_res = paddle.nanquantile(x, q=0.35, axis=0) + np_res = np.nanquantile(input_data, q=0.35, axis=0) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res, equal_nan=True)) + + +class TestMuitlpleQ(unittest.TestCase): + """ + This class is used to test multiple input of q. 
+ """ + + def setUp(self): + self.input_data = np.random.rand(5, 3, 4) + + def test_quantile(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=[0.3, 0.44], axis=-2) + np_res = np.quantile(self.input_data, q=[0.3, 0.44], axis=-2) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_multiple_axis(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=[0.2, 0.67], axis=[1, -1]) + np_res = np.quantile(self.input_data, q=[0.2, 0.67], axis=[1, -1]) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_multiple_axis_keepdim(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile( + x, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdim=True) + np_res = np.quantile( + self.input_data, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + +class TestError(unittest.TestCase): + """ + This class is used to test that exceptions are thrown correctly. + Validity of all parameter values and types should be considered. + """ + + def setUp(self): + self.x = paddle.randn((2, 3, 4)) + + def test_errors(self): + # Test error when q > 1 + def test_q_range_error_1(): + paddle_res = paddle.quantile(self.x, q=1.5) + + self.assertRaises(ValueError, test_q_range_error_1) + + # Test error when q < 0 + def test_q_range_error_2(): + paddle_res = paddle.quantile(self.x, q=[0.2, -0.3]) + + self.assertRaises(ValueError, test_q_range_error_2) + + # Test error with no valid q + def test_q_range_error_3(): + paddle_res = paddle.quantile(self.x, q=[]) + + self.assertRaises(ValueError, test_q_range_error_3) + + # Test error when x is not Tensor + def test_x_type_error(): + x = [1, 3, 4] + paddle_res = paddle.quantile(x, q=0.9) + + self.assertRaises(TypeError, test_x_type_error) + + # Test error when scalar axis is not int + def test_axis_type_error_1(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=0.4) + + self.assertRaises(ValueError, test_axis_type_error_1) + + # Test error when axis in List is not int + def test_axis_type_error_2(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, 0.4]) + + self.assertRaises(ValueError, test_axis_type_error_2) + + # Test error when axis not in [-D, D) + def test_axis_value_error_1(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=10) + + self.assertRaises(ValueError, test_axis_value_error_1) + + # Test error when axis not in [-D, D) + def test_axis_value_error_2(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, -10]) + + self.assertRaises(ValueError, test_axis_value_error_2) + + # Test error with no valid axis + def test_axis_value_error_3(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[]) + + self.assertRaises(ValueError, test_axis_value_error_3) + + +class TestQuantileRuntime(unittest.TestCase): + """ + This class is used to test the API could run correctly with + different devices, different data types, and dygraph/static mode. 
+ """ + + def setUp(self): + self.input_data = np.random.rand(4, 7) + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + self.devices.append('gpu') + + def test_dygraph(self): + paddle.disable_static() + for (func, res_func) in API_list: + for device in self.devices: + # Check different devices + paddle.set_device(device) + for dtype in self.dtypes: + # Check different dtypes + np_input_data = self.input_data.astype(dtype) + x = paddle.to_tensor(np_input_data, dtype=dtype) + paddle_res = func(x, q=0.5, axis=1) + np_res = res_func(np_input_data, q=0.5, axis=1) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_static(self): + paddle.enable_static() + for (func, res_func) in API_list: + for device in self.devices: + x = paddle.static.data( + name="x", shape=self.input_data.shape, dtype=paddle.float32) + x_fp64 = paddle.static.data( + name="x_fp64", + shape=self.input_data.shape, + dtype=paddle.float64) + + results = func(x, q=0.5, axis=1) + np_input_data = self.input_data.astype('float32') + results_fp64 = func(x_fp64, q=0.5, axis=1) + np_input_data_fp64 = self.input_data.astype('float64') + + exe = paddle.static.Executor(device) + paddle_res, paddle_res_fp64 = exe.run( + paddle.static.default_main_program(), + feed={"x": np_input_data, + "x_fp64": np_input_data_fp64}, + fetch_list=[results, results_fp64]) + np_res = res_func(np_input_data, q=0.5, axis=1) + np_res_fp64 = res_func(np_input_data_fp64, q=0.5, axis=1) + self.assertTrue( + np.allclose(paddle_res, np_res) and + np.allclose(paddle_res_fp64, np_res_fp64)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 3c4647d4d6b68..5f0fb4336e014 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -262,6 +262,7 @@ from .stat import numel # noqa: F401 from .stat import median # noqa: F401 from .stat import quantile # noqa: F401 +from .stat import nanquantile # noqa: F401 from .to_string import set_printoptions # noqa: F401 @@ -445,6 +446,7 @@ 'numel', 'median', 'quantile', + 'nanquantile', 'is_complex', 'is_integer', 'rank', diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 9863abe1becbb..991b86fd47d16 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -342,13 +342,14 @@ def median(x, axis=None, keepdim=False, name=None): return out_tensor -def quantile(x, q, axis=None, keepdim=False): +def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): """ Compute the quantile of the input along the specified axis. + Args: Args: x (Tensor): The input Tensor, it's data type can be float32, float64. - q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, + q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, each q will be calculated and the first dimension of output is same to the number of ``q`` . axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . @@ -360,37 +361,28 @@ def quantile(x, q, axis=None, keepdim=False): the output Tensor is the same as ``x`` except in the reduced dimensions(it is of size 1 in this case). Otherwise, the shape of the output Tensor is squeezed in ``axis`` . Default is False. 
- name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + ignore_nan: (bool, optional): Whether to ignore NaN of input Tensor. + If ``ignore_nan`` is True, it will calculate nanquantile. + Otherwise it will calculate quantile. Default is False. Returns: - Tensor, results of quantile along ``axis`` of ``x``. If data type of ``x`` is float64, data type of results will be float64, otherwise data type will be float32. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.randn((2,3)) - #[[-1.28740597, 0.49533170, -1.00698614], - # [-1.11656201, -1.01010525, -2.23457789]]) - - y1 = paddle.quantile(x, q=0.5, axis=[0, 1]) - # y1 = -1.06333363 - - y2 = paddle.quantile(x, q=0.5, axis=1) - # y2 = [-1.00698614, -1.11656201] - - y3 = paddle.quantile(x, q=[0.3, 0.5], axis=1) - # y3 =[[-1.11915410, -1.56376839], - # [-1.00698614, -1.11656201]] - - y4 = paddle.quantile(x, q=0.8, axis=1, keepdim=True) - # y4 = [[-0.10559537], - # [-1.05268800]]) + Tensor, results of quantile along ``axis`` of ``x``. + In order to obtain higher precision, data type of results will be float64. """ + # Validate x if not isinstance(x, Variable): raise TypeError("input x should be a Tensor.") + + # Validate q + if isinstance(q, (int, float)): + q = [q] + elif isinstance(q, (list, tuple)): + if len(q) <= 0: + raise ValueError("q should not be empty") + else: + raise TypeError("Type of q should be int, float, list or tuple.") + + # Validate axis dims = len(x.shape) out_shape = list(x.shape) if axis is None: @@ -399,7 +391,7 @@ def quantile(x, q, axis=None, keepdim=False): out_shape = [1] * dims else: if isinstance(axis, list): - if (len(axis) <= 0): + if len(axis) <= 0: raise ValueError("axis should not be empty") axis_src, axis_dst = [], [] for axis_single in axis: @@ -424,54 +416,177 @@ def quantile(x, q, axis=None, keepdim=False): if axis < 0: axis += dims out_shape[axis] = 1 + + mask = x.isnan() + valid_counts = mask.logical_not().sum(axis=axis, + keepdim=True, + dtype='float64') + indices = [] - if isinstance(q, (int, float)): - if q < 0 or q > 1: + + for q_num in q: + if q_num < 0 or q_num > 1: raise ValueError("q should be in range [0, 1]") - indices.append(q * (x.shape[axis] - 1)) - elif isinstance(q, (list, tuple)): - if len(q) <= 0: - raise ValueError("q should not be empty") - for q_num in q: - if q_num < 0 or q_num > 1: - raise ValueError("q should be in range [0, 1]") - indices.append(q_num * (x.shape[axis] - 1)) - else: - raise TypeError("Type of q should be int, float, list or tuple.") + if paddle.in_dynamic_mode(): + q_num = paddle.to_tensor(q_num, dtype='float64') + if ignore_nan: + indices.append(q_num * (valid_counts - 1)) + else: + # TODO(Asthestarsfalll): Use paddle.index_fill instead of where + index = q_num * (valid_counts - 1) + last_index = x.shape[axis] - 1 + nums = paddle.full_like(index, fill_value=last_index) + index = paddle.where(mask.any(axis=axis, keepdim=True), nums, index) + indices.append(index) + sorted_tensor = paddle.sort(x, axis) - indices_tensor = paddle.assign(indices).astype(paddle.float32) - indices_below = paddle.floor(indices_tensor).astype(paddle.int32) - indices_upper = paddle.ceil(indices_tensor).astype(paddle.int32) - outputs = [] - def expand_dim(indices, sorted_tensor_shape, axis): - assert axis < len(list(sorted_tensor_shape)) - expanded_shape = [1] * len(list(sorted_tensor_shape)) - expanded_shape = tuple(expanded_shape) - indices = indices.reshape(expanded_shape) - return 
indices + outputs = [] # TODO(chenjianye): replace the for-loop to directly take elements. - for i in range(len(indices)): - if (indices_upper[i] != indices_below[i]): - tensor_below = paddle.take_along_axis( - sorted_tensor, - expand_dim(indices_below[i], sorted_tensor.shape, axis), axis) - tensor_upper = paddle.take_along_axis( - sorted_tensor, - expand_dim(indices_upper[i], sorted_tensor.shape, axis), axis) - weights = (indices[i] - indices_below[i]).astype(x.dtype) - out = paddle.lerp(tensor_below, tensor_upper, weights) - else: - out = paddle.take_along_axis( - sorted_tensor, - expand_dim(indices_below[i], sorted_tensor.shape, axis), axis) + for index in indices: + indices_below = paddle.floor(index).astype(paddle.int32) + indices_upper = paddle.ceil(index).astype(paddle.int32) + tensor_upper = paddle.take_along_axis( + sorted_tensor, indices_upper, axis=axis) + tensor_below = paddle.take_along_axis( + sorted_tensor, indices_below, axis=axis) + weights = (index - indices_below.astype('float64')) + out = paddle.lerp( + tensor_below.astype('float64'), + tensor_upper.astype('float64'), weights) if not keepdim: out = paddle.squeeze(out, axis=axis) else: out = out.reshape(out_shape) outputs.append(out) - if isinstance(q, (list, tuple)): - return paddle.stack(outputs, 0) + + if len(q) > 1: + outputs = paddle.stack(outputs, 0) else: - return outputs[0] + outputs = outputs[0] + + return outputs + + +def quantile(x, q, axis=None, keepdim=False): + """ + Compute the quantile of the input along the specified axis. + If any values in a reduced row are NaN, then the quantiles for that reduction will be NaN. + + Args: + x (Tensor): The input Tensor, it's data type can be float32, float64. + q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, + each q will be calculated and the first dimension of output is same to the number of ``q`` . + axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is a list, quantile is calculated over all elements of given axises. + If ``axis`` is None, quantile is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of quantile along ``axis`` of ``x``. + In order to obtain higher precision, data type of results will be float64. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + x = np.arange(0, 8, dtype=np.float32).reshape(4, 2) + # [[0 1] + # [2 3] + # [4 5] + # [6 7]] + y = paddle.to_tensor(x) + y1 = paddle.quantile(y, q=0.5, axis=[0, 1]) + # 3.5 + + y2 = paddle.quantile(y, q=0.5, axis=1) + # [0.5 2.5 4.5 6.5] + + y3 = paddle.quantile(y, q=[0.3, 0.5], axis=0) + # [[1.8 2.8] + # [3. 4. 
]] + + x[0][0] = np.nan + y = paddle.to_tensor(x) + y4 = paddle.quantile(y, q=0.8, axis=1, keepdim=True) + # [[nan] + # [2.8] + # [4.8] + # [6.8]] + + """ + return _compute_quantile(x, q, axis=axis, keepdim=keepdim, ignore_nan=False) + + +def nanquantile(x, q, axis=None, keepdim=False): + """ + Compute the quantile of the input as if NaN values in input did not exist. + If all values in a reduced row are NaN, then the quantiles for that reduction will be NaN. + + Args: + x (Tensor): The input Tensor, it's data type can be float32, float64. + q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, + each q will be calculated and the first dimension of output is same to the number of ``q`` . + axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is a list, quantile is calculated over all elements of given axises. + If ``axis`` is None, quantile is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of quantile along ``axis`` of ``x``. + In order to obtain higher precision, data type of results will be float64. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + x = np.array( + [[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]], + dtype=np.float32 + ) + x[0][0] = np.nan + + x = paddle.to_tensor(x) + y1 = paddle.nanquantile(x, q=0.5, axis=[0, 1]) + # 5.0 + + y2 = paddle.nanquantile(x, q=0.5, axis=1) + # [2.5 7. ] + + y3 = paddle.nanquantile(x, q=[0.3, 0.5], axis=0) + # [[5. 2.5 3.5 4.5 5.5] + # [5. 
3.5 4.5 5.5 6.5] + + y4 = paddle.nanquantile(x, q=0.8, axis=1, keepdim=True) + # [[3.4] + # [8.2]] + + nan = paddle.full(shape=[2, 3], fill_value=np.nan) + y5 = paddle.nanquantile(nan, q=0.8, axis=1, keepdim=True) + # [[nan] + # [nan]] + + """ + return _compute_quantile(x, q, axis=axis, keepdim=keepdim, ignore_nan=True) From 840d2eb629cab71c73ffcb61003b66be22894a09 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Fri, 15 Apr 2022 11:15:34 +0800 Subject: [PATCH 173/211] support no_need_buffer in eager_fluid state (#41720) * support no_need_buffer in eager_fluid state * change no_need_buffer info from fwd_info to bwd_info * fix CI fail, gru_unit donnot use no_need_buffer * fix conflict between no_need_buffer and dispensable * use tensor.define in dispensable * solve conflict * solve conflict --- .../auto_code_generator/eager_generator.cc | 39 +++++++++++++++---- paddle/fluid/pybind/op_function_generator.h | 8 ++++ .../fluid/tests/unittests/test_inplace.py | 14 +++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 726e049e61150..307f8fae31597 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -217,6 +217,13 @@ class GradNodeGenerationInfo { return &grad_attrs_; } + const std::unordered_set& GetNoNeedBufferInputs() const { + return no_need_buffer_ins_; + } + std::unordered_set* GetMutableNoNeedBufferInputs() { + return &no_need_buffer_ins_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -229,6 +236,7 @@ class GradNodeGenerationInfo { std::vector>> grad_outs_; paddle::framework::AttributeMap grad_attrs_; + std::unordered_set no_need_buffer_ins_; }; public: @@ -958,6 +966,12 @@ static bool CollectGradInformationFromOpInfo( VLOG(6) << "GradOuts Name: " << it.first; } } + + auto& inferer = op_base.Info().NoNeedBufferVarsInferer(); + if (inferer && !special_no_need_buffer_op_set.count(op_type)) { + *(*op_base_infos)[index].GetMutableNoNeedBufferInputs() = + inferer(g_ins, g_outs, *op_base_grad_attrs); + } } /* ------ Slot Name Matching ---- */ @@ -1129,11 +1143,14 @@ static std::string GenerateGradNodeCreationContent( for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); + const std::unordered_set& no_need_buffer_ins = + iter.GetNoNeedBufferInputs(); for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; std::string full_reserved = "false"; if (fwd_outputs_name_pos_map.find(tensor_wrapper_name) == - fwd_outputs_name_pos_map.end()) { + fwd_outputs_name_pos_map.end() && + !no_need_buffer_ins.count(tensor_wrapper_name)) { full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2064,7 +2081,7 @@ static std::string GenerateSingleOpBase( } else { const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n" - " if(%s.initialized()) %s[\"%s\"] = " + " if(%s.defined()) %s[\"%s\"] = " " egr::EagerUtils::TrySyncToVars(%s);\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, @@ -2190,7 +2207,7 @@ static std::string GenerateSingleOpBase( grad_output_name, fwd_input_position); } else { const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.initialized()) %s[\"%s\"] = " + " if(%s.defined()) %s[\"%s\"] = 
" "{std::make_shared(egr::Controller::" "Instance().GenerateUniqueName())};\n"; generated_grad_function_body += paddle::string::Sprintf( @@ -2532,6 +2549,8 @@ static std::string GenerateGradNodeHeaderContents( for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); + const std::unordered_set& no_need_buffer_ins = + iter.GetNoNeedBufferInputs(); for (const auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; @@ -2540,6 +2559,10 @@ static std::string GenerateGradNodeHeaderContents( std::string tensor_wrapper_arg_str; std::string tensor_wrapper_body_str; std::string full_reserved_str = "full_reserved"; + std::string no_need_buffer_str = "false"; + if (no_need_buffer_ins.count(tensor_wrapper_name)) { + no_need_buffer_str = "true"; + } if (duplicable_tensors.count(tensor_wrapper_name)) { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const std::vector& %s"; @@ -2553,12 +2576,12 @@ static std::string GenerateGradNodeHeaderContents( const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = "for(const auto& eager_tensor : %s) {\n" - " %s.emplace_back( egr::TensorWrapper(eager_tensor, true " - "/*full_reserved*/) );\n" + " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s " + "/*full_reserved*/, %s) );\n" " }\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name); + struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "for (auto tw: %s) {\n" @@ -2579,10 +2602,10 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name, full_reserved_str); + tensor_wrapper_name, full_reserved_str, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; clear_tensor_wrappers_str += paddle::string::Sprintf( diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index f1e9c7e8f491b..7b128bd3b0e4d 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -276,3 +276,11 @@ std::set special_inplace_op_set = { "sum", // `sum` op has duplicate input "assign", // output of `assign` op is in `op_passing_outs_map` }; + +// NOTE(pangyoki): Special no_need_buffer ops that are not supported in +// temporary. +// sequence_conv op will raise error to get no_need_buffer info during +// compiling. 
+std::set special_no_need_buffer_op_set = { + "sequence_conv", +}; diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py index c54d3f02d43f0..99873eaa98870 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace.py +++ b/python/paddle/fluid/tests/unittests/test_inplace.py @@ -510,5 +510,19 @@ def test_continuously_inplace(self): self.func_test_continuously_inplace() +class TestGetitemBeforeInplace(unittest.TestCase): + def test_getitem_before_inplace(self): + with _test_eager_guard(): + a = paddle.ones(shape=[4, 2, 3], dtype="float32") + a.stop_gradient = False + b = a**2 + b[0] = 3 + # getitem has no_need_buffer input + c = b[0:2] + loss = c.sum() + b[1] = 2 + loss.backward() + + if __name__ == '__main__': unittest.main() From 1665594dbb0d17d7382e3668214a01132b9d7106 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Fri, 15 Apr 2022 11:38:24 +0800 Subject: [PATCH 174/211] Add API: Sparse Convolution3D (#41434) --- .../sparse/cpu/convolution_grad_kernel.cc | 6 +- .../kernels/sparse/cpu/convolution_kernel.cc | 13 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 349 +++++++--------- .../sparse/gpu/convolution_grad_kernel.cu | 6 +- .../kernels/sparse/gpu/convolution_kernel.cu | 57 ++- .../tests/unittests/test_sparse_conv_op.py | 76 +++- python/paddle/sparse/__init__.py | 6 +- python/paddle/sparse/functional/__init__.py | 4 +- python/paddle/sparse/functional/conv.py | 294 ++++++++++++++ python/paddle/sparse/layer/__init__.py | 2 + python/paddle/sparse/layer/conv.py | 380 ++++++++++++++++++ 11 files changed, 961 insertions(+), 232 deletions(-) create mode 100644 python/paddle/sparse/functional/conv.py create mode 100644 python/paddle/sparse/layer/conv.py diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 216685f0f7191..34337db558c8a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -139,16 +139,16 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; - T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * in_channels; T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; // call gemm: d_kernel = transpose(x) * out_grad // (in_channels, n) * (n, out_channels) blas.GEMM(CblasTrans, CblasNoTrans, - M, - N, K, + N, + M, static_cast(1), tmp_in_ptr, tmp_out_grad_ptr, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index c920f3c461287..d133464ab853c 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -50,16 +50,19 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, kernel_sizes[i] = kernel_dims[i]; } - phi::funcs::sparse::GetOutShape( - x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); - const int in_channels = kernel_dims[3]; - const int out_channels = kernel_dims[4]; - std::vector subm_paddings(paddings), subm_strides(strides); if (subm) { + // the out shape of subm_conv is same as input shape + // reset the padding=kernel_size/2 and strides=1 phi::funcs::sparse::ResetSubmKernelSizeAndStrides( kernel.dims(), &subm_paddings, 
&subm_strides); } + + phi::funcs::sparse::GetOutShape( + x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 1bceb767b6708..2396a5975de4e 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" @@ -143,35 +145,6 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, return new_end.first; } -template -__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, - const int n, - const int rulebook_len, - const int kernel_size, - T* rulebook_ptr, - int* counter_ptr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int cache_count[]; // kernel_size - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += gridDim.x * blockDim.x) { - int index = indexs[i]; - T kernel_index = rulebook_ptr[index]; - rulebook_ptr[index + rulebook_len] = -1; - rulebook_ptr[index + 2 * rulebook_len] = -1; - rulebook_ptr[index] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list @@ -221,6 +194,42 @@ __global__ void DistanceKernel(const T* start, const T* end, T* distance) { } } +template +__global__ void UpdateOutIndexAndCounterAfterLowerBound( + const IntT* x_indexs, + const IntT* bound_out, + const int rulebook_len, + const int kernel_size, + const int64_t non_zero_num, + IntT* rulebook_ptr, + IntT* out_indexs, + int* counter_ptr) { + extern __shared__ int cache_count[]; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { + int j = bound_out[i]; + if (j >= 0 && j < non_zero_num && out_indexs[i] == x_indexs[j]) { + out_indexs[i] = j; + } else { + // mask this position will be remove + int kernel_index = rulebook_ptr[i]; + rulebook_ptr[i + rulebook_len] = -1; + rulebook_ptr[i + 2 * rulebook_len] = -1; + rulebook_ptr[i] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + /** * @brief product rulebook * for input_i in x_indices: @@ -338,7 +347,6 @@ int ProductRuleBook(const Context& dev_ctx, SparseCooTensor* out, std::vector* h_counter, std::vector* h_offsets) { - // TODO(zhangkaihuo): use PD_VISIT_INTEGRAL_TYPES for secondary dispatch auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); 
const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); @@ -362,7 +370,6 @@ int ProductRuleBook(const Context& dev_ctx, Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); Dims4D d_strides(1, strides[2], strides[1], strides[0]); Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); - // 1. product rule book phi::funcs::SetConstant set_zero; set_zero(dev_ctx, counter_per_kernel, 0); @@ -408,8 +415,8 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemcpyDeviceToHost, #endif dev_ctx.stream()); - rulebook_len /= 3; dev_ctx.Wait(); + rulebook_len /= 3; if (subm) { // At present, hashtable is not used to map the input and output indexes. @@ -417,96 +424,41 @@ int ProductRuleBook(const Context& dev_ctx, // convolution, // and then the intermediate output index is subtracted from the input index // to obain the rulebook. - // get difference - IntT* A_key_ptr = rulebook_ptr + 2 * rulebook_len; - IntT* B_key_ptr = in_indexs.data(); - DenseTensorMeta val_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW); - DenseTensor A_val = phi::Empty(dev_ctx, std::move(val_meta)); - DenseTensor B_val = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - phi::IndexKernel>( - dev_ctx, &A_val, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, &B_val, kps::IdentityFunctor()); - DenseTensor key_result = phi::Empty( - dev_ctx, - DenseTensorMeta(indices_dtype, {rulebook_len + 1}, DataLayout::NCHW)); - DenseTensor val_result = phi::Empty(dev_ctx, std::move(val_meta)); - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - std::vector offsets(kernel_size, 0); - // TODO(zhangkaihuo): used unified memcpy interface - phi::backends::gpu::GpuMemcpyAsync(offsets.data(), - offsets_ptr, - kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - - thrust::pair end; - // Because set_diff does not support duplicate data, set_diff is performed - // separately for each segment of data. - // TODO(zhangkaihuo): Using hashtable here may get better performance, - // further tests ared needed. - for (int i = 0; i < kernel_size; i++) { - int start = offsets[i]; - int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; - IntT* key_result_start = (i == 0 ? key_result.data() : end.first); - int* val_result_start = i == 0 ? 
val_result.data() : end.second; - end = -#ifdef PADDLE_WITH_HIP - thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - A_key_ptr + start, - A_key_ptr + stop, - B_key_ptr, - B_key_ptr + x.nnz(), - A_val.data() + start, - B_val.data(), - key_result_start, - val_result_start); - } - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - key_result.data(), - end.first, - key_result.data() + rulebook_len); - IntT len = 0; - phi::backends::gpu::GpuMemcpyAsync(&len, - key_result.data() + rulebook_len, - sizeof(IntT), + // call lower_bound to get the real index of out_index + const IntT* in_indexs_ptr = in_indexs.data(); + IntT* out_indexs_ptr = rulebook_ptr + 2 * rulebook_len; + DenseTensor bound = phi::Empty( + dev_ctx, + DenseTensorMeta( + indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); + IntT* bound_ptr = bound.data(); #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), #else - cudaMemcpyDeviceToHost, + thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), #endif - dev_ctx.stream()); - dev_ctx.Wait(); - // set the diff value = -1, and update counter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); - SetFlagAndUpdateCounterKernel<<>>( - val_result.data(), - len, + in_indexs_ptr, + in_indexs_ptr + in_indexs.numel(), + out_indexs_ptr, + out_indexs_ptr + rulebook_len, + bound_ptr); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + + UpdateOutIndexAndCounterAfterLowerBound<<>>( + in_indexs_ptr, + bound.data(), rulebook_len, kernel_size, + x.nnz(), rulebook_ptr, + out_indexs_ptr, counter_ptr); + // remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), @@ -517,9 +469,9 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + 3 * rulebook_len, -1); DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, key_result.data() + rulebook_len); + rulebook_ptr, last, bound_ptr); phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - key_result.data() + rulebook_len, + bound_ptr, sizeof(IntT), #ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, @@ -540,102 +492,111 @@ int ProductRuleBook(const Context& dev_ctx, counter_ptr + kernel_size, offsets_ptr); -#ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], counter_ptr, kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP hipMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); #else - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), cudaMemcpyDeviceToHost, +#endif dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], offsets_ptr, kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else cudaMemcpyDeviceToHost, - dev_ctx.stream()); #endif + dev_ctx.stream()); + rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); - // 3. 
sorted or merge the out index - out_index->ResizeAndAllocate({static_cast(rulebook_len)}); - unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), - {static_cast(rulebook_len)}, - DataLayout::NCHW)); - int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - IntT* unique_key_ptr = unique_key.data(); - - IntT* new_end = - SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - &unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - DistanceKernel<<<1, 1>>>( - unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - IntT out_non_zero_num = 0; + if (!subm) { + // 3. sorted or merge the out index + out_index->ResizeAndAllocate({static_cast(rulebook_len)}); + unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); + DenseTensor unique_key = phi::Empty( + dev_ctx, + DenseTensorMeta( + indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); + int* out_index_ptr = out_index->data(); + int* unique_value_ptr = unique_value->data(); + IntT* unique_key_ptr = unique_key.data(); + + IntT* new_end = + SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + &unique_key, + unique_value); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + unique_key_ptr, + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); + IntT out_non_zero_num = 0; #ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - hipMemcpyDeviceToHost, - dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(IntT), + hipMemcpyDeviceToHost, + dev_ctx.stream()); #else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(IntT), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); #endif - dev_ctx.Wait(); + dev_ctx.Wait(); - // 5. update out_indices and rulebook by unique_value_ptr - const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - indices_dtype, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), - {out_non_zero_num, kernel_sizes[4]}, - x.non_zero_elements().layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - - IntT* out_indices_ptr = out_indices.data(); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel<<>>( - unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); + // 5. 
update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + indices_dtype, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta(x.dtype(), + {out_non_zero_num, kernel_sizes[4]}, + x.non_zero_elements().layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + IntT* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>( + unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + 2 * rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + } else { + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = + phi::Empty(dev_ctx, + DenseTensorMeta(x.dtype(), + {x.nnz(), kernel_sizes[4]}, + x.non_zero_elements().layout())); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + } return rulebook_len; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 6c37f759923c3..ed9579fcd5b67 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -171,16 +171,16 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; - T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * in_channels; T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; // call gemm: d_kernel = transpose(x) * out_grad // (in_channels, n) * (n, out_channels) blas.GEMM(CblasTrans, CblasNoTrans, - M, - N, K, + N, + M, static_cast(1), tmp_in_ptr, tmp_out_grad_ptr, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 83f19ce5785df..93da65dc0f7d8 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -45,8 +46,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, for (int i = 0; i < kernel_dims.size(); i++) { kernel_sizes[i] = kernel_dims[i]; } + + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + // the out shape of subm_conv is same as input shape + // reset the padding=kernel_size/2 and strides=1 + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); + } + phi::funcs::sparse::GetOutShape( - x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); + x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; std::vector offsets(kernel_size + 1), h_counter(kernel_size); @@ -64,11 +74,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - std::vector subm_paddings(paddings), subm_strides(strides); - if (subm) { - phi::funcs::sparse::ResetSubmKernelSizeAndStrides( - kernel.dims(), &subm_paddings, &subm_strides); - } int n = ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -147,18 +152,34 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - ScatterKernel<<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + if (subm) { + set_zero(dev_ctx, out_values, static_cast(0.0f)); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); + phi::funcs::ScatterCUDAKernel<<>>( + out_features_ptr, + rulebook_ptr + 2 * n, + out_values_ptr, + n, + out_channels, + false); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + ScatterKernel<<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } /** * x: (N, D, H, W, C) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index d5a61423e9c44..42f628c8fb1fd 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -40,14 +40,76 @@ def test_conv3d(self): correct_out_values = [[4], [10]] sparse_input = core.eager.sparse_coo_tensor(indices, values, dense_shape, False) - out = _C_ops.final_state_sparse_conv3d(sparse_input, dense_kernel, - paddings, dilations, strides, - 1, False) + out = paddle.sparse.functional.conv3d( + sparse_input, + dense_kernel, + bias=None, + stride=strides, + padding=paddings, + dilation=dilations, + groups=1, + data_format="NDHWC") out.backward(out) - #At present, only backward can be verified to work normally - #TODO(zhangkaihuo): compare the result with dense conv - print(sparse_input.grad.values()) assert np.array_equal(correct_out_values, out.values().numpy()) + def test_subm_conv3d(self): + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, 
dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, dense_shape, stop_gradient=True) + weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') + y = paddle.sparse.functional.subm_conv3d(sparse_x, weight) + assert np.array_equal(sparse_x.indices().numpy(), + y.indices().numpy()) + + def test_Conv3D(self): + with _test_eager_guard(): + #(4, non_zero_num), 4-D:(N, D, H, W) + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + #(non_zero_num, C) + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + correct_out_values = [[4], [10]] + sparse_input = paddle.sparse.sparse_coo_tensor(indices, values, + dense_shape, False) + + sparse_conv3d = paddle.sparse.Conv3D( + 1, 1, (1, 3, 3), data_format='NDHWC') + sparse_out = sparse_conv3d(sparse_input) + #test errors + with self.assertRaises(ValueError): + #Currently, only support data_format='NDHWC' + conv3d = paddle.sparse.SubmConv3D( + 1, 1, (1, 3, 3), data_format='NCDHW') + + def test_SubmConv3D(self): + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + correct_out_values = [[4], [10]] + sparse_input = paddle.sparse.sparse_coo_tensor(indices, values, + dense_shape, False) + + subm_conv3d = paddle.sparse.SubmConv3D( + 1, 1, (1, 3, 3), data_format='NDHWC') + # test extra_repr + print(subm_conv3d.extra_repr()) + + sparse_out = subm_conv3d(sparse_input) + # the output shape of subm_conv is same as input shape + assert np.array_equal(indices, sparse_out.indices().numpy()) -#TODO: Add more test case + #test errors + with self.assertRaises(ValueError): + #Currently, only support data_format='NDHWC' + conv3d = paddle.sparse.SubmConv3D( + 1, 1, (1, 3, 3), data_format='NCDHW') diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index aff9625469ef2..5e716d69379ed 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -15,5 +15,9 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor from .layer.activation import ReLU +from .layer.conv import Conv3D +from .layer.conv import SubmConv3D -__all__ = ['sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU'] +__all__ = [ + 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D' +] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/sparse/functional/__init__.py index f4c5b33a5a7ea..93c3ccda4a613 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/sparse/functional/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .activation import relu # noqa: F401 +from .conv import conv3d # noqa: F401 +from .conv import subm_conv3d # noqa: F401 -__all__ = ['relu'] +__all__ = ['relu', 'conv3d', 'subm_conv3d'] diff --git a/python/paddle/sparse/functional/conv.py b/python/paddle/sparse/functional/conv.py new file mode 100644 index 0000000000000..d8c0e5c914ccb --- /dev/null +++ b/python/paddle/sparse/functional/conv.py @@ -0,0 +1,294 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops, in_dynamic_mode +from ...fluid.layers.utils import convert_to_list +from paddle.nn.functional.conv import _update_padding_nd + + +def _conv3d(x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + subm=False, + data_format="NDHWC", + name=None): + assert in_dynamic_mode(), "Currently, only support dynamic mode" + assert bias == None, "Currently, sparse_conv3d does not support bias" + assert groups == 1, "Currently, only support groups=1" + + dims = 3 + + # Currently, only support 'NDHWC' + if data_format not in ["NDHWC"]: + raise ValueError("Attr(data_format) should be 'NDHWC'. Received " + "Attr(data_format): {}.".format(data_format)) + if len(x.shape) != 5: + raise ValueError( + "Input x should be 5D tensor, but received x with the shape of {}". + format(x.shape)) + + channel_last = (data_format == "NDHWC") + channel_dim = -1 if channel_last else 1 + if len(x.shape) != 5: + raise ValueError( + "Input x should be 5D tensor, but received x with the shape of {}". + format(x.shape)) + num_channels = x.shape[channel_dim] + if num_channels < 0: + raise ValueError( + "The channel dimension of the input({}) should be defined. " + "Received: {}.".format(x.shape, num_channels)) + + padding, padding_algorithm = _update_padding_nd(padding, channel_last, dims) + stride = convert_to_list(stride, dims, 'stride') + dilation = convert_to_list(dilation, dims, 'dilation') + op_type = "conv3d" + + return _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, + stride, groups, subm) + + +def conv3d(x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NDHWC", + name=None): + r""" + + The sparse convolution3d functional calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are multidimensional SparseCooTensors with a shape of + :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` + + Filter shape: :math:`(D_f, H_f, W_f, C_{in}, C_{out})` + + - Output: + Output shape: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` + + Where + + .. 
math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + type of input is float16 or float32 or float64. + weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], + where M is the number of filters(output channels), g is the number of groups, + kD, kH, kW are the filter's depth, height and width respectively. + bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. + stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NDHWC"`, `padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Default: padding = 0. + dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. + groups (int): The groups number of the Conv3D Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1. Currently, only support groups=1. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: + `[batch_size, input_depth, input_height, input_width, input_channels]`. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + A SparseCooTensor representing the conv3d, whose data type is the same with input. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') + y = paddle.sparse.functional.conv3d(sparse_x, weight) + print(y.shape) + # (1, 1, 1, 2, 1) + """ + return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, + data_format, name) + + +def subm_conv3d(x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NDHWC", + name=None): + r""" + + The sparse submanifold convolution3d functional calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are multidimensional SparseCooTensors with a shape of + :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = W \ast X + b + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. + * :math:`W`: Filter value, a tensor with DHWCM format. + * :math:`\\ast`: Submanifold Convolution operation, refer to the paper: https://arxiv.org/abs/1706.01307. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` + + Filter shape: :math:`(D_f, H_f, W_f, C_{in}, C_{out})` + + - Output: + Output shape: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + type of input is float16 or float32 or float64. + weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], + where M is the number of filters(output channels), g is the number of groups, + kD, kH, kW are the filter's depth, height and width respectively. + bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. + stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Default: padding = 0. + dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. + groups (int): The groups number of the Conv3D Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Currently, only support groups=1. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: + `[batch_size, input_depth, input_height, input_width, input_channels]`. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + A SparseCooTensor representing the conv3d, whose data type is + the same with input. + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') + y = paddle.sparse.functional.subm_conv3d(sparse_x, weight) + print(y.shape) + #(1, 1, 3, 4, 1) + """ + return _conv3d(x, weight, bias, stride, padding, dilation, groups, True, + data_format, name) diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index 66abce260b6f7..a0f9d068e677c 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .activation import ReLU +from .conv import Conv3D +from .conv import SubmConv3D __all__ = [] diff --git a/python/paddle/sparse/layer/conv.py b/python/paddle/sparse/layer/conv.py new file mode 100644 index 0000000000000..ff421a06a1344 --- /dev/null +++ b/python/paddle/sparse/layer/conv.py @@ -0,0 +1,380 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from .. import functional as F +from paddle.nn import Layer +from paddle.nn.initializer import Normal +from ..functional.conv import _update_padding_nd +from ...fluid.layers import utils + +__all__ = [] + + +class _Conv3D(Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + subm=False, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NDHWC"): + super(_Conv3D, self).__init__() + assert weight_attr is not False, "weight_attr should not be False in Conv." + self._param_attr = weight_attr + self._bias_attr = bias_attr + self._groups = groups + self._in_channels = in_channels + self._out_channels = out_channels + self._data_format = data_format + self._subm = subm + + assert padding_mode == 'zeros', "Currently, only support padding_mode='zeros'" + assert groups == 1, "Currently, only support groups=1" + + valid_format = {'NDHWC'} + if data_format not in valid_format: + raise ValueError( + "data_format must be one of {}, but got data_format='{}'". + format(valid_format, data_format)) + + channel_last = data_format == "NDHWC" + + dims = 3 + self._stride = utils.convert_to_list(stride, dims, 'stride') + self._dilation = utils.convert_to_list(dilation, dims, 'dilation') + self._kernel_size = utils.convert_to_list(kernel_size, dims, + 'kernel_size') + self._padding = padding + self._padding_mode = padding_mode + self._updated_padding, self._padding_algorithm = _update_padding_nd( + padding, channel_last, dims) + + # the sparse conv restricts the shape is [D, H, W, in_channels, out_channels] + filter_shape = self._kernel_size + [ + self._in_channels, self._out_channels + ] + + def _get_default_param_initializer(): + filter_elem_num = np.prod(self._kernel_size) * self._in_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std) + + self.weight = self.create_parameter( + shape=filter_shape, + attr=self._param_attr, + default_initializer=_get_default_param_initializer()) + #self.bias = self.create_parameter( + # attr=self._bias_attr, shape=[self._out_channels], is_bias=True) + self.bias = None + + def forward(self, x): + out = F.conv._conv3d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + padding=self._updated_padding, + dilation=self._dilation, + groups=self._groups, + subm=self._subm, + data_format=self._data_format) + return out + + def extra_repr(self): + main_str = '{_in_channels}, {_out_channels}, kernel_size={_kernel_size}' + if self._stride != [1] * len(self._stride): + main_str += ', stride={_stride}' + if self._padding != 0: + main_str += ', padding={_padding}' + if self._padding_mode != 'zeros': + main_str += ', padding_mode={_padding_mode}' + if self._dilation != [1] * len(self._dilation): + main_str += ', dilation={_dilation}' + if self._groups != 1: + main_str += ', groups={_groups}' + main_str += ', data_format={_data_format}' + return main_str.format(**self.__dict__) + + +class Conv3D(_Conv3D): + r""" + **Sparse Convlution3d Layer** + The Sparse convolution3d layer calculates the output based on the input, filter + and 
strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are multidimensional SparseCooTensors with a shape of + :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. + For each input :math:`X`, the equation is: + + .. math:: + + Out = W \ast X + b + + In the above equation: + + * :math:`X`: Input value, a tensor with NDHWC format. + * :math:`W`: Filter value, a tensor with DHWCM format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. The default value is 1. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. + 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv3D Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1, currently, only support groups=1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. 
+ If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCDHW" or "NDHWC". Currently, only support "NDHWC". + + Attribute: + + **weight** (Parameter): the learnable weights of filters of this layer. + + **bias** (Parameter): the learnable bias of this layer. + + Shape: + + - x: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` + + - weight: :math:`(K_{d}, K_{h}, K_{w}, C_{in}, C_{out})` + + - bias: :math:`(C_{out})` + + - output: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` + + Where + + .. math:: + + D_{out}&= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + H_{out}&= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + W_{out}&= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + conv = paddle.sparse.Conv3D(1, 1, (1, 3, 3)) + y = conv(sparse_x) + print(y.shape) + # (1, 1, 1, 2, 1) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NDHWC"): + super(Conv3D, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + subm=False, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + +class SubmConv3D(_Conv3D): + r""" + **Sparse Submanifold Convolution3d Layer** + The Sparse submanifold convolution3d layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are multidimensional SparseCooTensors with a shape of + :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. If the bias attribute is provided, + bias is added to the output of the convolution. + For each input :math:`X`, the equation is: + + .. math:: + + Out = W \ast X + b + + In the above equation: + + * :math:`X`: Input value, a tensor with NDHWC format. + * :math:`W`: Filter value, a tensor with DHWCM format. + * :math:`\\ast`: Submanifold Convolution operation, refer to the paper: https://arxiv.org/abs/1706.01307. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel.
+ stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. The default value is 1. + padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means each spatial dimension(depth, height, width) is zero padded by size of `padding` + 3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...]. + 4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv3D Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCDHW" or "NDHWC". Currently, only support "NDHWC". + + Attribute: + + **weight** (Parameter): the learnable weights of filters of this layer. + + **bias** (Parameter): the learnable bias of this layer. + + Shape: + + - x: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` + + - weight: :math:`(K_{d}, K_{h}, K_{w}, C_{in}, C_{out})` + + - bias: :math:`(C_{out})` + + - output: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` + + Where + + ..
math:: + + D_{out}&= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + H_{out}&= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + W_{out}&= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + dense_shape = [1, 1, 3, 4, 1] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + subm_conv = paddle.sparse.SubmConv3D(1, 1, (1, 3, 3)) + y = subm_conv(sparse_x) + print(y.shape) + # (1, 1, 3, 4, 1) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NDHWC"): + super(SubmConv3D, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + subm=True, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) From e83e44c78d16e15fd944dd524ca0102b06ef86d1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 15 Apr 2022 12:50:29 +0800 Subject: [PATCH 175/211] polish tensor depreacted method warning (#41807) --- paddle/phi/api/lib/tensor.cc | 92 +++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 7eff846bbc1e3..1fb0803379894 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -67,13 +67,14 @@ Tensor::Tensor(std::shared_ptr tensor_impl) } Tensor::Tensor(const Place &place) { - LOG(WARNING) << "The Tensor(place) constructor is deprecated since version " - "2.3, and will be removed in version 2.4! Please use " - "`paddle::empty/full` method to create a new " - "Tensor instead. " - "Reason: A legal tensor cannot be constructed only based on " - "the `place`, and datatype, shape, layout, etc. is also " - "required."; + LOG_FIRST_N(WARNING, 1) + << "The Tensor(place) constructor is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. " + "Reason: A legal tensor cannot be constructed only based on " + "the `place`, and datatype, shape, layout, etc. is also " + "required."; DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); impl_ = std::move(std::make_shared( &alloc, @@ -82,13 +83,14 @@ Tensor::Tensor(const Place &place) { } Tensor::Tensor(const Place &place, const std::vector &shape) { - LOG(WARNING) << "The Tensor(place, shape) constructor is deprecated since " - "version 2.3, and will be removed in version 2.4! Please use " - "`paddle::empty/full` method to create a new " - "Tensor instead. " - "Reason: A legal tensor cannot be constructed only based on " - "the `place` and `shape`, and datatype, layout, etc. is also " - "required."; + LOG_FIRST_N(WARNING, 1) + << "The Tensor(place, shape) constructor is deprecated since " + "version 2.3, and will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor instead. 
" + "Reason: A legal tensor cannot be constructed only based on " + "the `place` and `shape`, and datatype, layout, etc. is also " + "required."; DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); impl_ = std::move(std::make_shared( &alloc, @@ -118,14 +120,15 @@ std::vector Tensor::shape() const { } void Tensor::reshape(const std::vector &shape) { - LOG(WARNING) << "The function of resetting the shape of the uninitialized " - "Tensor of the `reshape` method is deprecated since version " - "2.3, and will be removed in version 2.4, please use " - "`paddle::empty/full` method to create a new Tensor " - "instead. " - "reason: `reshape` means changing the tensor shape without " - "touching underlying data, this requires the total size of " - "the tensor to remain constant."; + LOG_FIRST_N(WARNING, 1) + << "The function of resetting the shape of the uninitialized " + "Tensor of the `reshape` method is deprecated since version " + "2.3, and will be removed in version 2.4, please use " + "`paddle::empty/full` method to create a new Tensor " + "instead. " + "reason: `reshape` means changing the tensor shape without " + "touching underlying data, this requires the total size of " + "the tensor to remain constant."; if (is_dense_tensor()) { static_cast(impl_.get())->Resize(phi::make_ddim(shape)); } else { @@ -175,15 +178,16 @@ bool Tensor::is_gpu_pinned() const { template T *Tensor::mutable_data() { - LOG(WARNING) << "Allocating memory through `mutable_data` method is " - "deprecated since version 2.3, and `mutable_data` method " - "will be removed in version 2.4! Please use " - "`paddle::empty/full` method to create a new " - "Tensor with allocated memory, and use data() method " - "to get the memory pointer of tensor instead. " - "Reason: When calling `mutable_data` to allocate memory, " - "the place, datatype, and data layout of tensor may be in " - "an illegal state."; + LOG_FIRST_N(WARNING, 1) + << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. " + "Reason: When calling `mutable_data` to allocate memory, " + "the place, datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { return static_cast(impl_.get()) ->mutable_data(place()); @@ -208,15 +212,16 @@ Tensor::mutable_data(); template T *Tensor::mutable_data(const Place &place) { - LOG(WARNING) << "Allocating memory through `mutable_data` method is " - "deprecated since version 2.3, and `mutable_data` method " - "will be removed in version 2.4! Please use " - "`paddle::empty/full` method to create a new " - "Tensor with allocated memory, and use data() method " - "to get the memory pointer of tensor instead. " - "Reason: When calling `mutable_data` to allocate memory, " - "the datatype, and data layout of tensor may be in " - "an illegal state."; + LOG_FIRST_N(WARNING, 1) + << "Allocating memory through `mutable_data` method is " + "deprecated since version 2.3, and `mutable_data` method " + "will be removed in version 2.4! Please use " + "`paddle::empty/full` method to create a new " + "Tensor with allocated memory, and use data() method " + "to get the memory pointer of tensor instead. 
" + "Reason: When calling `mutable_data` to allocate memory, " + "the datatype, and data layout of tensor may be in " + "an illegal state."; if (is_dense_tensor()) { return static_cast(impl_.get())->mutable_data(place); } @@ -328,9 +333,10 @@ bool Tensor::defined() const { return impl_ != nullptr; } bool Tensor::initialized() const { return defined() && impl_->initialized(); } bool Tensor::is_initialized() const { - LOG(WARNING) << "The `is_initialized` method is deprecated since version " - "2.3, and will be removed in version 2.4! " - "Please use `initialized` method instead."; + LOG_FIRST_N(WARNING, 1) + << "The `is_initialized` method is deprecated since version " + "2.3, and will be removed in version 2.4! " + "Please use `initialized` method instead."; return defined() && impl_->initialized(); } From d72244828ef14a306ff5ca363bc140c06f187daa Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Fri, 15 Apr 2022 13:33:14 +0800 Subject: [PATCH 176/211] [IPU] add mixed-precission support for ipu (#41733) * add mixed-precission support for ipu * restore cast_model_to_fp16 api * update UTs --- .../ir/ipu/optimizer_extract_pass.cc | 7 +- .../contrib/mixed_precision/fp16_utils.py | 14 +- .../fluid/tests/unittests/ipu/op_test_ipu.py | 54 ++- .../ipu/test_mixed_precision_inference_ipu.py | 140 +++++++ .../ipu/test_mixed_precision_training_ipu.py | 151 ++++++++ .../unittests/ipu/test_model_parallel_ipu.py | 357 ++++++++++++++++++ .../unittests/ipu/test_weight_decay_ipu.py | 118 ++++++ 7 files changed, 827 insertions(+), 14 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 7cdb7a8854aad..7c517a50e9af4 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -30,9 +30,10 @@ std::set ignored_ops = { "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw - "squared_l2_norm" // gradient_clip_norm + "scale", // adamax + "assign", // adamw + "squared_l2_norm", // gradient_clip_norm + "cast", // mix-precision support }; const bool startswith(const std::string& str, const std::string& pre) { diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index e3e5bc4f32703..760e9ceb9ea2f 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -191,7 +191,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): attrs={ "in_dtype": in_var.dtype, "out_dtype": out_var.dtype, - "op_device": op_device + "op_device": op_device, + "op_role": op.attr("op_role"), }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) @@ -241,7 +242,8 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name, attrs={ "in_dtype": target_var.dtype, "out_dtype": cast_var.dtype, - "op_device": op.attr("op_device") + "op_device": op.attr("op_device"), + "op_role": op.attr("op_role"), }) num_cast_ops += 1 op_var_rename_map[block.idx][target_var.name] = cast_var.name @@ -415,7 +417,9 @@ def 
cast_model_to_fp16(program, amp_lists=None, use_fp16_guard=True): keep_fp32_ops.add(op) continue # processed below for in_name in op.input_names: - if _keep_fp32_input(op, in_name): + # for ipu, all inputs must be converted to fp16 + if not core.is_compiled_with_ipu() and _keep_fp32_input( + op, in_name): continue for in_var_name in op.input(in_name): in_var = None @@ -443,7 +447,9 @@ def cast_model_to_fp16(program, amp_lists=None, use_fp16_guard=True): format(op.type, in_var_name, in_var.dtype)) for out_name in op.output_names: - if _keep_fp32_output(op, out_name): + # for ipu, all outputs must be converted to fp16 + if not core.is_compiled_with_ipu() and _keep_fp32_output( + op, out_name): continue for out_var_name in op.output(out_name): out_var = None diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 790388f30ead9..26fd42be6cd27 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -16,7 +16,7 @@ import random import unittest import numpy as np -from enum import Enum +from enum import IntEnum import paddle import paddle.static @@ -33,17 +33,24 @@ } -class ExecutionMode(Enum): +class ExecutionModeFull(IntEnum): + # Run fp32 model on cpu CPU_FP32 = 1 + # Run fp32 model on ipu IPU_FP32 = 2 - # enable_fp16 through ipu_strategy.enable_fp16 + # Convert model to fp16 using popart transform + # All parameters will be converted to fp16 + # TODO rename to IPU_FP16 IPU_POPART_FP16 = 3 + # Mix-precision mode, using `paddle.static.amp.fp16_guard()` to control the + # precision of each operator + IPU_MIXED_PRECISION = 4 - def __lt__(self, other): - return self.value < other.value - def __gt__(self, other): - return self.value > other.value +class ExecutionMode(IntEnum): + CPU_FP32 = ExecutionModeFull.CPU_FP32 + IPU_FP32 = ExecutionModeFull.IPU_FP32 + IPU_POPART_FP16 = ExecutionModeFull.IPU_POPART_FP16 def np_dtype_to_fluid_str(dtype: np.dtype) -> str: @@ -61,6 +68,12 @@ def setUpClass(cls): np.random.seed(cls.SEED) random.seed(cls.SEED) + # For ipu, most ops support fp16 + cls.amp_list = paddle.static.amp.CustomOpLists( + custom_black_list=[], custom_white_list=[]) + cls.amp_list.unsupported_list = {} + cls.amp_list.black_list = {} + # Enable paddle static graph mode paddle.enable_static() @@ -114,3 +127,30 @@ def check(self, outputs, check_shape=False): if check_shape: self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) + + ipu_mixed_precision = None + if ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): + ipu_mixed_precision = outputs[ + ExecutionModeFull.IPU_MIXED_PRECISION] + max_diff = np.abs( + ipu_mixed_precision.astype(np.float32) - cpu_fp32).max() + fp16_flag = np.allclose( + ipu_mixed_precision.astype(np.float32), + cpu_fp32, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(ipu_mixed_precision.shape == cpu_fp32.shape) + + if ExecutionMode.IPU_POPART_FP16 in outputs.keys( + ) and ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): + max_diff = np.abs(ipu_popart_fp16 - ipu_mixed_precision).max() + self.assertEqual(ipu_popart_fp16.all(), + ipu_mixed_precision.all(), + "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue( + ipu_popart_fp16.shape == ipu_mixed_precision.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py new file mode 100644 index 0000000000000..a70550c1df702 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +import paddle.nn.functional as F +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 10, 27, 27]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def dtype_check(self, program, to_fp16_var_names): + block = program.global_block() + assert len(to_fp16_var_names) > 0 + for var_name in to_fp16_var_names: + assert (block.var(var_name).dtype, paddle.float16) + + def _test_base(self, exec_mode): + generator = paddle.fluid.unique_name.UniqueNameGenerator() + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d( + input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + fetch_list = [loss.name] + + if exec_mode == ExecutionModeFull.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + # cast model to fp16 + if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + main_prog, self.amp_list) + self.dtype_check(main_prog, to_fp16_var_names) + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + # cast parameters to fp16 + if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: + paddle.static.amp.cast_parameters_to_fp16( + 
paddle.CPUPlace(), + main_prog, + to_fp16_var_names=to_fp16_var_names) + + if exec_mode != ExecutionModeFull.CPU_FP32: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=False) + if exec_mode == ExecutionModeFull.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionModeFull: + if mode == ExecutionModeFull.IPU_POPART_FP16: + continue + if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py new file mode 100644 index 0000000000000..224c0bddc22f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
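# Overview of the training test added below (a reader's summary; every name
# refers to code visible in these hunks): TestBase builds a small
# conv2d / batch_norm / max_pool2d network whose middle block sits inside
# paddle.static.amp.fp16_guard(). For the IPU_MIXED_PRECISION mode it casts the
# program with paddle.static.amp.cast_model_to_fp16(), runs the startup
# program, casts the weights with paddle.static.amp.cast_parameters_to_fp16(),
# and then compiles with paddle.static.IpuCompiledProgram. Losses from 20
# training steps are gathered for the CPU_FP32, IPU_FP32 and
# IPU_MIXED_PRECISION modes and compared by IPUOpTest.check(), with the fp16
# tolerances (atol_fp16 / rtol_fp16) applied to the mixed-precision result.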
+ +import unittest + +import numpy as np +import paddle +import paddle.static +import paddle.nn.functional as F +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 2e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_training(self): + self.is_training = True + self.epoch = 20 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 28, 28]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def dtype_check(self, program, to_fp16_var_names): + block = program.global_block() + assert len(to_fp16_var_names) > 0 + for var_name in to_fp16_var_names: + assert (block.var(var_name).dtype, paddle.float16) + + def _test_base(self, exec_mode): + generator = paddle.fluid.unique_name.UniqueNameGenerator() + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d( + input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + optimizer.minimize(loss, startup_prog) + fetch_list = [loss.name] + + # cast model to fp16 + if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + main_prog, self.amp_list) + self.dtype_check(main_prog, to_fp16_var_names) + + if exec_mode == ExecutionModeFull.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + # cast parameters to fp16 + if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + main_prog, + to_fp16_var_names=to_fp16_var_names) + + if exec_mode != ExecutionModeFull.CPU_FP32: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionModeFull.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + result = [] + for i in range(self.epoch): + out = exe.run(program, feed=feed, fetch_list=fetch_list) + result.append(out) + return np.array(result) + 
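    # A condensed sketch of the user-facing flow exercised by _test_base above
    # (illustrative only; it reuses the APIs called in this test and assumes a
    # prebuilt main_prog, startup_prog, amp_list, feed_list and fetch_list):
    #
    #   names = paddle.static.amp.cast_model_to_fp16(main_prog, amp_list)
    #   exe.run(startup_prog)
    #   paddle.static.amp.cast_parameters_to_fp16(
    #       paddle.CPUPlace(), main_prog, to_fp16_var_names=names)
    #   ipu_strategy = paddle.static.IpuStrategy()
    #   ipu_strategy.set_graph_config(is_training=True)
    #   program = paddle.static.IpuCompiledProgram(
    #       main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)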
+ def test_base(self): + output_dict = {} + for mode in ExecutionModeFull: + if mode == ExecutionModeFull.IPU_POPART_FP16: + continue + if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py new file mode 100644 index 0000000000000..792b88849faf3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -0,0 +1,357 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_attrs() + self.set_data_feed() + + def set_training(self): + self.is_training = False + self.epoch = 10 + + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = {"image": np_image} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + bs = self.ipu_bs if run_ipu else self.cpu_bs + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[bs, 3, 10, 10], dtype='float32') + with paddle.static.ipu_shard_guard(index=0): + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + with paddle.static.ipu_shard_guard(index=1): + conv2 = paddle.static.nn.conv2d( + conv1, num_filters=3, filter_size=3, bias_attr=False) + # should consider influence of bs + loss = paddle.mean(conv2) + + if self.is_training: + if self.optimizer == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + opt = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + opt = paddle.optimizer.Lamb(learning_rate=1e-2) + else: + raise Exception('optimizer must be sgd, adam or lamb') + + opt.minimize(loss) + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + executor.run(startup_prog) + + if run_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = 
paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2 * self.ipu_options['replicated_graph_count'], + is_training=self.is_training, + enable_manual_shard=True) + ipu_strategy.set_options(self.ipu_options) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_ipu if run_ipu else self.feed_cpu + epoch = self.epoch + if not run_ipu: + epoch *= self.ipu_options['replicated_graph_count'] + epoch *= self.ipu_options['batches_per_step'] + epoch *= self.ipu_options['accumulation_factor'] + epoch = epoch / (self.cpu_bs / self.ipu_bs) + result = [] + for i in range(int(epoch)): + loss_res = executor.run(program, feed=feed, fetch_list=[loss]) + result.append(loss_res) + return np.array(result).flatten() + + def test(self): + cpu_outputs = self._test_base(False) + ipu_outputs = self._test_base(True) + + self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol)) + + +class TestReplicaInference(TestBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + +class TestPipelineInference(TestBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 2, + "enable_pipelining": True, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = { + "image": np.tile(np_image, + [self.ipu_options['batches_per_step'], 1, 1, 1]) + } + + +class TestTrainBase(TestBase): + def set_training(self): + self.is_training = True + self.epoch = 10 + + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + self.optimizer = 'sgd' + + +class TestReplicaTrain(TestTrainBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + } + self.cpu_bs = 2 + self.ipu_bs = 1 + self.optimizer = 'sgd' + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + def test(self): + cpu_outputs = self._test_base(False) + ipu_outputs = self._test_base(True)[::2] + + self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol)) + + +class TestPipelineTrain(TestTrainBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 3, + "enable_pipelining": True, + "enable_gradient_accumulation": True, + "accumulation_factor": 3, + 
"enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 3 + self.ipu_bs = 1 + self.optimizer = 'sgd' + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])} + bps_acc = self.ipu_options['batches_per_step'] * self.ipu_options[ + 'accumulation_factor'] + self.feed_ipu = {"image": np.tile(np_image, [bps_acc, 1, 1, 1])} + + def test(self): + cpu_outputs = self._test_base(False) + ipu_outputs = self._test_base(True)[::3] + + self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol)) + + +class TestAdamTrain(TestTrainBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + self.optimizer = 'adam' + + +class TestAdamReplicaTrain(TestReplicaTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + } + self.cpu_bs = 2 + self.ipu_bs = 1 + self.optimizer = 'adam' + + +class TestAdamPipelineTrain(TestPipelineTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 3, + "enable_pipelining": True, + "enable_gradient_accumulation": True, + "accumulation_factor": 3, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 3 + self.ipu_bs = 1 + self.optimizer = 'adam' + + +class TestAdamRecomputationTrain(TestPipelineTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 3, + "enable_pipelining": True, + "enable_gradient_accumulation": True, + "accumulation_factor": 3, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + "auto_recomputation": 3, + } + self.cpu_bs = 3 + self.ipu_bs = 1 + self.optimizer = 'adam' + + +class TestLambTrain(TestAdamTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 1 + self.ipu_bs = 1 + self.optimizer = 'lamb' + + +class TestLambReplicaTrain(TestAdamReplicaTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + } + self.cpu_bs = 2 + self.ipu_bs = 1 + self.optimizer = 'lamb' + + +class TestLambPipelineTrain(TestAdamPipelineTrain): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 3, + "enable_pipelining": True, + "enable_gradient_accumulation": True, + "accumulation_factor": 3, + "enable_replicated_graphs": False, + "replicated_graph_count": 1, + } + self.cpu_bs = 3 + self.ipu_bs = 1 + self.optimizer = 'lamb' + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py new file mode 100644 index 0000000000000..5e652ce48334d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_atol(self): + self.atol = 1e-6 + + def set_data_feed(self): + self.feed = { + "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def set_attrs(self): + self.attrs = { + "weight_decay": 4.0, + "loss_scaling": 1.0, + } + + def _test_optimizer(self, run_ipu=True): + def exclude_fn(param): + return param.name.endswith('.w_0') + + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + np.random.seed(self.SEED) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + bias = paddle.fluid.layers.create_parameter( + shape=[1, 3, 10, 10], is_bias=True, dtype='float32') + add1 = image + bias + conv1 = paddle.static.nn.conv2d( + add1, num_filters=3, filter_size=3, bias_attr=False) + + loss = paddle.mean(conv1) + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, + lamb_weight_decay=self.attrs['weight_decay'], + exclude_from_weight_decay_fn=exclude_fn) + opt.minimize(loss) + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + paddle.static.save(main_prog, "weight_decay") + + if run_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + ipu_strategy.set_options({ + 'loss_scaling': self.attrs["loss_scaling"] + }) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile(feed_list, + fetch_list) + else: + program = main_prog + + result = [] + for epoch in range(100): + loss_res = exe.run(program, feed=self.feed, fetch_list=[loss]) + result.append(loss_res) + + return np.array(result) + + def test(self): + # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) + ipu_loss = self._test_optimizer(True).flatten() + cpu_loss = self._test_optimizer(False).flatten() + + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=self.atol)) + + +if __name__ == "__main__": + unittest.main() From ef6ff4ef0554efa2d480151109ef7ebef24ed496 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Fri, 15 Apr 2022 13:39:14 +0800 Subject: [PATCH 177/211] 
[XPUPS]fix hashtable_kernel.kps (#41790)

* refactor heter comm kernel
* update calc_shard_offset. test=develop
* update xpu kernel. test=develop
* update args of calc_shard_offset
* remove customGradMerger
* update optimizer kernel
* add optimizer kernel. test=develop
* fix kunlun not support size_t. test=develop
* update hashtable. test=develop
* template init. test=develop
* hashtable template init. test=develop
* fix hashtable_kernel. test=develop
* fix. 
test=develop Co-authored-by: WorgenZhang --- .../framework/fleet/heter_ps/hashtable.h | 2 +- .../fleet/heter_ps/hashtable_kernel.kps | 32 ++++---- .../framework/fleet/heter_ps/heter_comm.h | 3 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 6 +- .../fleet/heter_ps/heter_comm_kernel.kps | 78 ++++++++++--------- 5 files changed, 63 insertions(+), 58 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 6a51713d74c19..b821ccecf0a29 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -74,7 +74,7 @@ class XPUCacheArray { // ValType* find(const KeyType& key) { return NULL; } // bool insert(const KeyType& key, const ValType& val) { return true; } - int prefetch(const int dev_id, XPUStream stream = NULL) {} + int prefetch(const int dev_id, XPUStream stream = NULL) { return 0; } size_t size() { return size_; } private: diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index 9d2a20a361e31..55edf883271b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -38,7 +38,7 @@ namespace framework { #if defined(PADDLE_WITH_XPU_KP) -__device__ void update_lr(float* w, float* g2sum, float g, // NOLINT +__device__ void update_lr(float& w, float& g2sum, float g, // NOLINT float scale) { __local__ float local_learning_rate; __local__ float local_initial_g2sum; @@ -55,17 +55,17 @@ __device__ void update_lr(float* w, float* g2sum, float g, // NOLINT sqrt(local_initial_g2sum / (local_initial_g2sum + g2sum)); double scaled_grad = g / scale; - (*w) += scaled_grad * ratio; + w += scaled_grad * ratio; if (w < local_min_bound) w = local_min_bound; if (w > local_max_bound) w = local_max_bound; add_g2sum += scaled_grad * scaled_grad; - (*g2sum) += add_g2sum; + g2sum += add_g2sum; } -__device__ void update_mf(int n, float* w, float* g2sum, const float* g, +__device__ void update_mf(int n, float* w, float& g2sum, const float* g, float scale) { __local__ float local_mf_learning_rate; __local__ float local_mf_initial_g2sum; @@ -92,16 +92,16 @@ __device__ void update_mf(int n, float* w, float* g2sum, const float* g, add_g2sum += scaled_grad * scaled_grad; } - (*g2sum) += add_g2sum / n; + g2sum += add_g2sum / n; } __device__ float xpu_rand_uniform() { return 0.1; } template -__device__ void update_value(ValType* val, const GradType* grad) { // NOLINT - (*val).slot = (*grad).slot; - (*val).show += (*grad).show; - (*val).clk += (*grad).clk; +__device__ void update_value(ValType& val, const GradType& grad) { // NOLINT + val.slot = grad.slot; + val.show += grad.show; + val.clk += grad.clk; __local__ float local_nonclk_coeff; __local__ float local_clk_coeff; @@ -114,25 +114,23 @@ __device__ void update_value(ValType* val, const GradType* grad) { // NOLINT GM2LM(optimizer_config::mf_create_thresholds, &local_mf_create_thresholds, sizeof(float)); - val.delta_score += local_nonclk_coeff * ((*grad).show - (*grad).clk) + - local_clk_coeff * (*grad).clk; + val.delta_score += + local_nonclk_coeff * (grad.show - grad.clk) + local_clk_coeff * grad.clk; - update_lr(&(*val).lr, &(*val).lr_g2sum, (*grad).lr_g, (*grad).show); + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { if (local_mf_create_thresholds <= - local_nonclk_coeff * ((*val).show - (*val).clk) + - local_clk_coeff * (*val).clk) { + 
local_nonclk_coeff * (val.show - val.clk) + local_clk_coeff * val.clk) { val.mf_size = MF_DIM + 1; val.mf[0] = 0; - xpu_rand_uniform(&); for (int i = 0; i < MF_DIM; ++i) { - (*val).mf[i + 1] = (xpu_rand_uniform()) * local_mf_initial_range; + val.mf[i + 1] = (xpu_rand_uniform()) * local_mf_initial_range; } } } else { - update_mf(MF_DIM, &val.mf[1], &val.mf[0], (*grad).mf_g, (*grad).show); + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 817fd8d38ee06..419bd716eb304 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -92,6 +92,7 @@ class HeterComm { nccl_inter_comms_ = inter_comms; node_size_ = comm_size; } +#endif bool need_transfer(int send_id, int receive_id) { return ((send_id / 4 != receive_id / 4) && (send_id + 4) % 8 != receive_id); @@ -101,8 +102,6 @@ class HeterComm { int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } -#endif - void end_pass(); struct Node { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 3ced33b490d59..1e66b3cb25031 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -161,8 +161,8 @@ void HeterComm::destroy_storage(int start_index, nodes[i].key_storage); allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage); -#endif } +#endif } template @@ -804,9 +804,9 @@ void HeterComm::push_sparse(int dev_num, auto dst_place = platform::CPUPlace(); auto src_place = place; memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int)); + total_device * sizeof(int), stream); memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int)); + total_device * sizeof(int), stream); for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index c3e37d9eba34d..a1923a7f6019b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -236,55 +236,62 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, // xpu implementation of heter_comm_kernel.h template -void fill_idx(T* idx, long long len, const StreamType& stream) { +void HeterCommKernel::fill_idx(T* idx, long long len, + const StreamType& stream) { fill_idx_kernel<<<4, 64, stream>>>(idx, len); } template -void calc_shard_offset(T* idx, T* left, T* right, long long len, int total_devs, - const StreamType& stream) { +void HeterCommKernel::calc_shard_offset(T* idx, T* left, T* right, + long long len, int total_devs, + const StreamType& stream) { calc_shard_offset_kernel<<<4, 64, stream>>>(idx, left, right, len, total_devs); } template -void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, - int total_devs, const StreamType& stream) { +void HeterCommKernel::calc_shard_index(KeyType* d_keys, long long len, + T* shard_index, int total_devs, + const StreamType& stream) { calc_shard_index_kernel<<<4, 64, stream>>>( d_keys, len, shard_index, total_devs); } template -void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, - long long len, const StreamType& stream) { +void 
HeterCommKernel::fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, long long len, + const StreamType& stream) { fill_shard_key_kernel<<<4, 64, stream>>>(d_shard_keys, d_keys, idx, len); } template -void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, GradType* d_grads, T* idx, - long long len, const StreamType& stream) { +void HeterCommKernel::fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, + const StreamType& stream) { fill_shard_grads_kernel<<<4, 64, stream>>>( d_shard_keys, d_keys, d_shard_grads, d_grads, idx, len); } template -void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len, - const StreamType& stream) { +void HeterCommKernel::fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, const StreamType& stream) { fill_dvals_kernel<<<4, 64, stream>>>(d_shard_vals, d_vals, idx, len); } template -void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT - const KeyT* d_keys_in, // NOLINT - KeyT* d_keys_out, const ValueT* d_values_in, - ValueT* d_values_out, int num_items, int begin_bit, int end_bit, - StreamType stream, bool debug_synchronous) {} +void HeterCommKernel::sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, // NOLINT + KeyT* d_keys_out, const ValueT* d_values_in, + ValueT* d_values_out, int num_items, + int begin_bit, int end_bit, StreamType stream, + bool debug_synchronous) {} template (int* idx, long long len, - const XPUStream& stream); -template void calc_shard_offset(int* idx, int* left, int* right, - long long len, int total_devs, - const XPUStream& stream); -template void calc_shard_index( +template void HeterCommKernel::fill_idx( + int* idx, long long len, const XPUStream& stream); +template void HeterCommKernel::calc_shard_offset( + int* idx, int* left, int* right, long long len, int total_devs, + const XPUStream& stream); +template void HeterCommKernel::calc_shard_index( unsigned long* d_keys, long long len, int* shard_index, int total_devs, const XPUStream& stream); -template void fill_shard_key( +template void HeterCommKernel::fill_shard_key( unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, const XPUStream& stream); +template void HeterCommKernel::fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, XPUStream>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + const XPUStream& stream); template void -fill_shard_grads(unsigned long* d_shard_keys, unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, long long len, const XPUStream& stream); -template void fill_dvals( +HeterCommKernel::fill_dvals( paddle::framework::FeatureValue* d_shard_vals, paddle::framework::FeatureValue* d_vals, int* idx, long long len, const XPUStream& stream); -template void -sort_pairs( +template void HeterCommKernel::sort_pairs< + unsigned long, paddle::framework::FeaturePushValue, XPUStream>( void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT const unsigned long* d_keys_in, // NOLINT @@ -326,14 +334,14 @@ sort_pairs( paddle::framework::FeaturePushValue* d_values_out, int num_items, int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); -template void sort_pairs( 
+template void HeterCommKernel::sort_pairs( void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT const int* d_keys_in, // NOLINT int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items, int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); -template void reduce_by_key< +template void HeterCommKernel::reduce_by_key< unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*, paddle::framework::FeaturePushValue*, int*, XPUStream>( void* d_temp_storage, From a22b68b81fd798e67853fcf8fd7d7e06286fff00 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Fri, 15 Apr 2022 13:53:18 +0800 Subject: [PATCH 178/211] Add eager string tensor (#41039) * Add core.eager.StringTensor __init__ which pyarray args can be passed * Add the numpy method of core.eager.StringTensor * revert tensor.to_string modification * Add ToPyObject for core.eager.StringTensor * Add debug string for core.eager.StringTensor * Remove place args of core.eager.StringTensor temporarily * Fix check string_tensor error * remove dtype of core.eager.StringTensor * add core.eager.StringTensor unittest * remove pstring from VarDesc * Add InitStringTensorWithStringTensor * Remove to_string modification * Remove zero_copy arg from StringTensor creator --- paddle/fluid/pybind/eager.cc | 371 +++++++++++++++++- paddle/fluid/pybind/eager.h | 1 + paddle/fluid/pybind/eager_method.cc | 93 +++++ paddle/fluid/pybind/eager_properties.cc | 10 + paddle/fluid/pybind/eager_utils.cc | 19 +- paddle/fluid/pybind/pybind.cc | 1 + paddle/fluid/pybind/tensor_py.h | 56 +++ paddle/phi/api/include/tensor.h | 8 + paddle/phi/api/lib/tensor.cc | 4 + paddle/phi/core/string_tensor.cc | 29 ++ paddle/phi/core/string_tensor.h | 2 + .../unittests/test_egr_string_tensor_api.py | 105 +++++ 12 files changed, 693 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c529d121f3945..fa66e55e9c53a 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -36,12 +36,14 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/core/string_tensor.h" namespace paddle { namespace pybind { namespace py = ::pybind11; PyTypeObject* p_tensor_type; +PyTypeObject* p_string_tensor_type; // For StringTensor extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_framework_tensor_pytype; @@ -101,6 +103,25 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, } } +void EmptyStringTensorInitializer(TensorObject* self, const std::string& name, + const paddle::platform::Place& place, + const std::vector& dims = {}) { + auto ddims = phi::make_ddim(dims); + self->tensor.set_name(name); + // Note(zhoushunjie): Only support CPUPlace when create StringTensor + auto actual_place = platform::CPUPlace(); + // Allocate memory + const auto string_allocator = + std::make_unique(actual_place); + const auto alloc = string_allocator.get(); + std::shared_ptr string_tensor = + std::make_shared(alloc, phi::StringTensorMeta{ddims}); + if (phi::product(ddims) > 0) { + string_tensor->mutable_data(actual_place); + } + self->tensor.set_impl(string_tensor); +} + void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, const paddle::platform::Place& place, bool zero_copy = false) { @@ -132,6 +153,28 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, } } +void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) { + PADDLE_ENFORCE_EQ( + self->tensor.defined(), true, + paddle::platform::errors::Fatal( + "Calling InitStringTensorWithNumpyValue of Eager StringTensor " + "without " + "EmptyStringTensorInitializer is " + "forbidden. Please check your code and make sure you new a " + "eager tensor before init it with NumPy.")); + phi::StringTensor* impl_ptr = + static_cast(self->tensor.impl().get()); + paddle::platform::Place place = impl_ptr->place(); + auto array = obj.cast(); + if (platform::is_cpu_place(place)) { + SetStringTensorFromPyArray(impl_ptr, array, place); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "StringTensor only support CPUPlace now, but receive %s", + place.DebugString())); + } +} + void InitTensorWithTensor(TensorObject* self, const paddle::experimental::Tensor& src, const paddle::platform::Place& place, @@ -171,6 +214,17 @@ void InitTensorWithFrameworkTensor(TensorObject* self, egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } +void InitStringTensorWithStringTensor(TensorObject* self, + const paddle::experimental::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { + self->tensor.set_name(name); + auto impl = std::static_pointer_cast(src.impl()); + self->tensor.set_impl(impl); + VLOG(4) + << "Do ShareDataWith when using StringTensor to initialize StringTensor"; +} + py::object ParsePyArray( std::unordered_map kws_map, std::unordered_map kw_order_map, PyObject* args, @@ -236,13 +290,14 @@ int ParseBooleanArgs(std::string key, std::string ParseName(std::unordered_map kws_map, std::unordered_map kw_order_map, - PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + PyObject* args, bool flag_kwargs, Py_ssize_t args_num, + std::string unique_name_prefix = "generated_tensor") { std::string act_name = ""; if (kw_order_map["name"] <= args_num) { PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1); if (name_obj == Py_None) { act_name = - 
egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + egr::Controller::Instance().GenerateUniqueName(unique_name_prefix); } else { act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1); } @@ -250,13 +305,13 @@ std::string ParseName(std::unordered_map kws_map, if (flag_kwargs) { if ((kws_map["name"] == NULL) || (kws_map["name"] == Py_None)) { act_name = - egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + egr::Controller::Instance().GenerateUniqueName(unique_name_prefix); } else { act_name = CastPyArg2AttrString(kws_map["name"], 0); } } else { act_name = - egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + egr::Controller::Instance().GenerateUniqueName(unique_name_prefix); } } return act_name; @@ -368,6 +423,70 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, } } +void AutoInitStringTensorByPyArray( + TensorObject* py_tensor_ptr, + std::unordered_map kws_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + // The first argument of the StringTensor constructor is PyArray, + // there are 4 arguments to construct the new StringTensor, + // kw_order_map's key is every arguments of the constructor, + // kw_order_map's value is the position of the arguments respectively. + // If u want to update this constructor with new arguments, + // need to update this map and to add or change related code. + std::unordered_map kw_order_map{{"value", 1}, + {"name", 2}}; + py::object numpy_value = py::object(); + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + std::string act_name = ""; + + numpy_value = + ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + EmptyStringTensorInitializer(py_tensor_ptr, act_name, place); + InitStringTensorWithNumpyValue(py_tensor_ptr, numpy_value); +} + +void AutoInitStringTensorByStringTensor( + TensorObject* py_tensor_ptr, + std::unordered_map kws_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + // The first argument of the Tensor constructor is StringTensor, + // there are 3 arguments to construct the new StringTensor, + // kw_order_map's key is every arguments of the constructor, + // kw_order_map's value is the position of the arguments respectively. + // If u want to update this constructor with new arguments, + // need to update this map and to add or change related code. + std::unordered_map kw_order_map{{"value", 1}, + {"name", 2}}; + + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + std::string act_name = ""; + + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + paddle::experimental::Tensor src_tensor; + if (kw_order_map["value"] <= args_num) { + src_tensor = + CastPyArg2Tensor(PyTuple_GET_ITEM(args, kw_order_map["value"] - 1), + kw_order_map["value"] - 1); + } else { + if (flag_kwargs && kws_map["value"] != NULL) { + src_tensor = CastPyArg2Tensor(kws_map["value"], 0); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The first expected kwargs is {value: Tensor}, " + "but could not parse the first argument {value: Tensor} " + "successfully. " + "Please check your input first and make sure you are on the right " + "way.")); + } + } + InitStringTensorWithStringTensor(py_tensor_ptr, src_tensor, place, act_name); +} + /** We should have init function with signature: * 1. 
* def __init__ () @@ -708,6 +827,204 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_CATCH_AND_THROW_RETURN_NEG } +/** We should have init function with signature: + * 1. + * def __init__ () + * + * 2. + * def __init__ ( + * ** dims: vector, + * ** name: std::string) + * + * 3. + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** zero_copy: bool, + * ** name: std::string) + * + * 4. + * def __init__ ( + * ** value: ndarray) + * + * 5. + * def __init__ ( + * ** tensor: Tensor) + * + * 6. + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) + * def __init__ ( + * ** tensor: Tensor, + * ** name: std::string) + * **/ +int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { + // set a flag to record use kwargs or not + bool flag_kwargs = false; + if (kwargs) flag_kwargs = true; + + // all kwargs + PyObject* kw_zero_copy = NULL; + + PyObject* kw_value = NULL; // receive PyArray or Tensor + PyObject* kw_name = NULL; + PyObject* kw_dims = NULL; + + // the keywords argument + static char* kwlist[] = { + const_cast("value"), const_cast("zero_copy"), + const_cast("name"), const_cast("dims"), NULL}; + // 'O' Store a Python object (without any conversion) in a C object pointer, + // '|' Indicates that the remaining arguments in the Python argument list are + // optional. + // PyArg_ParseTupleAndKeywords can Parse the parameters of a function that + // takes both positional and keyword parameters into local variables, + // which enhance case1, case2, case3, case4, case 5, case 6. + bool flag_ = + PyArg_ParseTupleAndKeywords(args, kwargs, "|OOOO", kwlist, &kw_value, + &kw_zero_copy, &kw_name, &kw_dims); + + // helper map + std::unordered_map kws_map{ + {"value", kw_value}, + {"zero_copy", kw_zero_copy}, + {"name", kw_name}, + {"dims", kw_dims}}; + + PADDLE_ENFORCE_EQ(flag_, true, + paddle::platform::errors::PreconditionNotMet( + "Could not parse args and kwargs successfully, " + "please check your input first and make" + "sure you are on the right way. " + "The expected arguments as follow: (" + "value, zero_copy, name, dims)")); + + PADDLE_ENFORCE_NOT_NULL( + self, paddle::platform::errors::Fatal( + "Calling __init__ of Eager Tensor without __new__ is " + "forbidden. Please check your code and make sure you new a " + "eager tensor before init it.")); + + auto py_tensor_ptr = reinterpret_cast(self); + + Py_ssize_t args_num = PyTuple_Size(args); + VLOG(6) << " args_num: " << args_num; + // args_num = 0, means that there is no position arguments. 
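+  // Dispatch on the positional-argument count: zero args selects case 1
+  // when no kwargs are given (or cases 2~6 via keywords), one arg selects
+  // cases 3~6, and two args selects case 2, 3 or 6.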
+ if (args_num == (Py_ssize_t)0) { + if (!flag_kwargs) { + // case 1 + VLOG(6) << "Calling case1's string initializer."; + EmptyStringTensorInitializer( + py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"), + egr::Controller::Instance().GetExpectedPlace()); + return 0; + } else { + if (kw_value != NULL) { + if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) { + VLOG(6) << "Calling case3's or case4's string initializer"; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else if (PyObject_IsInstance(kw_value, reinterpret_cast( + p_string_tensor_type))) { + VLOG(6) << "Calling case5's or case6's string initializer"; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Could not parse the first keyword argument successfully, " + "the first keyword argument is value, but it should be PyArray " + "or StringTensor." + "Please check your input first and make sure you are on the " + "right way.")); + } + } else if (kw_dims != NULL) { + VLOG(6) << "Calling case2's string initializer."; + std::unordered_map kw_order_map{{"dims", 1}, + {"name", 2}}; + + std::vector dims = CastPyArg2VectorOfInt(kw_dims, 0); + std::string act_name = + ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + EmptyStringTensorInitializer( + py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), dims); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We not only support construct Tensor from numpy value " + "or StringTensor with python kwargs by this initializer, " + "but also even support dtype to init a empty StringTensor. " + "Please check your input first and make sure you call the existed " + "constructor.")); + } + } + } else if (args_num == (Py_ssize_t)1) { // case 3 ~ 6 + // 1 position args, remainting arguments are kwargs + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's or case4's string initializer."; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( + p_string_tensor_type))) { + VLOG(6) << "Calling case5's or case6's string initializer."; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Could not parse the first keyword argument successfully, " + "the first keyword argument is value, but it should be PyArray " + "or StringTensor." 
+ "Please check your input first and make sure you are on the " + "right way.")); + } + } else if (args_num == (Py_ssize_t)2) { // case 2 + // 2 position args + if (!flag_kwargs) { + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (PyObject_IsInstance( + arg0_ptr, reinterpret_cast(p_string_tensor_type))) { + VLOG(6) << "Calling case6's string initializer."; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's string initializer."; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else { + VLOG(6) << "Calling case2's string initializer."; + std::vector dims = CastPyArg2VectorOfInt(arg0_ptr, 0); + std::string act_name = ""; + PyObject* name_obj = PyTuple_GET_ITEM(args, 1); + if (name_obj == Py_None) { + act_name = egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"); + } else { + act_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + } + EmptyStringTensorInitializer( + py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), dims); + return 0; + } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Can't not find expected num of args, please check your call, and " + "make sure u call the existed constructor.")); + } + } + return 1; +} + static void TensorDealloc(TensorObject* self) { if (self->weakrefs != NULL) PyObject_ClearWeakRefs(reinterpret_cast(self)); @@ -716,8 +1033,10 @@ static void TensorDealloc(TensorObject* self) { } extern struct PyGetSetDef variable_properties[]; +extern struct PyGetSetDef string_tensor_variable_properties[]; extern PyMethodDef variable_methods[]; +extern PyMethodDef string_tensor_variable_methods[]; PyNumberMethods number_methods; PySequenceMethods sequence_methods; @@ -772,5 +1091,49 @@ void BindEager(pybind11::module* module) { BindEagerOpFunctions(&m); } +void BindEagerStringTensor(pybind11::module* module) { + auto m = module->def_submodule("eager"); + + auto heap_type = reinterpret_cast( + PyType_Type.tp_alloc(&PyType_Type, 0)); + heap_type->ht_name = ToPyObject("StringTensor"); + heap_type->ht_qualname = ToPyObject("StringTensor"); + auto type = &heap_type->ht_type; + type->tp_name = "StringTensor"; + type->tp_basicsize = sizeof(TensorObject); + type->tp_dealloc = (destructor)TensorDealloc; + type->tp_as_number = &number_methods; + type->tp_as_sequence = &sequence_methods; + type->tp_as_mapping = &mapping_methods; + type->tp_methods = string_tensor_variable_methods; + type->tp_getset = string_tensor_variable_properties; + type->tp_init = StringTensorInit; + type->tp_new = TensorNew; + Py_INCREF(&PyBaseObject_Type); + type->tp_base = reinterpret_cast(&PyBaseObject_Type); + type->tp_flags |= + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; +#if PY_VERSION_HEX >= 0x03050000 + type->tp_as_async = &heap_type->as_async; +#endif + p_string_tensor_type = type; + + if (PyType_Ready(type) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle error in BindEager(PyType_Ready).")); + return; + } + + Py_INCREF(type); + if (PyModule_AddObject(m.ptr(), "StringTensor", + reinterpret_cast(type)) < 0) { + Py_DECREF(type); + Py_DECREF(m.ptr()); + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle error in BindEagerStringTensor(PyModule_AddObject).")); + return; + } +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h 
index 03676a677ac90..84df71ddeeb7b 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -39,6 +39,7 @@ typedef struct { } PyLayerObject; void BindEager(pybind11::module* m); +void BindEagerStringTensor(pybind11::module* module); void BindFunctions(PyObject* module); void BindEagerPyLayer(PyObject* module); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 8304db13c468e..542d59318bbad 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -257,6 +257,72 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto& api = pybind11::detail::npy_api::get(); + if (!self->tensor.impl() || !self->tensor.impl()->initialized()) { + VLOG(6) << "The StringTensor is uninitialized. Return the empty string " + "numpy array."; + Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; + Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; + py_dims[0] = 0; + py_strides[0] = 0; + + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, + api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_UNICODE_), 1, + py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + return array; + } + + if (self->tensor.is_cpu()) { + VLOG(6) << "Getting StringTensor's numpy value"; + auto string_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + const auto* st_ptr = string_tensor->data(); + auto numel = self->tensor.numel(); + auto tensor_dims = self->tensor.shape(); + // Get the max unicode length of StringTensor to create numpy unicode string + // array. + auto* longest_pstring = std::max_element( + st_ptr, st_ptr + numel, [](const auto& a, const auto& b) { + auto a_unicode_len = + phi::strings::GetUnicodeStrLen(a.data(), a.size()); + auto b_unicode_len = + phi::strings::GetUnicodeStrLen(b.data(), b.size()); + return a_unicode_len < b_unicode_len; + }); + size_t max_unicode_length = phi::strings::GetUnicodeStrLen( + longest_pstring->data(), longest_pstring->size()); + max_unicode_length = (max_unicode_length == 0) ? 
1 : max_unicode_length; + VLOG(6) << "The max unicode length is " << max_unicode_length; + auto sp = std::make_unique(max_unicode_length * numel); + auto py_array_data = sp.get(); + memset(py_array_data, 0, max_unicode_length * numel * sizeof(uint32_t)); + for (int64_t i = 0; i < numel; ++i) { + auto curr_unicode_len = + phi::strings::GetUnicodeStrLen(st_ptr[i].data(), st_ptr[i].size()); + phi::strings::GetUnicodeStr(st_ptr[i].data(), + py_array_data + i * max_unicode_length, + curr_unicode_len); + } + py::array array(py::dtype("U" + std::to_string(max_unicode_length)), + tensor_dims, {}, py_array_data); + return array.release().ptr(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "StringTensor.numpy() only support cpu tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method__is_initialized(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1433,6 +1499,18 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } #endif +static PyObject* tensor_method__is_string_tensor_hold_allocation( + TensorObject* self, PyObject* args, PyObject* kwargs) { + EAGER_TRY + auto string_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + if (string_tensor) { + return ToPyObject(string_tensor->initialized()); + } else { + return ToPyObject(false); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, @@ -1545,5 +1623,20 @@ PyMethodDef variable_methods[] = { #endif {NULL, NULL, 0, NULL}}; +// variable_methods for core.eager.StringTensor +PyMethodDef string_tensor_variable_methods[] = { + {"numpy", + (PyCFunction)(void (*)(void))tensor_method_numpy_for_string_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_initialized", + (PyCFunction)(void (*)(void))tensor_method__is_initialized, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_string_tensor_hold_allocation", + (PyCFunction)( + void (*)(void))tensor_method__is_string_tensor_hold_allocation, + METH_VARARGS | METH_KEYWORDS, NULL}, + // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor. 
+ {NULL, NULL, 0, NULL}}; + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 797b68fcb36ea..de66308a7baf6 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -204,5 +204,15 @@ struct PyGetSetDef variable_properties[] = { {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; +// variable_properties for core.eager.StringTensor +struct PyGetSetDef string_tensor_variable_properties[] = { + {"name", (getter)tensor_properties_get_name, + (setter)tensor_properties_set_name, nullptr, nullptr}, + {"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr}, + {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr}, + {"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr, + nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 8baea3d0dbfe1..8fa21ef45f82f 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -36,6 +36,7 @@ namespace paddle { namespace pybind { extern PyTypeObject* p_tensor_type; +extern PyTypeObject* p_string_tensor_type; extern PyTypeObject* g_framework_scope_pytype; extern PyTypeObject* g_vartype_pytype; @@ -75,6 +76,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) { return pybind11::detail::NPY_COMPLEX64; case phi::DataType::COMPLEX128: return pybind11::detail::NPY_COMPLEX128; + case phi::DataType::PSTRING: + return pybind11::detail::npy_api::NPY_UNICODE_; default: PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Unknow phi::DataType, the int value = %d.", @@ -198,7 +201,9 @@ bool IsEagerTensor(PyObject* obj) { } paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { - if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { + if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type)) || + PyObject_IsInstance(obj, + reinterpret_cast(p_string_tensor_type))) { return reinterpret_cast(obj)->tensor; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -508,7 +513,14 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value, Py_INCREF(Py_None); return Py_None; } - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + PyObject* obj = nullptr; + if (value.initialized() && value.is_string_tensor()) { + // In order to return the core.eager.StringTensor, there is need + // to use p_string_tensor_type to create a python obj. 
+ obj = p_string_tensor_type->tp_alloc(p_string_tensor_type, 0); + } else { + obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + } if (obj) { auto v = reinterpret_cast(obj); new (&(v->tensor)) paddle::experimental::Tensor(); @@ -753,6 +765,9 @@ static paddle::experimental::Tensor& GetTensorFromPyObject( if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { return reinterpret_cast(obj)->tensor; + } else if (PyObject_IsInstance( + obj, reinterpret_cast(p_string_tensor_type))) { + return reinterpret_cast(obj)->tensor; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be Tensor, but got %s", op_type, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 45fcd2fad98a8..d6071617224c2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -545,6 +545,7 @@ PYBIND11_MODULE(core_noavx, m) { BindImperative(&m); BindEager(&m); + BindEagerStringTensor(&m); BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 3f7ce8b63f968..63b36bd917390 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -36,6 +36,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/string_tensor.h" +#include "paddle/phi/kernels/strings/unicode.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -528,6 +530,60 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, } } +template +void SetStringTensorFromPyArray(phi::StringTensor *self, const py::array &array, + const P &place) { + bool is_string_pyarray = + array.dtype().kind() == 'S' || array.dtype().kind() == 'U'; + PADDLE_ENFORCE_EQ(is_string_pyarray, true, + platform::errors::InvalidArgument( + "Expect the dtype of numpy array is string or " + "unicode, but recevie dtype %s", + array.dtype())); + std::vector dims; + dims.reserve(array.ndim()); + dims.reserve(array.ndim()); + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { + dims.push_back(static_cast(array.shape()[i])); + } + self->Resize(phi::make_ddim(dims)); + auto itemsize = array.itemsize(); + if (paddle::platform::is_cpu_place(place)) { + auto dst = self->mutable_data(place); + if (array.dtype().kind() == 'S') { + for (int i = 0; i < self->numel(); ++i) { + dst[i] = + pstring(reinterpret_cast(array.data()) + itemsize * i, + itemsize); + } + } else { + // array.dtype().kind() == 'U' + VLOG(6) << "numpy array itemsize: " << itemsize; + for (int i = 0; i < self->numel(); ++i) { + // Note(zhoushunjie): The itemsize of unicode numpy array is the + // the size of each unicode string. Each unicode string is aligned + // to max length of the array of unicode strings, so the size of + // each unicode string is same. The size of each unicode character is + // 4, so the size of unicode string is 4 times of the length of + // unicode string. 
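+        // For example, a numpy array of dtype '<U6' stores 6 UCS4 code
+        // points per element, so its itemsize is 24 bytes and
+        // itemsize / 4 recovers the per-string length of 6.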
+ auto unicode_len = itemsize / 4; + auto utf8_len = phi::strings::GetUTF8StrLen( + reinterpret_cast(array.data()) + unicode_len * i, + unicode_len); + pstring pstr(utf8_len - 1, 0); + phi::strings::GetUTF8Str( + reinterpret_cast(array.data()) + unicode_len * i, + pstr.mdata(), unicode_len); + dst[i] = pstr; + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "StringTensor only support CPUPlace now, but receive %s", + place.DebugString())); + } +} + template void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor, const py::array_t &array, int device_id) { diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 3c3da4b749ed0..e4a97e2c16f16 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -245,6 +245,14 @@ class PADDLE_API Tensor final { */ bool is_sparse_csr_tensor() const; + /** + * @brief Determine whether tensor is StringTensor + * + * @return true + * @return false + */ + bool is_string_tensor() const; + /* Part 3: Device and Backend methods */ /** diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 1fb0803379894..67c1b711fc997 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" @@ -155,6 +156,9 @@ bool Tensor::is_sparse_coo_tensor() const { bool Tensor::is_sparse_csr_tensor() const { return phi::SparseCsrTensor::classof(impl_.get()); } +bool Tensor::is_string_tensor() const { + return phi::StringTensor::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ Place Tensor::place() const { diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 42f12b7820442..35444dc33fe78 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" +#include "paddle/phi/api/lib/utils/storage.h" namespace phi { @@ -161,4 +162,32 @@ void* StringTensor::AllocateFrom(Allocator* allocator, meta_.offset); } +dtype::pstring* StringTensor::mutable_data(const phi::Place& place, + size_t requested_size) { + PADDLE_ENFORCE_GE( + numel(), + 0, + phi::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. 
" + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + if (requested_size && (requested_size > size)) { + size = requested_size; + } + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size); + // Initialize the allocated bytes + init_holder(); + meta_.offset = 0; + } + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + meta_.offset); +} + } // namespace phi diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 223ecaca58143..916c2a2bd4a4e 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -122,6 +122,8 @@ class StringTensor : public TensorBase, void* AllocateFrom(Allocator* allocator, DataType dtype, size_t requested_size = 0); + dtype::pstring* mutable_data(const phi::Place& place, + size_t requested_size = 0); private: friend class StringTensorUtils; diff --git a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py new file mode 100644 index 0000000000000..def5f569b8f4c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.core as core +import paddle +import numpy as np +from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode +import unittest +import copy + + +class EagerStringTensorTestCase(unittest.TestCase): + def setUp(self): + self.str_arr = np.array([ + ["15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错" + ], # From ChnSentiCorp + ["One of the very best Three Stooges shorts ever."] + ]) # From IMDB + + def test_constructor_with_args(self): + with _test_eager_guard(): + ST1 = core.eager.StringTensor() # constructor 1 + self.assertEqual(ST1.name, "generated_string_tensor_0") + self.assertEqual(ST1.shape, []) + self.assertEqual(ST1.numpy(), '') + + shape = [2, 3] + ST2 = core.eager.StringTensor(shape, "ST2") # constructor 2 + self.assertEqual(ST2.name, "ST2") + self.assertEqual(ST2.shape, shape) + self.assertTrue( + np.array_equal( + ST2.numpy(), np.empty( + shape, dtype=np.unicode_))) + + ST3 = core.eager.StringTensor(self.str_arr, "ST3") # constructor 3 + self.assertEqual(ST3.name, "ST3") + self.assertEqual(ST3.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + + ST4 = core.eager.StringTensor(self.str_arr) # constructor 4 + self.assertEqual(ST4.name, "generated_string_tensor_1") + self.assertEqual(ST4.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + + ST5 = core.eager.StringTensor(ST4) # constructor 5 + self.assertEqual(ST5.name, "generated_string_tensor_2") + self.assertEqual(ST5.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST5.numpy(), self.str_arr)) + + ST6 = core.eager.StringTensor(ST5, "ST6") # constructor 6 + self.assertEqual(ST6.name, "ST6") + self.assertEqual(ST6.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST6.numpy(), self.str_arr)) + + for st in [ST1, ST2, ST3, ST4, ST5, ST6]: + # All StringTensors are on cpu place so far. + self.assertTrue(st.place._equals(core.CPUPlace())) + + def test_constructor_with_kwargs(self): + with _test_eager_guard(): + shape = [2, 3] + ST1 = core.eager.StringTensor( + dims=shape, name="ST1") # constructor 2 + self.assertEqual(ST1.name, "ST1") + self.assertEqual(ST1.shape, shape) + self.assertTrue( + np.array_equal( + ST1.numpy(), np.empty( + shape, dtype=np.unicode_))) + + ST2 = core.eager.StringTensor( + self.str_arr, name="ST2") # constructor 3 + self.assertEqual(ST2.name, "ST2") + self.assertEqual(ST2.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST2.numpy(), self.str_arr)) + + ST3 = core.eager.StringTensor(ST2, name="ST3") # constructor 6 + self.assertEqual(ST3.name, "ST3") + self.assertEqual(ST3.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + + ST4 = core.eager.StringTensor( + value=ST2, name="ST4") # constructor 6 + self.assertEqual(ST4.name, "ST4") + self.assertEqual(ST4.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + for st in [ST1, ST2, ST3, ST4]: + # All StringTensors are on cpu place so far. 
+ self.assertTrue(st.place._equals(core.CPUPlace())) + + +if __name__ == "__main__": + unittest.main() From 2d6b71a26d614ac7dd76ba07dd0394379a0489b5 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Fri, 15 Apr 2022 14:00:25 +0800 Subject: [PATCH 179/211] [MLU] add mlu softmax kernel (#41816) --- paddle/fluid/operators/mlu/mlu_baseop.cc | 12 ++ paddle/fluid/operators/mlu/mlu_baseop.h | 7 + paddle/fluid/operators/softmax_op_mlu.cc | 103 ++++++++++ .../unittests/mlu/test_softmax_op_mlu.py | 189 ++++++++++++++++++ 4 files changed, 311 insertions(+) create mode 100644 paddle/fluid/operators/softmax_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index df091a7dc7535..ecde4db3f334e 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -1158,6 +1158,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::SoftmaxBackward( + const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, + cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSoftmaxBackward(handle, algorithm, mode, nullptr, y_desc, y, + diff_y_desc, diff_y, nullptr, diff_x_desc, diff_x)); +} + /* static */ void MLUCnnl::Softplus(const ExecutionContext& ctx, const cnnlTensorDescriptor_t features_desc, const void* features, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 64a99b2a6d273..00ad618329c99 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -740,6 +740,13 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void SoftmaxBackward( + const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, + cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc, + const void* y, const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); + static void Softplus(const ExecutionContext& ctx, const cnnlTensorDescriptor_t features_desc, const void* features, diff --git a/paddle/fluid/operators/softmax_op_mlu.cc b/paddle/fluid/operators/softmax_op_mlu.cc new file mode 100644 index 0000000000000..9cb698e94fc56 --- /dev/null +++ b/paddle/fluid/operators/softmax_op_mlu.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace paddle { +namespace operators { + +template +class SoftmaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const int rank = in->dims().size(); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + + // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] + const int cnnl_softmax_dims = 3; + const int d1 = phi::funcs::SizeToAxis(axis, in->dims()); + const int d2 = in->dims()[axis]; + const int d3 = phi::funcs::SizeOutAxis(axis, in->dims()); + + // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as + // possible. + cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION; + std::vector regard_in_shape{d1, 1, d2}; + if (d3 != 1) { + mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + regard_in_shape = {d1, d2, d3}; + } + + static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + MLUCnnlTensorDesc in_desc(cnnl_softmax_dims, regard_in_shape.data(), + ToCnnlDataType()); + MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, in_desc.get(), + GetBasePtr(in), NULL, in_desc.get(), + GetBasePtr(out)); + } +}; + +template +class SoftmaxGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + + auto* dX = ctx.Output(framework::GradVarName("X")); + dX->mutable_data(ctx.GetPlace()); + + const int rank = out->dims().size(); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + + // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] + const int cnnl_softmax_dims = 3; + const int d1 = phi::funcs::SizeToAxis(axis, out->dims()); + const int d2 = out->dims()[axis]; + const int d3 = phi::funcs::SizeOutAxis(axis, out->dims()); + + // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as + // possible. + cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION; + std::vector regard_out_shape{d1, 1, d2}; + if (d3 != 1) { + mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + regard_out_shape = {d1, d2, d3}; + } + + static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + MLUCnnlTensorDesc out_desc(cnnl_softmax_dims, regard_out_shape.data(), + ToCnnlDataType()); + MLUCnnl::SoftmaxBackward(ctx, algo, mode, out_desc.get(), GetBasePtr(out), + out_desc.get(), GetBasePtr(dOut), out_desc.get(), + GetBasePtr(dX)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL(softmax_grad, ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py new file mode 100644 index 0000000000000..54acafcf0df5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +np.random.seed(10) + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +def ref_softmax(x, axis=None, dtype=None): + x_t = x.copy() + if dtype is not None: + x_t = x_t.astype(dtype) + if axis is None: + axis = -1 + return np.apply_along_axis(stable_softmax, axis, x_t) + + +class TestSoftmaxOp(OpTest): + def get_x_shape(self): + return [10, 10] + + def get_axis(self): + return -1 + + def setUp(self): + self.op_type = "softmax" + self.place = paddle.MLUPlace(0) + self.dtype = np.float32 + self.init_kernel_type() + self.shape = self.get_x_shape() + self.axis = self.get_axis() + + np.random.seed(0) + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, } + + def init_kernel_type(self): + pass + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X"], "Out", max_relative_error=0.01) + + +class TestSoftmaxOp2(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + +class TestSoftmaxOp3(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 0 + + +class TestSoftmaxOp4(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 1 + + +class TestSoftmaxOp5(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 2 + + +class TestSoftmaxOp6(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 3 + + +class TestSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.MLUPlace(0) + self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32') + self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np) + self.executed_api() + + def executed_api(self): + self.softmax = F.softmax + + def test_static_check(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.x_np.shape, 'float32') + out1 = self.softmax(x) + m = paddle.nn.Softmax() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_softmax(self.x_np, axis=-1, dtype=None) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_check(self): + paddle.disable_static(self.place) + + x = paddle.to_tensor(self.x_np) + out1 = self.softmax(x) + x = 
paddle.to_tensor(self.x_np) + m = paddle.nn.Softmax() + out2 = m(x) + out_ref = ref_softmax(self.x_np, axis=-1, dtype=None) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out1 = self.softmax(x, axis=0) + x = paddle.to_tensor(self.x_np) + m = paddle.nn.Softmax(axis=0) + out2 = m(x) + out_ref = ref_softmax(self.x_np, axis=0, dtype=None) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out = self.softmax(x, dtype=np.float32) + out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float32) + self.assertEqual(np.allclose(out_ref, out.numpy()), True) + + paddle.enable_static() + + def test_error(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, self.softmax, 1) + # The input dtype must be float16, float32 + x_int32 = paddle.fluid.data( + name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises(TypeError, self.softmax, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[2, 3], dtype='float16') + self.softmax(x_fp16) + + +class TestSoftmaxInplaceAPI(TestSoftmaxAPI): + def executed_api(self): + self.softmax = F.softmax_ + + +if __name__ == "__main__": + unittest.main() From 30a1213b79b87b1cdb105f8d3adfa714bc6bc524 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Fri, 15 Apr 2022 14:06:21 +0800 Subject: [PATCH 180/211] =?UTF-8?q?=E3=80=90GPUPS=E3=80=91add=20afsclient?= =?UTF-8?q?=20and=20gpupsutil=20(#41324)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add gpupsutil and afsclient; test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 14 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 4 + paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 6 + python/paddle/distributed/fleet/utils/fs.py | 417 ++++++++++++++++ .../fluid/incubate/fleet/utils/fleet_util.py | 471 +++++++++++++++++- 5 files changed, 909 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 115ec4d0102cc..5e1a08f33e3ef 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -72,6 +72,18 @@ int AfsWrapper::download(const std::string& local_file, const std::string& afs_file) { return afs_handler_.download_file(local_file, afs_file); } + +int AfsWrapper::touchz(const std::string& path) { + return afs_handler_.touchz(path); +} + +std::string AfsWrapper::cat(const std::string& path) { + return afs_handler_.cat(path); +} + +int AfsWrapper::mv(const std::string& old_path, const std::string& dest_path) { + return afs_handler_.mv(old_path, dest_path); +} #endif std::shared_ptr PSGPUWrapper::s_instance_ = NULL; @@ -84,7 +96,7 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { - LOG(ERROR) << "AFS Init Error"; + VLOG(0) << "AFS Init Error"; } use_afs_api_ = 1; } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b7060764863f1..c5f674d8b47eb 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -71,6 +71,10 @@ class AfsWrapper { int download(const std::string& local_file, const std::string& afs_file); + int touchz(const std::string& path); + std::string 
cat(const std::string& path); + int mv(const std::string& old_path, const std::string& dest_path); + private: paddle::ps::AfsApiWrapper afs_handler_; }; diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 79529fca7d1be..42703fc17bde5 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -81,6 +81,12 @@ void BindAfsWrapper(py::module* m) { .def("upload", &framework::AfsWrapper::upload, py::call_guard()) .def("remove", &framework::AfsWrapper::remove, + py::call_guard()) + .def("touchz", &framework::AfsWrapper::touchz, + py::call_guard()) + .def("cat", &framework::AfsWrapper::cat, + py::call_guard()) + .def("mv", &framework::AfsWrapper::mv, py::call_guard()); } #endif diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 8895a529526f7..fab7b4ff4ce3d 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -1145,3 +1145,420 @@ def list_files_info(self, path_list): file_list.append({'path': file_path, 'size': file_size}) return file_list + + +class AFSClient(FS): + """ + A tool of AFS. Use AfsWrapper. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.ls_dir("hdfs:/test_hdfs_client") + """ + + def __init__( + self, + time_out=5 * 60 * 1000, # ms + sleep_inter=1000): # ms + self._fs = core.AfsWrapper() + self._time_out = time_out + + def init(self, fs_name, fs_user, fs_passwd, fs_conf): + self._fs.init(fs_name, fs_user, fs_passwd, fs_conf) + + def list_dirs(self, fs_path): + """ + Only list directorys under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...]. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + subdirs = client.list_dirs("hdfs:/test_hdfs_client") + """ + if not self.is_exist(fs_path): + return [] + # TODO:fengdanlei + dirs, files = self._ls_dir(fs_path) + return dirs + + def ls_dir(self, fs_path): + """ + List directorys and files under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, + and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]). + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") + """ + if not self.is_exist(fs_path): + return [], [] + + return self._ls_dir(fs_path) + + def _ls_dir(self, fs_path): + + files = self._fs.list(fs_path) + dirs = [fs_path] + return dirs, files + + def is_dir(self, fs_path): + """ + Whether the remote HDFS path is a directory. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a directory, otherwise return false. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + ret = client.is_file("hdfs:/test_hdfs_client") + """ + if not self.is_exist(fs_path): + return False + + return self._is_dir(fs_path) + + def _is_dir(self, fs_path): + list_path = self._fs.list(fs_path) + if (len(list_path)) > 0: + return True + else: + return False + + def is_file(self, fs_path): + """ + Whether the remote HDFS path is a file. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a file, otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + ret = client.is_file("hdfs:/test_hdfs_client") + """ + if not self.is_exist(fs_path): + return False + + return not self._is_dir(fs_path) + + def is_exist(self, fs_path): + """ + Whether the remote HDFS path exists. + + Args: + fs_path(str): The hdfs file path. + + Returns: + Bool: Whether it's is file or directory, return true if the path exists, + otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + ret = client.is_exist("hdfs:/test_hdfs_client") + """ + return self._fs.exist(fs_path) + + def upload_dir(self, local_dir, dest_dir, overwrite=False): + """ + upload dir to hdfs + Args: + local_dir(str): local dir + dest_dir(str): hdfs dest dir + overwrite(bool): is overwrite + Returns: + return code + """ + local_dir = local_dir.rstrip("/") + dest_dir = dest_dir.rstrip("/") + local_basename = os.path.basename(local_dir) + if self.is_exist(dest_dir + "/" + local_basename) and overwrite: + self.delete(dest_dir + "/" + local_basename) + if not self.is_exist(dest_dir): + self.mkdirs(dest_dir) + self._fs.upload(local_dir, dest_dir) + + # can't retry + def upload(self, local_path, fs_path, multi_processes=1, overwrite=False): + """ + Upload the local path to remote HDFS. + + Args: + local_path(str): The local path. + fs_path(str): The HDFS path. + multi_processes(int|1): the upload data process at the same time, default=5 + overwrite(bool|False): will overwrite file on HDFS or not + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") + """ + + local = LocalFS() + if not local.is_exist(local_path): + raise FSFileNotExistsError("{} not exists".format(local_path)) + + self._fs.upload(local_path, fs_path) + + def download(self, fs_path, local_path, multi_processes=1, overwrite=False): + """ + Download remote HDFS path to the local. + + Args: + fs_path(str): The HDFS path. + local_path(str): The local path. + multi_processes(int|1): the download data process at the same time, default=1 + overwrite(bool): is overwrite + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.download("hdfs:/test_hdfs_client", "./") + """ + + def __subprocess_download(local_path, datas): + """ + download file from HDFS + Args: + local_path(str): the local file path + datas(str): the hdfs file path list + """ + for data in datas: + self._fs.download(local_path, data) + + if not self.is_exist(fs_path): + raise FSFileNotExistsError("{} not exits".format(fs_path)) + # download file + if self.is_file(fs_path): + return self._fs.download(local_path, fs_path) + # download dir + _, all_filenames = self.ls_dir(fs_path) + all_files = [fs_path + i for i in all_filenames] + procs = [] + for i in range(multi_processes): + process_datas = self._split_files(all_files, i, multi_processes) + p = multiprocessing.Process( + target=__subprocess_download, args=(local_path, process_datas)) + procs.append(p) + p.start() + + # complete the processes + for proc in procs: + proc.join() + + def mkdirs(self, fs_path): + """ + Create a remote HDFS directory. + + Args: + fs_path(str): The HDFS directory path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.mkdirs("hdfs:/test_hdfs_client") + """ + if self.is_exist(fs_path): + return + self._fs.mkdir(fs_path) + + def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): + """ + Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` . + + Args: + fs_src_path(str): Name of the file or directory, that's needed to be moved. + fs_dst_path(str): Name of the file or directory to which to move to. + overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") + """ + if overwrite and self.is_exist(fs_dst_path): + self.delete(fs_dst_path) + + if test_exists: + if not self.is_exist(fs_src_path): + raise FSFileNotExistsError("{} is not exists".format( + fs_src_path)) + + if self.is_exist(fs_dst_path): + raise FSFileExistsError("{} exists already".format(fs_dst_path)) + + self._fs.mv(fs_src_path, fs_dst_path) + + def delete(self, fs_path): + """ + Delete a remote HDFS path, whether it's a file or directory. + + Args: + fs_path(str): The HDFS file path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import HDFSClient + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.delete("hdfs:/test_hdfs_client") + """ + if not self.is_exist(fs_path): + return + self._fs.remove(fs_path) + + def touch(self, fs_path, exist_ok=True): + """ + Create a remote HDFS file. + + Args: + fs_path(str): The HDFS file path. + exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false, + program will throw an Exception. Default is true. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.touch("hdfs:/test_hdfs_client") + """ + if self.is_exist(fs_path): + if exist_ok: + return + raise FSFileExistsError + + return self._fs.touchz(fs_path) + + def need_upload_download(self): + return True + + def cat(self, fs_path=None): + """ + Cat a remote HDFS file. + + Args: + fs_path(str): The HDFS file path. + + Returns: + file content + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils import AFSClient + + client = AFSClient() + client.init("hdfs://xxx.hadoop.com:54310", "hello", "hello123", "./fs_conf") + client.cat("hdfs:/test_hdfs_client") + """ + if self.is_file(fs_path): + return self._fs.cat(fs_path) + else: + return "" + + def _split_files(self, files, trainer_id, trainers): + """ + split file list + Args: + files(list): file list + trainer_id(int): trainer mpi rank id + trainers(int): all trainers num + Returns: + fileist(list): file list of current trainer + """ + remainder = len(files) % trainers + blocksize = len(files) // trainers + + blocks = [blocksize] * trainers + for i in range(remainder): + blocks[i] += 1 + + trainer_files = [[]] * trainers + begin = 0 + for i in range(trainers): + trainer_files[i] = files[begin:begin + blocks[i]] + begin += blocks[i] + + return trainer_files[trainer_id] diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index 47f912c8715b0..5fc8fbd011629 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -25,11 +25,11 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.log_helper import get_logger -from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, AFSClient from . import utils OpRole = core.op_proto_and_checker_maker.OpRole -__all__ = ["FleetUtil"] +__all__ = ["FleetUtil", "GPUPSUtil"] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s %(levelname)s: %(message)s') @@ -1721,3 +1721,470 @@ def split_program_by_device(self, program): else: return [start_list[heter_index], end_list[heter_index], send_list[heter_index], \ recv_list[heter_index], program_list[heter_index]] + + +class GPUPSUtil(FleetUtil): + """ + GPUPSUtil provides some common functions for users' convenience. + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + fleet_util = GPUPSUtil() + fleet_util.rank0_print("my log") + """ + + def __init__(self, fs_client=None): + super(GPUPSUtil, self).__init__("pslib") + self._afs = fs_client + # self._afs = fs_client._fs + + def init(self, fs_name, fs_user, fs_passwd, fs_conf): + r""" + init for fs config + + Args: + fs_name(str): fs name + fs_user(str): fs user + fs_passwd(str): fs password + fs_conf(str): fs and afs conf path + + Returns: + None + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + fleet_util = GPUPSUtil() + fleet_util.init(20190722, 88, 88, "./afs.conf") + """ + self._afs.init(fs_name, fs_user, fs_passwd, fs_conf) + + def set_fsclient(self, fs_client): + r""" + set fs_client for fs config + + Args: + fs_client(AFSClient): fs_client object + + Returns: + None + + Examples: + .. 
code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + """ + self._afs = fs_client + + def get_last_save_xbox_base(self, output_path): + r""" + get last saved base xbox info from xbox_base_done.txt + + Args: + output_path(str): output path + + Returns: + [last_save_day, last_path, xbox_base_key] + last_save_day(int): day of saved model + last_path(str): model path + xbox_base_key(int): xbox key + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + last_save_day, last_path, xbox_base_key = \ + fleet_util.get_last_save_xbox_base("hdfs:/my/path") + + """ + donefile_path = output_path + "/xbox_base_done.txt" + + if not self._afs.is_file(donefile_path): + return [-1, -1, int(time.time())] + self._afs.download(donefile_path, "./xbox_base_done.txt") + # pre_content = self._afs.cat(donefile_path) + pre_content = "" + with open("xbox_base_done.txt", "r") as f: + pre_content = f.read() + pre_content = pre_content.strip() + last_dict = json.loads(pre_content.split("\n")[-1]) + last_day = int(last_dict["input"].split("/")[-3]) + last_path = "/".join(last_dict["input"].split("/")[:-1]) + xbox_base_key = int(last_dict["key"]) + return [last_day, last_path, xbox_base_key] + + def get_last_save_xbox(self, output_path): + r""" + get last saved xbox info from xbox_patch_done.txt + + Args: + output_path(str): output path + + Returns: + [last_save_day, last_save_pass, last_path, xbox_base_key] + last_save_day(int): day of saved model + last_save_pass(int): pass id of saved + last_path(str): model path + xbox_base_key(int): xbox key + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + last_save_day, last_save_pass, last_path, xbox_base_key = \ + fleet_util.get_last_save_xbox("hdfs:/my/path") + + """ + donefile_path = output_path + "/xbox_patch_done.txt" + + if not self._afs.is_file(donefile_path): + return [-1, -1, "", int(time.time())] + self._afs.download(donefile_path, "xbox_patch_done.txt") + pre_content = "" + with open("xbox_patch_done.txt", "r") as f: + pre_content = f.read() + pre_content = pre_content.strip() + last_dict = json.loads(pre_content.split("\n")[-1]) + last_day = int(last_dict["input"].split("/")[-3]) + last_pass = int(last_dict["input"].split("/")[-2].split("-")[-1]) + last_path = "/".join(last_dict["input"].split("/")[:-1]) + xbox_base_key = int(last_dict["key"]) + os.remove("xbox_patch_done.txt") + return [last_day, last_pass, last_path, xbox_base_key] + + def get_last_save_model(self, output_path): + r""" + get last saved model info from donefile.txt + + Args: + output_path(str): output path + + Returns: + [last_save_day, last_save_pass, last_path, xbox_base_key] + last_save_day(int): day of saved model + last_save_pass(int): pass id of saved + last_path(str): model path + xbox_base_key(int): xbox key + + Examples: + .. 
code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + last_save_day, last_save_pass, last_path, xbox_base_key = \ + fleet_util.get_last_save_model("hdfs:/my/path") + + """ + last_save_day = -1 + last_save_pass = -1 + last_path = "" + donefile_path = output_path + "/donefile.txt" + if not self._afs.is_file(donefile_path): + return [-1, -1, "", int(time.time())] + self._afs.download(donefile_path, "./donefile.txt") + content = "" + with open("donefile.txt", "r") as f: + content = f.read() + content = content.strip().split("\n")[-1].split("\t") + last_save_day = int(content[0]) + last_save_pass = int(content[3]) + last_path = content[2] + xbox_base_key = int(content[1]) + os.remove("donefile.txt") + return [last_save_day, last_save_pass, last_path, xbox_base_key] + + def write_model_donefile(self, + output_path, + day, + pass_id, + xbox_base_key, + donefile_name="donefile.txt"): + """ + write donefile when save model + + Args: + output_path(str): output path + day(str|int): training day + pass_id(str|int): training pass id + xbox_base_key(str|int): xbox base key + donefile_name(str): donefile name, default is "donefile.txt" + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + fleet_util.write_model_donefile(output_path="hdfs:/my/output", + model_path="hdfs:/my/model", + day=20190723, + pass_id=66, + xbox_base_key=int(time.time())) + + """ + day = str(day) + pass_id = str(pass_id) + xbox_base_key = int(xbox_base_key) + + if pass_id != "-1": + suffix_name = "/%s/%s/" % (day, pass_id) + model_path = output_path.rstrip("/") + suffix_name + else: + suffix_name = "/%s/0/" % day + model_path = output_path.rstrip("/") + suffix_name + + if fleet.worker_index() == 0: + donefile_path = output_path + "/" + donefile_name + content = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key,\ + model_path, pass_id, 0) + if self._afs.is_file(donefile_path): + self._afs.download(donefile_path, donefile_name) + pre_content = "" + with open(donefile_name, "r") as f: + pre_content = f.read() + pre_content_list = pre_content.strip().split("\n") + day_list = [i.split("\t")[0] for i in pre_content_list] + pass_list = [i.split("\t")[3] for i in pre_content_list] + os.remove(donefile_name) + exist = False + for i in range(len(day_list)): + if int(day) == int(day_list[i]) and \ + int(pass_id) == int(pass_list[i]): + exist = True + break + if not exist: + with open(donefile_name, "w") as f: + f.write(pre_content.strip() + "\n") + f.write(content + "\n") + self._afs.delete(donefile_path) + self._afs.upload(donefile_name, donefile_path) + self.rank0_error("write %s/%s %s succeed" % \ + (day, pass_id, donefile_name)) + else: + self.rank0_error("not write %s because %s/%s already " + "exists" % (donefile_name, day, pass_id)) + else: + with open(donefile_name, "w") as f: + f.write(content + "\n") + self._afs.upload(donefile_name, donefile_path) + self.rank0_error("write %s/%s %s succeed" % \ + (day, pass_id, donefile_name)) + + def write_xbox_donefile(self, + output_path, + day, + pass_id, + xbox_base_key, + data_path, + hadoop_fs_name, + hadoop_fs_ugi, + monitor_data={}, + hadoop_home="$HADOOP_HOME", + donefile_name=None): + """ + write 
delta donefile or xbox base donefile + + Args: + output_path(str): output path + day(str|int): training day of model + pass_id(str|int): training pass id of model + xbox_base_key(str|int): xbox base key + data_path(str|list): training data path + monitor_data(dict): metrics + hadoop_home(str): hadoop home, default is "$HADOOP_HOME" + donefile_name(str): donefile name, default is None" + + Examples: + .. code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + fleet_util.write_xbox_donefile( + output_path="hdfs:/my/output/", + model_path="hdfs:/my/output/20190722/01", + day=20190722, + pass_id=1, + xbox_base_key=int(time.time()), + data_path="hdfs:/my/data/", + monitor_data={}) + + """ + day = str(day) + pass_id = str(pass_id) + xbox_base_key = int(xbox_base_key) + mode = None + if pass_id != "-1": + mode = "patch" + suffix_name = "/%s/delta-%s/" % (day, pass_id) + model_path = output_path.rstrip("/") + suffix_name + if donefile_name is None: + donefile_name = "xbox_patch_done.txt" + else: + mode = "base" + suffix_name = "/%s/base/" % day + model_path = output_path.rstrip("/") + suffix_name + if donefile_name is None: + donefile_name = "xbox_base_done.txt" + + if isinstance(data_path, list): + data_path = ",".join(data_path) + if fleet.worker_index() == 0: + donefile_path = output_path + "/" + donefile_name + xbox_str = self._get_xbox_str(output_path, day, model_path, \ + xbox_base_key, data_path, hadoop_fs_name, monitor_data={}, + mode=mode) + + if self._afs.is_exist(donefile_path): + self.rank0_info("exist %s succeed" % (donefile_path)) + self._afs.download(donefile_path, donefile_name) + pre_content = "" + with open(donefile_name, "r") as f: + pre_content = f.read() + last_dict = json.loads(pre_content.strip().split("\n")[-1]) + last_day = last_dict["input"].split("/")[-3] + last_pass = last_dict["input"].split("/")[-2].split("-")[-1] + + os.remove(donefile_name) + self.rank0_info("remove %s succeed" % (donefile_name)) + exist = False + if int(day) < int(last_day) or \ + int(day) == int(last_day) and \ + int(pass_id) <= int(last_pass): + exist = True + if not exist: + with open(donefile_name, "w") as f: + f.write(pre_content.strip() + "\n") + f.write(xbox_str + "\n") + self._afs.delete(donefile_path) + self._afs.upload(donefile_name, donefile_path) + self.rank0_info("write %s/%s %s succeed" % \ + (day, pass_id, donefile_name)) + else: + self.rank0_info("not write %s because %s/%s already " + "exists" % (donefile_name, day, pass_id)) + else: + with open(donefile_name, "w") as f: + f.write(xbox_str + "\n") + self._afs.upload(donefile_name, donefile_path) + self.rank0_error("write %s/%s %s succeed" % \ + (day, pass_id, donefile_name)) + + def write_cache_donefile(self, + output_path, + day, + pass_id, + key_num, + donefile_name="sparse_cache.meta", + **kwargs): + """ + write cache donefile + + Args: + output_path(str): output path + day(str|int): training day of model + pass_id(str|int): training pass id of model + key_num(str|int): save cache return value + donefile_name(str): donefile name, default is "sparse_cache.meta" + kwargs(dict): user defined properties + file_num(int): cache file num + table_id(int): cache table id + + Examples: + .. 
code-block:: python + + from paddle.fluid.incubate.fleet.utils.fleet_util import GPUPSUtil + from paddle.distributed.fleet.utils.fs import AFSClient + hdfs_client = AFSClient() + fleet_util = GPUPSUtil() + fleet_util.set_fsclient(hdfs_client) + fleet_util.write_cache_donefile( + output_path="hdfs:/my/output/", + day=20190722, + pass_id=1, + key_num=123456) + + """ + day = str(day) + pass_id = str(pass_id) + key_num = int(key_num) + file_num = kwargs.get("file_num", 16) + table_id = kwargs.get("table_id", 0) + + if pass_id != "-1": + suffix_name = "/%s/delta-%s/%03d_cache" % (day, pass_id, table_id) + model_path = output_path.rstrip("/") + suffix_name + else: + suffix_name = "/%s/base/%03d_cache" % (day, table_id) + model_path = output_path.rstrip("/") + suffix_name + + if fleet.worker_index() == 0: + donefile_path = model_path + "/" + donefile_name + + if self._afs.is_file(donefile_path): + self.rank0_error( \ + "not write because %s already exists" % donefile_path) + else: + meta_str = "file_prefix:part\npart_num:%s\nkey_num:%d\n" \ + % (file_num, key_num) + with open(donefile_name, "w") as f: + f.write(meta_str) + self._afs.upload(donefile_name, donefile_path) + self.rank0_error("write %s succeed" % donefile_path) + + def _get_xbox_str(self, + output_path, + day, + model_path, + xbox_base_key, + data_path, + hadoop_fs_name, + monitor_data={}, + mode="patch"): + xbox_dict = collections.OrderedDict() + if mode == "base": + xbox_dict["id"] = str(xbox_base_key) + elif mode == "patch": + xbox_dict["id"] = str(int(time.time())) + else: + print("warning: unknown mode %s, set it to patch" % mode) + mode = "patch" + xbox_dict["id"] = str(int(time.time())) + xbox_dict["key"] = str(xbox_base_key) + if model_path.startswith("hdfs:") or model_path.startswith("afs:"): + model_path = model_path[model_path.find(":") + 1:] + xbox_dict["input"] = hadoop_fs_name + model_path.rstrip("/") + "/000" + xbox_dict["record_count"] = "111111" + xbox_dict["partition_type"] = "2" + xbox_dict["job_name"] = "default_job_name" + xbox_dict["ins_tag"] = "feasign" + xbox_dict["ins_path"] = data_path + xbox_dict["job_id"] = os.environ.get("PADDLE_JOB_ID", "") + # currently hard code here, set monitor_data empty string + xbox_dict["monitor_data"] = "" + xbox_dict["monitor_path"] = output_path.rstrip("/") + "/monitor/" \ + + day + ".txt" + xbox_dict["mpi_size"] = str(fleet.worker_num()) + return json.dumps(xbox_dict) From 482e5b6cd10171a54b5470cf288fce314d65480a Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 15 Apr 2022 14:12:30 +0800 Subject: [PATCH 181/211] update (#41762) --- .../collective/ProcessGroupHeter.cc | 44 +++++++------------ .../distributed/ps/service/heter_client.cc | 5 +-- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index a48bda06323be..354a8e23ae41f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -105,11 +105,12 @@ std::shared_ptr ProcessGroupHeter::AllReduce( if (local_rank_ == 0) { std::vector cpu_tensors; cpu_tensors.reserve(in_tensors.size()); + phi::DenseTensor cpu_tensor; for (size_t i = 0; i < in_tensors.size(); i++) { auto gpu_tensor = in_tensors[i]; - auto cpu_tensor = cpu_tensors[i]; cpu_tensor.Resize(gpu_tensor.dims()); framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); + cpu_tensors.push_back(cpu_tensor); } // Step3: do inter cluster allreduce if 
(with_switch_) { @@ -125,37 +126,32 @@ std::shared_ptr ProcessGroupHeter::AllReduce( framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Send to the switch module error.")); - phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_cpu_tensor.dtype(), dense_cpu_tensor.dims()); - std::shared_ptr dense_cpu_tensor2 = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor.dims()); + phi::DenseTensor cpu_tensor2; + cpu_tensor2.AllocateFrom( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + dense_cpu_tensor.dtype(), dense_cpu_tensor.numel()); ret = client_->Recv( - gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor2->data(), - dense_cpu_tensor2->numel() * - framework::DataTypeSize(dense_cpu_tensor2->dtype())); + gid_, {dense_cpu_tensor.name()}, cpu_tensor2.data(), + cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Recv from the switch module error.")); switch (dense_cpu_tensor.dtype()) { case DataType::FLOAT32: _do_add(reinterpret_cast(dense_cpu_tensor.data()), - reinterpret_cast(dense_cpu_tensor2->data()), + reinterpret_cast(cpu_tensor2.data()), dense_cpu_tensor.numel()); break; case DataType::FLOAT64: - _do_add( - reinterpret_cast(dense_cpu_tensor.data()), - reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor.numel()); + _do_add(reinterpret_cast(dense_cpu_tensor.data()), + reinterpret_cast(cpu_tensor2.data()), + dense_cpu_tensor.numel()); break; case DataType::INT32: _do_add(reinterpret_cast(dense_cpu_tensor.data()), - reinterpret_cast(dense_cpu_tensor2->data()), + reinterpret_cast(cpu_tensor2.data()), dense_cpu_tensor.numel()); break; default: @@ -207,9 +203,10 @@ std::shared_ptr ProcessGroupHeter::Broadcast( cpu_tensors.reserve(in_tensors.size()); for (size_t i = 0; i < in_tensors.size(); i++) { auto gpu_tensor = in_tensors[i]; - auto cpu_tensor = cpu_tensors[i]; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(gpu_tensor.dims()); framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); + cpu_tensors.push_back(cpu_tensor); } if (with_switch_) { if (local_rank_ == 0) { @@ -234,13 +231,6 @@ std::shared_ptr ProcessGroupHeter::Broadcast( PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Receive from the switch module error.")); - ret = client_->Recv( - gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), - dense_cpu_tensor.numel() * - framework::DataTypeSize(dense_cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, - platform::errors::PreconditionNotMet( - "Receive from the switch module error.")); } } } else { diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 4ca25dac826f0..16c1ff764dc3c 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -286,8 +286,7 @@ int HeterClient::Send(int group_id, const std::vector& var_names, request.add_vars_len(var_len); } auto& request_buffer = closure->cntl.request_attachment(); - request_buffer.append(reinterpret_cast(data_ptr), - data_size * sizeof(float)); + request_buffer.append(reinterpret_cast(data_ptr), data_size); auto promise = std::make_shared>(); closure->add_promise(promise); std::future fut = promise->get_future(); @@ -387,7 +386,7 @@ int HeterClient::Recv(int group_id, const std::vector& var_names, if 
(xpu_channels_.size() < 2) { LOG(ERROR) << "xpu_channels_ is null"; } - recv_switch_channels_.push_back(xpu_channels_[1]); + recv_switch_channels_.push_back(xpu_channels_[0]); } brpc::Channel* channel = recv_switch_channels_[0].get(); ::paddle::distributed::PsService_Stub stub(channel); From ff818c773792d8b367f5291457e3612acbdf118a Mon Sep 17 00:00:00 2001 From: TTerror Date: Fri, 15 Apr 2022 15:11:57 +0800 Subject: [PATCH 182/211] add fp16 for masked_select on kunlun, *test=kunlun (#41215) --- paddle/fluid/operators/masked_select_op_xpu.cc | 8 ++++++-- paddle/fluid/platform/device/xpu/xpu2_op_list.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index 00248165a511d..3845046825355 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -19,13 +19,15 @@ namespace operators { template class MaskedSelectXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto input = context.Input("X"); auto mask = context.Input("Mask"); auto out = context.Output("Y"); auto* mask_data = mask->data(); - auto* input_data = input->data(); + auto* input_data = reinterpret_cast(input->data()); auto input_dim = input->dims(); auto mask_dim = mask->dims(); PADDLE_ENFORCE_EQ( @@ -51,7 +53,8 @@ class MaskedSelectXPUKernel : public framework::OpKernel { framework::DDim out_dim{out_size_cpu}; out->Resize(out_dim); - auto out_data = out->mutable_data(context.GetPlace()); + auto out_data = + reinterpret_cast(out->mutable_data(context.GetPlace())); auto input_shape = phi::vectorize(input_dim); auto mask_shape = phi::vectorize(mask_dim); @@ -69,6 +72,7 @@ class MaskedSelectXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL(masked_select, ops::MaskedSelectXPUKernel, + ops::MaskedSelectXPUKernel, ops::MaskedSelectXPUKernel, ops::MaskedSelectXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 750a389940c65..6f4826bd8c39a 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -243,6 +243,7 @@ XPUOpMap& get_kl2_ops() { {"masked_select", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_v2_grad", From e6fb65995df479448b216eb59e9c366d32ddcc64 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Fri, 15 Apr 2022 15:16:39 +0800 Subject: [PATCH 183/211] [Dygraph] Refactor Model Parallel in eager mode (#41761) * refactor mp in eager mode * update * update * add uts --- .../fluid/operators/class_center_sample_op.cu | 43 ++++-- .../operators/collective/c_allreduce_op.h | 41 +++++- .../operators/collective/c_concat_op.cu.cc | 47 ++++-- .../c_softmax_with_cross_entropy_op.cu | 139 ++++++++++++++++++ .../c_softmax_with_cross_entropy_op.h | 12 ++ python/paddle/distributed/collective.py | 28 +++- python/paddle/fluid/dygraph/parallel.py | 2 +- python/paddle/fluid/layers/tensor.py | 4 +- .../test_parallel_class_center_sample.py | 4 + 
.../test_parallel_dygraph_dataparallel.py | 18 +-- .../test_parallel_dygraph_mp_layers.py | 3 + .../test_parallel_dygraph_tensor_parallel.py | 9 +- 12 files changed, 305 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 92bf20d6cf95d..1064c77cc0041 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -27,8 +27,10 @@ namespace cub = hipcub; #include #include #include "paddle/fluid/operators/class_center_sample_op.h" +#include "paddle/phi/api/include/tensor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -328,19 +330,34 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - const auto& comm = - platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); - // use global calculate stream - const auto calcu_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) - ->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - num_classes_per_device_ptr, num_classes_per_device_ptr, - num_classes_per_device.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(num_classes_per_device.dtype())), - ncclSum, comm->comm(), calcu_stream)); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(num_classes_per_device); + out_tensor.push_back(num_classes_per_device); + + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + const auto& comm = + platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) + ->stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType( + framework::TransToProtoVarType(num_classes_per_device.dtype())), + ncclSum, comm->comm(), calcu_stream)); + } } #endif diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2c4e85400ca4a..404f7c017ac41 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -16,12 +16,14 @@ limitations under the License. 
*/ #include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/api/include/tensor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \ @@ -351,6 +353,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); + int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); ncclDataType_t dtype = @@ -360,7 +363,43 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { out->Resize(in->dims()); void* recvbuff = out->mutable_data(place); - int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*in); + out_tensor.push_back(*out); + + distributed::AllreduceOptions opts; + switch (red_type) { + case kRedSum: + opts.reduce_op = distributed::ReduceOp::SUM; + break; + + case kRedMax: + opts.reduce_op = distributed::ReduceOp::MAX; + break; + + case kRedMin: + opts.reduce_op = distributed::ReduceOp::MIN; + break; + + case kRedProd: + opts.reduce_op = distributed::ReduceOp::PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + return; + } + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); gpuStream_t stream = nullptr; diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index db5a2317a2d81..d3d9db0e5f87e 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_concat_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/api/include/tensor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -55,26 +57,39 @@ class CConcatOpCUDAKernel : public framework::OpKernel { rank, nranks)); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_EQ( - nranks, comm->nranks(), - platform::errors::InvalidArgument("nranks: %s should equal to %s", - nranks, comm->nranks())); - framework::Tensor temp_out; framework::DDim temp_out_dims = x->dims(); temp_out_dims[0] *= nranks; temp_out.mutable_data(temp_out_dims, place); - int64_t send_numel = x->numel(); - const T* send_buff = x->data(); - T* recv_buff = temp_out.data(); - gpuStream_t stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - send_buff, recv_buff, send_numel, static_cast(dtype), - comm->comm(), stream)); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*x); + out_tensor.push_back(temp_out); + auto task = pg->AllGather(in_tensor, out_tensor); + task->Wait(); + } else { + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + } std::vector inputs; int axis = x->dims().size() - 1; diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index b5beb770909b5..4c9fb14842489 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { @@ -73,6 +74,21 @@ template class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + CSoftmaxWithCrossEntropyProcessGroupFunctor functor_; + functor_(ctx); + } else { + CSoftmaxWithCrossEntropyFunctor functor_; + functor_(ctx); + } + } +}; + +template +struct CSoftmaxWithCrossEntropyFunctor { + void operator()(const framework::ExecutionContext& ctx) { const Tensor* logits = ctx.Input("Logits"); const Tensor* labels = ctx.Input("Label"); Tensor* softmax = ctx.Output("Softmax"); @@ -201,6 +217,129 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { } }; +template +struct CSoftmaxWithCrossEntropyProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx) { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const auto& place = ctx.GetPlace(); + auto& dev_ctx = ctx.template device_context(); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + distributed::ProcessGroup* pg = map->get(rid); + distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + + // allocate memory on device. 
+ softmax->mutable_data(place); + loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); + + Tensor logits_2d, softmax_2d, loss_2d; + logits_2d.ShareDataWith(*logits).Resize({N, D}); + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + loss_2d.ShareDataWith(*loss).Resize({N, 1}); + + auto eigen_logits = math::EigenMatrix::From(logits_2d); + auto eigen_softmax = math::EigenMatrix::From(softmax_2d); + + // step 1, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + + auto eigen_logits_max = math::EigenMatrix::From(logits_max); + Eigen::DSizes along_axis(1); + eigen_logits_max.device(*dev_ctx.eigen_device()) = + eigen_logits.maximum(along_axis); + + std::vector in_out; + in_out.push_back(logits_max); + pg->AllReduce(in_out, in_out, opts)->Synchronize(); + + // step 2, obtain logit - logit_max + Eigen::DSizes batch_by_one(N, 1); + Eigen::DSizes one_by_class(1, D); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_logits - + eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class)) + .unaryExpr(math::ValueClip()); + + // step 3, obtain predict target + Tensor predicted_logits; + predicted_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + predicted_logits.mutable_data(place); + + auto t = framework::EigenVector::Flatten(predicted_logits); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const int start_index = rank * D; + const int end_index = start_index + D; + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = framework::TransToProtoVarType(labels->dtype()); + + if (label_type == framework::proto::VarType::INT32) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, D, nranks); + } else if (label_type == framework::proto::VarType::INT64) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, D, nranks); + } + + in_out.clear(); + in_out.push_back(predicted_logits); + pg->AllReduce(in_out, in_out, opts)->Synchronize(); + + // step 4, obtain exp(logit) + eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); + + // step 5, obtain sum_exp_logits + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + + auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); + eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = + eigen_softmax.sum(along_axis); + + in_out.clear(); + in_out.push_back(sum_exp_logits); + pg->AllReduce(in_out, in_out, opts)->Synchronize(); + + auto eigen_loss = math::EigenMatrix::From(loss_2d); + auto eigen_predicted_logits = math::EigenMatrix::From(predicted_logits); + + eigen_loss.device(*dev_ctx.eigen_device()) = + (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue()) - + eigen_predicted_logits) + .unaryExpr(math::TolerableValue()); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_softmax * + eigen_sum_exp_logits.inverse().broadcast(one_by_class)); + } +}; + template class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h 
b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index f5399e3215d58..0336d565de2bf 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -18,11 +18,13 @@ limitations under the License. */ #include #include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -36,5 +38,15 @@ class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel { } }; +template +struct CSoftmaxWithCrossEntropyFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + +template +struct CSoftmaxWithCrossEntropyProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + } // namespace operators } // namespace paddle diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index d2bed171aa27a..993b45b4eecf9 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -20,6 +20,7 @@ from ..fluid.framework import in_dygraph_mode from ..fluid.framework import OpProtoHolder from ..fluid.framework import _non_static_mode +from ..fluid.framework import _in_legacy_dygraph from ..fluid.framework import convert_np_dtype_to_dtype_ from ..fluid.framework import _varbase_creator from ..fluid.data_feeder import convert_dtype @@ -1132,13 +1133,36 @@ def _mp_allreduce(tensor, group=None, use_calc_stream=True, use_model_parallel=True): - """[it is same as allreduce above, but it suuports model parallel. And it support inplace startegy] + """[it is same as allreduce above, but it supports model parallel. 
And it support inplace startegy] """ if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - if _non_static_mode(): + if in_dygraph_mode(): + assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) + + from paddle.autograd import EagerPyLayer + + class mp_allreduce_eager(EagerPyLayer): + @staticmethod + def forward(ctx, tensor, use_calc_stream, ring_id, + use_model_parallel): + ctx.ring_id = ring_id + return _C_ops.c_allreduce_sum_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", use_model_parallel) + + @staticmethod + def backward(ctx, dy): + return _C_ops.c_identity(dy, 'use_calc_stream', True, 'ring_id', + ctx.ring_id, 'use_model_parallel', + True) + + return mp_allreduce_eager.apply(tensor, use_calc_stream, ring_id, + use_model_parallel) + + elif _in_legacy_dygraph(): if op == ReduceOp.SUM: return _C_ops.c_allreduce_sum_( tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index e6b891cbe00bf..fe1b56931f89d 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -378,7 +378,7 @@ def sync_params_buffers(model, param.name) # is_distributed param not need to sync when in mp mode - if isinstance(param, ParamBase): + if isinstance(param, (ParamBase, core.eager.Tensor)): if is_model_parallel and param.is_distributed: continue diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 5fa110e4db0fa..693fbf20e64a8 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -329,7 +329,9 @@ def concat(input, axis=0, name=None): axis = axis.item(0) if not isinstance(input, Variable): input = [t for t in input if t.shape.count(0) == 0] - return _C_ops.final_state_concat(input, axis) + out = _varbase_creator() + _C_ops.concat(input, out, 'axis', axis) + return out if _in_legacy_dygraph(): if isinstance(axis, Variable): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py index 19fc617ea25cd..e2a526110f18a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py @@ -14,16 +14,20 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid from test_parallel_dygraph_dataparallel import TestMultipleGpus +from paddle.fluid.framework import _test_eager_guard class TestParallelClassCenterSample(TestMultipleGpus): def test_parallel_class_center_sample(self): self.run_mnist_2gpu('parallel_class_center_sample.py') + self.run_mnist_2gpu('parallel_class_center_sample.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 59013236967db..8145e880a650e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -100,6 +100,7 @@ def start_local_trainers(cluster, pod, training_script, training_script_args, + eager_mode=True, log_dir=None): current_env = copy.copy(os.environ.copy()) #paddle broadcast ncclUniqueId use socket, 
and @@ -119,6 +120,9 @@ def start_local_trainers(cluster, "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + if not eager_mode: + proc_env["FLAGS_enable_eager_mode"] = "%d" % 0 + current_env.update(proc_env) print("trainer proc env:{}".format(current_env)) @@ -145,15 +149,8 @@ def start_local_trainers(cluster, return procs -def get_dist_port_from_flags(): - DIST_UT_PORT = 6175 - if os.getenv("PADDLE_DIST_UT_PORT"): - DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT")) - return DIST_UT_PORT - - class TestMultipleGpus(unittest.TestCase): - def run_mnist_2gpu(self, target_file_name): + def run_mnist_2gpu(self, target_file_name, eager_mode=True): if not fluid.core.is_compiled_with_cuda( ) or fluid.core.get_cuda_device_count() == 0: return @@ -167,6 +164,7 @@ def run_mnist_2gpu(self, target_file_name): procs = start_local_trainers( cluster, pod, + eager_mode=eager_mode, training_script=target_file_name, training_script_args=[]) @@ -206,9 +204,9 @@ def test_multiple_gpus_dynamic(self): class TestDataParallelWithPyLayer(TestMultipleGpus): def test_parallel_dygraph_dataparallel_with_pylayer(self): - with _test_eager_guard(): - self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') + self.run_mnist_2gpu( + 'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False) class TestGradientCheckInEagerMode(TestMultipleGpus): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py index e0a2770852b63..da8df19a1e649 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -23,7 +24,9 @@ class TestModelParallelLayer(TestMultipleGpus): def test_hybrid_parallel_mp_layer(self): self.run_mnist_2gpu('hybrid_parallel_mp_layers.py') + self.run_mnist_2gpu('hybrid_parallel_mp_layers.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py index 3705deb5ad856..fda6dc06309c5 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -22,20 +23,26 @@ class TestHybridParallel(TestMultipleGpus): def test_hybrid_parallel_mp_random(self): - self.run_mnist_2gpu('hybrid_parallel_mp_random.py') + # self.run_mnist_2gpu('hybrid_parallel_mp_random.py') + self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False) def test_hybrid_parallel_mp_model(self): self.run_mnist_2gpu('hybrid_parallel_mp_model.py') + self.run_mnist_2gpu('hybrid_parallel_mp_model.py', eager_mode=False) def test_hybrid_parallel_mp_amp(self): self.run_mnist_2gpu('hybrid_parallel_mp_amp.py') + self.run_mnist_2gpu('hybrid_parallel_mp_amp.py', eager_mode=False) def test_hybrid_parallel_mp_fp16(self): self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py') + self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py', eager_mode=False) def test_hybrid_parallel_mp_clip_grad(self): 
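        # Runs the script twice: once in eager mode (the default) and once with
        # eager_mode=False, which makes start_local_trainers export
        # FLAGS_enable_eager_mode=0 so the spawned trainers fall back to legacy
        # dygraph.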
self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py') + self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() From 27f28e82c93a002a4cc382787316f5fd5d97997f Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 15 Apr 2022 15:40:51 +0800 Subject: [PATCH 184/211] [DoubleGrad] Enabled test_imperative_star_gan_with_gradient_penalty.py under eager mode (#41730) * [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad * Fixed elementwise issue * Addressed CI failures * [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode * [DoubleGrad] Enabled test_autograd_functional_dynamic.py under eager mode * Enabled more test cases * [DoubleGrad] Enabled test_imperative_star_gan_with_gradient_penalty.py under eager mode * Adjusted test_imperative_star_gan_with_gradient_penalty.py --- .../final_state_generator/codegen_utils.py | 1 + paddle/fluid/eager/tensor_wrapper.h | 1 - paddle/phi/kernels/conv_grad_grad_kernel.h | 10 ++--- .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 10 ++--- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 30 ++++++------- .../kernels/impl/conv_grad_grad_kernel_impl.h | 10 ++--- paddle/phi/ops/compat/conv2d_sig.cc | 4 +- ...perative_star_gan_with_gradient_penalty.py | 43 ++++++++++++++++++- python/paddle/nn/functional/activation.py | 5 ++- python/paddle/utils/code_gen/backward.yaml | 26 ++++++++++- 10 files changed, 103 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 96af7dfc4fe65..ab8c28c33e78c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -32,6 +32,7 @@ "add_triple_grad", "multiply_double_grad", "multiply_triple_grad", + "conv2d_grad_grad", ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index e42e04a266b46..405105771b9b1 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -118,7 +118,6 @@ class TensorWrapper { p_ab_autograd_meta->SetGradNode(new_grad_node); } recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - return recovered_tensor; } } diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h index 339f1c00eaa50..0a359d778a681 100644 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -20,11 +20,11 @@ namespace phi { template void ConvGradGradKernel(const Context& dev_ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -34,9 +34,9 @@ void ConvGradGradKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad); + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); template void Conv3DGradGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc 
b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index f157bb017f81c..4966c998dd37d 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -39,11 +39,11 @@ void Conv3DGradGradKernel(const Context& ctx, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvGradGradKernel(ctx, - input_grad_grad, - filter_grad_grad, - out_grad, input, filter, + out_grad, + input_grad_grad, + filter_grad_grad, strides, paddings_t, padding_algorithm, @@ -53,9 +53,9 @@ void Conv3DGradGradKernel(const Context& ctx, use_addto, workspace_size_MB, exhaustive_search_t, - out_grad_grad, input_grad, - filter_grad); + filter_grad, + out_grad_grad); } } // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 74525e63f476b..58c7ea69869b3 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -44,11 +44,11 @@ namespace phi { template void ConvCudnnGradGradKernel( const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -58,9 +58,9 @@ void ConvCudnnGradGradKernel( bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { auto X = &input; auto W = &filter; auto dO = &out_grad; @@ -689,11 +689,11 @@ void DepthwiseConvCudnnGradGradKernel( DenseTensor* input_grad, DenseTensor* filter_grad) { ConvCudnnGradGradKernel(ctx, - input_grad_grad, - filter_grad_grad, - out_grad, input, filter, + out_grad, + input_grad_grad, + filter_grad_grad, strides, paddings_t, padding_algorithm, @@ -703,9 +703,9 @@ void DepthwiseConvCudnnGradGradKernel( use_addto, workspace_size_MB, exhaustive_search_t, - out_grad_grad, input_grad, - filter_grad); + filter_grad, + out_grad_grad); } template @@ -729,11 +729,11 @@ void Conv3DCudnnGradGradKernel( DenseTensor* input_grad, DenseTensor* filter_grad) { ConvCudnnGradGradKernel(ctx, - input_grad_grad, - filter_grad_grad, - out_grad, input, filter, + out_grad, + input_grad_grad, + filter_grad_grad, strides, paddings_t, padding_algorithm, @@ -743,9 +743,9 @@ void Conv3DCudnnGradGradKernel( use_addto, workspace_size_MB, exhaustive_search_t, - out_grad_grad, input_grad, - filter_grad); + filter_grad, + out_grad_grad); } } // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h index bc0ed44e17a33..64306bc827e4b 100644 --- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -26,11 +26,11 @@ namespace phi { template void ConvGradGradKernel(const Context& dev_ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -40,9 +40,9 @@ void ConvGradGradKernel(const Context& dev_ctx, bool 
use_addto, int workspace_size_MB, bool exhaustive_search, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { const DenseTensor* X = &input; const DenseTensor* dY = &out_grad; const DenseTensor* ddX = input_grad_grad.get_ptr(); diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 19e20fddcb811..7cc0d6ad17535 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -62,7 +62,7 @@ KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv2dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv2d_grad_grad", - {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", @@ -72,7 +72,7 @@ KernelSignature Conv2dDoubleGradOpArgumentMapping( "use_addto", "workspace_size_MB", "exhaustive_search"}, - {"DDOutput", "DInput", "DFilter"}); + {"DInput", "DFilter", "DDOutput"}); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index be81c15677a3a..4e542fb13cd76 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -590,6 +590,46 @@ def func_main(self): if fluid.is_compiled_with_cuda(): self.place_test(fluid.CUDAPlace(0)) + def place_test(self, place): + cfg = Config(place, False) + + dataset = create_mnist_dataset(cfg) + dataset = paddle.reader.cache(dataset) + + fluid_dygraph_loss = [] + with fluid.dygraph.guard(cfg.place): + fluid_dygraph_model = DyGraphTrainModel(cfg) + for batch_id, (image_real, label_org, + label_trg) in enumerate(dataset()): + loss = fluid_dygraph_model.run(image_real, label_org, label_trg) + fluid_dygraph_loss.append(loss) + + eager_dygraph_loss = [] + with _test_eager_guard(): + with fluid.dygraph.guard(cfg.place): + eager_dygraph_model = DyGraphTrainModel(cfg) + for batch_id, (image_real, label_org, + label_trg) in enumerate(dataset()): + loss = eager_dygraph_model.run(image_real, label_org, + label_trg) + eager_dygraph_loss.append(loss) + + for (g_loss_f, d_loss_f), (g_loss_e, d_loss_e) in zip( + fluid_dygraph_loss, eager_dygraph_loss): + self.assertEqual(g_loss_f, g_loss_e) + self.assertEqual(d_loss_f, d_loss_e) + + def test_all_cases(self): + self.func_main() + + +class TestStarGANWithGradientPenaltyLegacy(unittest.TestCase): + def func_main(self): + self.place_test(fluid.CPUPlace()) + + if fluid.is_compiled_with_cuda(): + self.place_test(fluid.CUDAPlace(0)) + def place_test(self, place): cfg = Config(place) @@ -617,8 +657,7 @@ def place_test(self, place): self.assertEqual(d_loss_s, d_loss_d) def test_all_cases(self): - if _in_legacy_dygraph(): - self.func_main() + self.func_main() if __name__ == '__main__': diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index a0efdaac8ff7c..34acbfbf75463 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -434,7 +434,10 @@ def leaky_relu(x, negative_slope=0.01, name=None): out = F.leaky_relu(x) # [-0.02, 0., 1.] 
""" - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_leaky_relu(x, negative_slope) + + if _in_legacy_dygraph(): return _C_ops.leaky_relu(x, 'alpha', negative_slope) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index a7b29b9f5aefc..64acc140c2117 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -261,6 +261,19 @@ args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : conv2d_grad_grad + +- backward_api : conv2d_grad_grad + forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv2d_grad_grad + use_cudnn : true + optional : grad_input_grad, grad_filter_grad - backward_api : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -366,7 +379,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : determinant_grad + func : determinant_grad - backward_api : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) @@ -755,6 +768,16 @@ data_type : out_grad optional : scale, bias +- backward_api : leaky_relu_double_grad + forward : leaky_relu_grad (Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, float alpha) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_x_grad] + kernel : + func : leaky_relu_double_grad + - backward_api : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out_grad, float alpha) @@ -764,6 +787,7 @@ param : [x] kernel : func : leaky_relu_grad + backward : leaky_relu_double_grad - backward_api : lerp_grad forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) From 1927aff98b8a4feb3e408750b883e3a250d2d7dd Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 15 Apr 2022 16:57:28 +0800 Subject: [PATCH 185/211] [Phi]Reduce kernels into multiply files (#41747) * split reduce_kernel * rm reduce_kernel in cmake * split reduce_grad kernels * fix cmake build error * format code * fix standalone_executor_test error --- .../new_executor/standalone_executor_test.cc | 1 + paddle/phi/kernels/CMakeLists.txt | 4 +- .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 3 +- 
paddle/phi/kernels/cpu/reduce_all_kernel.cc | 37 ++++ paddle/phi/kernels/cpu/reduce_any_kernel.cc | 37 ++++ paddle/phi/kernels/cpu/reduce_kernel.cc | 145 --------------- .../phi/kernels/cpu/reduce_max_grad_kernel.cc | 28 +++ paddle/phi/kernels/cpu/reduce_max_kernel.cc | 39 +++++ .../kernels/cpu/reduce_mean_grad_kernel.cc | 43 +++++ paddle/phi/kernels/cpu/reduce_mean_kernel.cc | 39 +++++ .../phi/kernels/cpu/reduce_min_grad_kernel.cc | 28 +++ paddle/phi/kernels/cpu/reduce_min_kernel.cc | 39 +++++ .../kernels/cpu/reduce_prod_grad_kernel.cc | 28 +++ paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 45 +++++ ...ad_kernel.cc => reduce_sum_grad_kernel.cc} | 51 +----- paddle/phi/kernels/cpu/reduce_sum_kernel.cc | 55 ++++++ .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 3 +- paddle/phi/kernels/gpu/reduce_all_kernel.cu | 36 ++++ paddle/phi/kernels/gpu/reduce_any_kernel.cu | 36 ++++ paddle/phi/kernels/gpu/reduce_kernel.cu | 158 ----------------- .../phi/kernels/gpu/reduce_max_grad_kernel.cu | 28 +++ paddle/phi/kernels/gpu/reduce_max_kernel.cu | 2 +- .../kernels/gpu/reduce_mean_grad_kernel.cu | 45 +++++ paddle/phi/kernels/gpu/reduce_mean_kernel.cu | 47 +++++ .../phi/kernels/gpu/reduce_min_grad_kernel.cu | 28 +++ paddle/phi/kernels/gpu/reduce_min_kernel.cu | 37 ++++ .../kernels/gpu/reduce_prod_grad_kernel.cu | 28 +++ paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 +++++ ...ad_kernel.cu => reduce_sum_grad_kernel.cu} | 52 +----- paddle/phi/kernels/gpu/reduce_sum_kernel.cu | 56 ++++++ .../impl/reduce_max_grad_kernel_impl.h | 2 +- .../impl/reduce_min_grad_kernel_impl.h | 2 +- .../impl/reduce_prod_grad_kernel_impl.h | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 38 ++++ paddle/phi/kernels/reduce_all_kernel.h | 36 ++++ paddle/phi/kernels/reduce_any_kernel.cc | 38 ++++ paddle/phi/kernels/reduce_any_kernel.h | 35 ++++ paddle/phi/kernels/reduce_grad_kernel.h | 69 -------- paddle/phi/kernels/reduce_kernel.cc | 165 ------------------ paddle/phi/kernels/reduce_kernel.h | 153 ---------------- paddle/phi/kernels/reduce_max_grad_kernel.h | 31 ++++ paddle/phi/kernels/reduce_max_kernel.cc | 40 +++++ paddle/phi/kernels/reduce_max_kernel.h | 35 ++++ paddle/phi/kernels/reduce_mean_grad_kernel.h | 30 ++++ paddle/phi/kernels/reduce_mean_kernel.cc | 48 +++++ paddle/phi/kernels/reduce_mean_kernel.h | 49 ++++++ paddle/phi/kernels/reduce_min_grad_kernel.h | 31 ++++ paddle/phi/kernels/reduce_min_kernel.cc | 40 +++++ paddle/phi/kernels/reduce_min_kernel.h | 36 ++++ paddle/phi/kernels/reduce_prod_grad_kernel.h | 30 ++++ paddle/phi/kernels/reduce_prod_kernel.cc | 40 +++++ paddle/phi/kernels/reduce_prod_kernel.h | 44 +++-- paddle/phi/kernels/reduce_sum_grad_kernel.h | 30 ++++ paddle/phi/kernels/reduce_sum_kernel.cc | 71 ++++++++ paddle/phi/kernels/reduce_sum_kernel.h | 51 ++++++ paddle/phi/tests/kernels/test_mean_dev_api.cc | 2 +- paddle/phi/tests/kernels/test_sum_dev_api.cc | 2 +- 57 files changed, 1551 insertions(+), 820 deletions(-) create mode 100644 paddle/phi/kernels/cpu/reduce_all_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_any_kernel.cc delete mode 100644 paddle/phi/kernels/cpu/reduce_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_max_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_mean_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_min_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_min_kernel.cc create mode 100644 
paddle/phi/kernels/cpu/reduce_prod_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename paddle/phi/kernels/cpu/{reduce_grad_kernel.cc => reduce_sum_grad_kernel.cc} (68%) create mode 100644 paddle/phi/kernels/cpu/reduce_sum_kernel.cc create mode 100644 paddle/phi/kernels/gpu/reduce_all_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_any_kernel.cu delete mode 100644 paddle/phi/kernels/gpu/reduce_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_mean_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_min_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename paddle/phi/kernels/gpu/{reduce_grad_kernel.cu => reduce_sum_grad_kernel.cu} (51%) create mode 100644 paddle/phi/kernels/gpu/reduce_sum_kernel.cu create mode 100644 paddle/phi/kernels/reduce_all_kernel.cc create mode 100644 paddle/phi/kernels/reduce_all_kernel.h create mode 100644 paddle/phi/kernels/reduce_any_kernel.cc create mode 100644 paddle/phi/kernels/reduce_any_kernel.h delete mode 100644 paddle/phi/kernels/reduce_grad_kernel.h delete mode 100644 paddle/phi/kernels/reduce_kernel.cc delete mode 100644 paddle/phi/kernels/reduce_kernel.h create mode 100644 paddle/phi/kernels/reduce_max_grad_kernel.h create mode 100644 paddle/phi/kernels/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/reduce_max_kernel.h create mode 100644 paddle/phi/kernels/reduce_mean_grad_kernel.h create mode 100644 paddle/phi/kernels/reduce_mean_kernel.cc create mode 100644 paddle/phi/kernels/reduce_mean_kernel.h create mode 100644 paddle/phi/kernels/reduce_min_grad_kernel.h create mode 100644 paddle/phi/kernels/reduce_min_kernel.cc create mode 100644 paddle/phi/kernels/reduce_min_kernel.h create mode 100644 paddle/phi/kernels/reduce_prod_grad_kernel.h create mode 100644 paddle/phi/kernels/reduce_prod_kernel.cc create mode 100644 paddle/phi/kernels/reduce_sum_grad_kernel.h create mode 100644 paddle/phi/kernels/reduce_sum_kernel.cc create mode 100644 paddle/phi/kernels/reduce_sum_kernel.h diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index cbbb802b67d76..fe4b47cba6242 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -76,6 +76,7 @@ PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); #endif PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index eec83a1ed8130..a3a71ab692245 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -36,7 +36,7 @@ set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel ad matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel 
take_along_axis_grad_kernel - triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) + triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) foreach(src ${AUTOTUNE_KERNELS}) kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) endforeach() @@ -51,7 +51,7 @@ kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) +kernel_library(reduce_sum_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index ae1e406d16eec..77c763171088c 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -23,7 +23,8 @@ #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/reduce_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/reduce_all_kernel.cc new file mode 100644 index 0000000000000..3e8e38ee4447e --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_all_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_all_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc new file mode 100644 index 0000000000000..4fd71f1d0b169 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_any_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc deleted file mode 100644 index bc99e2cb39a69..0000000000000 --- a/paddle/phi/kernels/cpu/reduce_kernel.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
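// The monolithic cpu/reduce_kernel.cc removed here used to hold the CPU
// registrations for sum_raw, mean_raw, prod_raw, max_raw, min_raw, all_raw
// and any_raw in a single translation unit; this patch splits each of them
// into the dedicated reduce_*_kernel.cc files added above and below.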
- -#include "paddle/phi/kernels/reduce_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void ProdRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(sum_raw, - CPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} -PD_REGISTER_KERNEL( - mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(prod_raw, - CPU, - ALL_LAYOUT, - phi::ProdRawKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL( - max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL( - min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} -PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_grad_kernel.cc new file mode 100644 index 0000000000000..5255f06fb0cd3 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_max_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(max_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc new file mode 100644 index 0000000000000..f9ea0aa0faf06 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_max_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc new file mode 100644 index 0000000000000..77176d5d7469e --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
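// mean_grad now gets its own translation unit: the kernel below simply
// forwards to the shared ReduceGradKernel helper from impl/reduce_grad.h and
// registers the CPU mean_grad kernel for bool, float and double.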
+ +#include "paddle/phi/kernels/reduce_mean_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" +namespace phi { + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceGradKernel( + dev_ctx, x, paddle::none, out_grad, dims, keep_dim, reduce_all, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mean_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/reduce_mean_kernel.cc b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc new file mode 100644 index 0000000000000..8fa687632f653 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_mean_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_min_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_grad_kernel.cc new file mode 100644 index 0000000000000..4ba56a0a53a3a --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_min_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_min_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(min_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc new file mode 100644 index 0000000000000..0a241c81dbe69 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_min_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_grad_kernel.cc new file mode 100644 index 0000000000000..bec6deb907a0d --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(prod_grad, + CPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc new file mode 100644 index 0000000000000..d31a6e5626289 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prod_raw, + CPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc similarity index 68% rename from paddle/phi/kernels/cpu/reduce_grad_kernel.cc rename to paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index d6347386112fd..66ae5e02ffc75 100644 --- a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,9 +20,6 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/reduce_grad.h" -#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" -#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" -#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { template @@ -100,18 +97,6 @@ void ReduceSumGradKernel(const Context& dev_ctx, dev_ctx, x, paddle::none, out_grad, dims, keep_dim, reduce_all, x_grad); } -template -void ReduceMeanGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad) { - ReduceGradKernel( - dev_ctx, x, paddle::none, out_grad, dims, keep_dim, reduce_all, x_grad); -} - } // namespace phi PD_REGISTER_KERNEL(sum_grad, @@ -127,37 +112,3 @@ PD_REGISTER_KERNEL(sum_grad, phi::dtype::complex, phi::dtype::complex) {} -PD_REGISTER_KERNEL(mean_grad, - CPU, - ALL_LAYOUT, - phi::ReduceMeanGradKernel, - bool, - float, - double) {} - -PD_REGISTER_KERNEL(prod_grad, - CPU, - ALL_LAYOUT, - phi::ReduceProdGradKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL(max_grad, - CPU, - ALL_LAYOUT, - phi::ReduceMaxGradKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL(min_grad, - CPU, - ALL_LAYOUT, - phi::ReduceMinGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc new file mode 100644 index 
0000000000000..32b12ea684528 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + CPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 66ba30f7ce694..be6cdc7825575 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -28,7 +28,8 @@ #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/reduce_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/gpu/reduce_all_kernel.cu new file mode 100644 index 0000000000000..2963d3f206c2d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_all_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_all_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/gpu/reduce_any_kernel.cu new file mode 100644 index 0000000000000..39c8cbe442cbd --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_any_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu deleted file mode 100644 index fabd13d4a737c..0000000000000 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void ProdRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(sum_raw, - GPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - float16, - bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL(mean_raw, - GPU, - ALL_LAYOUT, - phi::MeanRawKernel, - float, - double, - bool, - float16, - int, - int64_t) {} - -PD_REGISTER_KERNEL(prod_raw, - GPU, - ALL_LAYOUT, - phi::ProdRawKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL( - max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL( - min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} - -PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu new file mode 100644 index 0000000000000..b4ff277b5026c --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(max_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu index ddbc08b06c84b..98c3986c51dd6 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu new file mode 100644 index 0000000000000..b81a5e50dfb3e --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_mean_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/reduce_grad.h" + +namespace phi { + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceGradKernel( + dev_ctx, x, out_grad, dims, keep_dim, reduce_all, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mean_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/reduce_mean_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_kernel.cu new file mode 100644 index 0000000000000..5a2cc8036a158 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_mean_kernel.cu @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_mean_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +PD_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + phi::MeanRawKernel, + float, + double, + bool, + float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu new file mode 100644 index 0000000000000..ea1d377c45976 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_min_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(min_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_kernel.cu new file mode 100644 index 0000000000000..ba37d54895d0d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_min_kernel.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu new file mode 100644 index 0000000000000..08444cf95d6c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(prod_grad, + GPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu new file mode 100644 index 0000000000000..db4ace1a02271 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prod_raw, + GPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu similarity index 51% rename from paddle/phi/kernels/gpu/reduce_grad_kernel.cu rename to paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 0b59ba3ad39e4..1ad6b8fefe7e4 100644 --- a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" -#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" -#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" -#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { @@ -36,18 +33,6 @@ void ReduceSumGradKernel(const Context& dev_ctx, dev_ctx, x, out_grad, dims, keep_dim, reduce_all, x_grad); } -template -void ReduceMeanGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad) { - ReduceGradKernel( - dev_ctx, x, out_grad, dims, keep_dim, reduce_all, x_grad); -} - } // namespace phi PD_REGISTER_KERNEL(sum_grad, @@ -64,38 +49,3 @@ PD_REGISTER_KERNEL(sum_grad, phi::dtype::complex, phi::dtype::complex) {} -PD_REGISTER_KERNEL(mean_grad, - GPU, - ALL_LAYOUT, - phi::ReduceMeanGradKernel, - bool, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(prod_grad, - GPU, - ALL_LAYOUT, - phi::ReduceProdGradKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL(max_grad, - GPU, - ALL_LAYOUT, - phi::ReduceMaxGradKernel, - float, - double, - int, - int64_t) {} - -PD_REGISTER_KERNEL(min_grad, - GPU, - ALL_LAYOUT, - phi::ReduceMinGradKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_sum_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_kernel.cu new file mode 100644 index 0000000000000..28bdbd009bdae --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_sum_kernel.cu @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + GPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + float16, + bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h index cf11eb4d5a498..9051ef6845966 100644 --- a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/kernels/reduce_grad_kernel.h" +#include "paddle/phi/kernels/reduce_max_grad_kernel.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/reduce_grad.h" diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h index 50d0645abcba5..53bd0b7d57f1e 100644 --- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/kernels/reduce_grad_kernel.h" +#include "paddle/phi/kernels/reduce_min_grad_kernel.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/reduce_grad.h" diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h index a81d74c845550..3bf103b0fda9c 100644 --- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/kernels/reduce_grad_kernel.h" +#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/reduce_grad.h" diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc new file mode 100644 index 0000000000000..5525f0dbfa7ed --- /dev/null +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
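// --- Editorial sketch (not part of the patch) ------------------------------
// sum_raw keeps OutputAt(0).SetDataType(UNDEFINED) in its registration,
// presumably because a sum's output dtype follows the explicit out_dtype
// argument rather than the registered input type T. A hypothetical call that
// accumulates a float16 input into a float32 output (GPUContext and tensors
// assumed to exist):
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"

void SumRawSketch(const phi::GPUContext& dev_ctx,
                  const phi::DenseTensor& x,  // float16 input
                  phi::DenseTensor* out) {
  phi::SumRawKernel<phi::dtype::float16>(
      dev_ctx, x, /*dims=*/{0}, /*keep_dim=*/false, /*reduce_all=*/false,
      /*out_dtype=*/paddle::experimental::DataType::FLOAT32, out);
}
// ----------------------------------------------------------------------------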
+
+#include "paddle/phi/kernels/reduce_all_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_all_kernel.h b/paddle/phi/kernels/reduce_all_kernel.h
new file mode 100644
index 0000000000000..af34a0a5d4c6f
--- /dev/null
+++ b/paddle/phi/kernels/reduce_all_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc
new file mode 100644
index 0000000000000..01cbcd4029c77
--- /dev/null
+++ b/paddle/phi/kernels/reduce_any_kernel.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
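// --- Editorial sketch (not part of the patch) ------------------------------
// The wrapper pattern used by each of the split-out reduce entry points:
// AllKernel above fixes reduce_all = false and defers to AllRawKernel, which
// carries the extra reduce_all flag. Assuming a CPUContext and a bool tensor
// `mask` already exist:
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/reduce_all_kernel.h"

void AllSketch(const phi::CPUContext& dev_ctx,
               const phi::DenseTensor& mask,
               phi::DenseTensor* out) {
  // Logical AND over axis 1 of a 2-D mask, keeping the reduced axis.
  phi::AllKernel<bool>(dev_ctx, mask, {1}, /*keep_dim=*/true, out);
}
// ----------------------------------------------------------------------------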
+ +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.h b/paddle/phi/kernels/reduce_any_kernel.h new file mode 100644 index 0000000000000..9514d02dbdf94 --- /dev/null +++ b/paddle/phi/kernels/reduce_any_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h deleted file mode 100644 index a88b8f92aeb7b..0000000000000 --- a/paddle/phi/kernels/reduce_grad_kernel.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/dense_tensor.h" -namespace phi { - -template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); - -template -void ReduceMeanGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); - -template -void ReduceProdGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); - -template -void ReduceMaxGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); - -template -void ReduceMinGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc deleted file mode 100644 index 7638c782d547d..0000000000000 --- a/paddle/phi/kernels/reduce_kernel.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); -} - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void ProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL( - prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL( - max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL( - min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} -PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -PD_REGISTER_KERNEL(mean, - GPU, - ALL_LAYOUT, - phi::MeanKernel, - float, - double, - bool, - int, - int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL( - prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} - -PD_REGISTER_KERNEL( - max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL( - min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} 
-PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#endif diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h deleted file mode 100644 index 69bcb47bc98ea..0000000000000 --- a/paddle/phi/kernels/reduce_kernel.h +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/unary.h" - -namespace phi { -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void ProdRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out); - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void ProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -DenseTensor Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); - MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); - return dense_out; -} - 
-template -DenseTensor Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumInferMeta(x, axis, dtype, keep_dim, &meta_out); - SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); - return dense_out; -} - -} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_grad_kernel.h b/paddle/phi/kernels/reduce_max_grad_kernel.h new file mode 100644 index 0000000000000..ef3d9f36d28de --- /dev/null +++ b/paddle/phi/kernels/reduce_max_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc new file mode 100644 index 0000000000000..a7458a3e0ac13 --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h new file mode 100644 index 0000000000000..f224f494a7229 --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_mean_grad_kernel.h b/paddle/phi/kernels/reduce_mean_grad_kernel.h new file mode 100644 index 0000000000000..ccda3160aa9e5 --- /dev/null +++ b/paddle/phi/kernels/reduce_mean_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc new file mode 100644 index 0000000000000..812cf8702e15c --- /dev/null +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/phi/kernels/reduce_mean_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out) {
+  bool reduce_all = false;
+  MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(mean,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MeanKernel,
+                   float,
+                   double,
+                   bool,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_mean_kernel.h b/paddle/phi/kernels/reduce_mean_kernel.h
new file mode 100644
index 0000000000000..2ac4bd8a46e64
--- /dev/null
+++ b/paddle/phi/kernels/reduce_mean_kernel.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out);
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+DenseTensor Mean(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const std::vector<int64_t>& axis,
+                 bool keep_dim) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
+  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
+  return dense_out;
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_min_grad_kernel.h b/paddle/phi/kernels/reduce_min_grad_kernel.h
new file mode 100644
index 0000000000000..3c6ea3a3564cf
--- /dev/null
+++ b/paddle/phi/kernels/reduce_min_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
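// --- Editorial sketch (not part of the patch) ------------------------------
// reduce_mean_kernel.h above also keeps the DenseTensor-returning phi::Mean
// helper, so other kernels can compute a mean without going through the op
// layer. Assuming a CPUContext and a float tensor `x` already exist:
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/reduce_mean_kernel.h"

phi::DenseTensor MeanAxis0Sketch(const phi::CPUContext& dev_ctx,
                                 const phi::DenseTensor& x) {
  // The helper sizes the output via SumRawInferMeta, then calls MeanKernel.
  return phi::Mean<float>(dev_ctx, x, /*axis=*/{0}, /*keep_dim=*/false);
}
// ----------------------------------------------------------------------------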
+ +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc new file mode 100644 index 0000000000000..620b5167566f2 --- /dev/null +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.h b/paddle/phi/kernels/reduce_min_kernel.h new file mode 100644 index 0000000000000..bbf3f2ab81826 --- /dev/null +++ b/paddle/phi/kernels/reduce_min_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_prod_grad_kernel.h b/paddle/phi/kernels/reduce_prod_grad_kernel.h new file mode 100644 index 0000000000000..fbf9f19a1bb82 --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/reduce_prod_kernel.cc b/paddle/phi/kernels/reduce_prod_kernel.cc new file mode 100644 index 0000000000000..5bd410709c6ba --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/reduce_prod_kernel.h index 5e92b6c4db14e..be46a554b57e1 100644 --- a/paddle/phi/kernels/reduce_prod_kernel.h +++ b/paddle/phi/kernels/reduce_prod_kernel.h @@ -1,29 +1,35 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once #include "paddle/phi/core/dense_tensor.h" namespace phi { +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/reduce_sum_grad_kernel.h new file mode 100644 index 0000000000000..b8b6618d43ec9 --- /dev/null +++ b/paddle/phi/kernels/reduce_sum_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc new file mode 100644 index 0000000000000..e2b13333d7f81 --- /dev/null +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +#endif diff --git a/paddle/phi/kernels/reduce_sum_kernel.h b/paddle/phi/kernels/reduce_sum_kernel.h new file mode 100644 index 0000000000000..c969cea296db1 --- /dev/null +++ b/paddle/phi/kernels/reduce_sum_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out); + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index ce31b2021e01a..92fc7f3c92a98 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/phi/kernels/reduce_kernel.h" +#include "paddle/phi/kernels/reduce_mean_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index 82fa90c1574bd..9e889ab4ea4f6 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/reduce_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" From ea0a164b460b55d54763959c935a6486a6617162 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 15 Apr 2022 18:01:28 +0800 Subject: [PATCH 186/211] [Yaml]add adamw yaml (#41678) * add adamw yaml * fix test case error * make the name of weight and bias in linear1 and linear2 to be constant --- paddle/phi/api/lib/api_custom_impl.cc | 193 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 21 ++ .../fluid/tests/unittests/test_adamw_op.py | 12 ++ python/paddle/optimizer/adamw.py | 26 ++- python/paddle/utils/code_gen/api.yaml | 6 + 5 files changed, 250 insertions(+), 8 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 033ec569de811..ae248a7bf1280 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -217,6 +217,199 @@ std::tuple adam_impl( ////////////////// Forward api impls ////////////////////// +std::tuple adamw_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(param); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + std::string kernel_name = "adamw"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_param = PrepareData(param, kernel.InputAt(0), {}); + auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); + auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {}); + auto input_moment1 = PrepareData(moment1, 
kernel.InputAt(3), {}); + auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); + auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); + auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); + paddle::optional input_master_param(paddle::none); + auto input_master_param_ptr = + PrepareData(master_param, kernel.InputAt(7), {}); + paddle::optional input_skip_update(paddle::none); + auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + + std::tuple api_output; + auto kernel_out_0 = input_param.get(); + auto kernel_out_1 = input_moment1.get(); + auto kernel_out_2 = input_moment2.get(); + auto kernel_out_3 = input_beta1_pow.get(); + auto kernel_out_4 = input_beta2_pow.get(); + phi::DenseTensor* kernel_out_5 = nullptr; + if (input_master_param_ptr) { + input_master_param = + paddle::make_optional(*input_master_param_ptr); + kernel_out_5 = + paddle::make_optional(*input_master_param_ptr) + .get_ptr(); + } + + if (input_skip_update_ptr) { + input_skip_update = + paddle::make_optional(*input_skip_update_ptr); + } + + paddle::optional input_meta_ref_master_param( + paddle::none); + phi::DenseTensor dt; + phi::MetaTensor input_meta_tmp_master_param(dt); + if (input_master_param_ptr) { + input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); + input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); + input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); + input_meta_ref_master_param = input_meta_tmp_master_param; + } + + paddle::optional input_meta_ref_skip_update( + paddle::none); + phi::DenseTensor dt1; + phi::MetaTensor input_meta_tmp_skip_update(dt1); + if (input_skip_update_ptr) { + input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); + input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); + input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); + input_meta_ref_skip_update = input_meta_tmp_skip_update; + } + + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + phi::MetaTensor meta_out_2(kernel_out_2); + phi::MetaTensor meta_out_3(kernel_out_3); + phi::MetaTensor meta_out_4(kernel_out_4); + phi::MetaTensor meta_out_5(kernel_out_5); + + phi::AdamwInferMeta(MakeMetaTensor(*input_param), + MakeMetaTensor(*input_grad), + MakeMetaTensor(*input_lr), + MakeMetaTensor(*input_moment1), + MakeMetaTensor(*input_moment2), + MakeMetaTensor(*input_beta1_pow), + MakeMetaTensor(*input_beta2_pow), + input_meta_ref_master_param, + input_meta_ref_skip_update, + beta1, + beta2, + epsilon, + lr_ratio, + coeff, + with_decay, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + &meta_out_0, + &meta_out_1, + &meta_out_2, + &meta_out_3, + &meta_out_4, + &meta_out_5); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + paddle::optional, + const Scalar&, + const Scalar&, + const Scalar&, + float, + float, + bool, + bool, + int64_t, + bool, + bool, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, + *input_param, + *input_grad, + *input_lr, + *input_moment1, + *input_moment2, + *input_beta1_pow, + 
*input_beta2_pow, + input_master_param, + input_skip_update, + beta1, + beta2, + epsilon, + lr_ratio, + coeff, + with_decay, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + kernel_out_0, + kernel_out_1, + kernel_out_2, + kernel_out_3, + kernel_out_4, + kernel_out_5); + + return api_output; +} + Tensor conv2d_impl(const Tensor& input, const Tensor& filter, const std::vector& strides, diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 4ddc3e5f4e0d2..46abcd90de32a 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -49,6 +49,27 @@ std::tuple adam_impl( bool multi_precision, bool use_global_beta_pow); +std::tuple adamw_impl( + const Tensor& param, + const Tensor& grad, + const Tensor& learning_rate, + const Tensor& moment1, + const Tensor& moment2, + const Tensor& beta1_pow, + const Tensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow); + std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index d5fa944802a47..d2eef785f6e07 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -20,6 +20,7 @@ from op_test import OpTest from functools import partial from paddle.framework import core +from paddle.fluid.framework import _test_eager_guard def adamw_step(inputs, attributes): @@ -238,6 +239,11 @@ def test_adamw_op_invalid_input(self): adam = paddle.optimizer.AdamW( 0.1, epsilon=-1, parameters=linear.parameters()) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_adamw_op_dygraph() + self.test_adamw_op_invalid_input() + class TestAdamWOpGroup(TestAdamWOp): def test_adamw_op_dygraph(self): @@ -319,6 +325,12 @@ def test_adamw_op_dygraph(self): linear1 = paddle.nn.Linear(13, 8) linear2 = paddle.nn.Linear(8, 5) + # fix the linear name, simple_lr_setting function will use the name + linear1.weight.name = "linear_1.w_0" + linear1.bias.name = "linear_1.b_0" + linear2.weight.name = "linear_2.w_0" + linear2.bias.name = "linear_2.b_0" + simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2) adam = paddle.optimizer.AdamW( diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index e69dcf170d93c..0fa49745a95fb 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -290,14 +290,24 @@ def _append_optimize_op(self, block, param_and_grad): _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _, _ = _C_ops.adamw( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], - moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, - 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, - 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, - 'beta2', _beta2, "with_decay", with_decay, 'coeff', self._coeff, - 'multi_precision', find_master, 'lr_ratio', lr_ratio_) + if framework.in_dygraph_mode(): + found_inf = self._get_auxiliary_var('found_inf') + _, _, _, _, _, _ = 
_C_ops.final_state_adamw( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, + _beta1, _beta2, self._epsilon, lr_ratio_, self._coeff, + with_decay, self._lazy_mode, 1000, find_master, False) + else: + _, _, _, _, _, _ = _C_ops.adamw( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, master_weight, + param_and_grad[0], moment1, moment2, beta1_pow_acc, + beta2_pow_acc, master_weight, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, + 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, + 'beta2', _beta2, "with_decay", with_decay, 'coeff', + self._coeff, 'multi_precision', find_master, 'lr_ratio', + lr_ratio_) return None inputs = { diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index a142225e6578c..41b5fc26fa941 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -58,6 +58,12 @@ func : AdamaxInferMeta kernel : func : adamax + +- api : adamw + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) + output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) + optional : master_param, skip_update + invoke : adamw_impl(param, grad, learning_rate, moment1, moment2, beta1_pow, beta2_pow, master_param, skip_update, beta1, beta2, epsilon, lr_ratio, coeff, with_decay, lazy_mode, min_row_size_to_use_multithread, multi_precision, use_global_beta_pow) - api : add args : (Tensor x, Tensor y) From e25b75b68283f163f3064f227bde960aee7c1e7e Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Fri, 15 Apr 2022 20:50:34 +0800 Subject: [PATCH 187/211] fix a bug which will casue cuda address error when the input size is very large (#41824) As the title --- paddle/fluid/operators/gather_scatter_kernel.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index dc87fc52aacb4..f97eb3d5e9d9a 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -119,7 +119,7 @@ struct gpu_gather_scatter_functor { is_scatter_like ? 
self_dims[dim] : src_dims[dim]; int64_t inner_dim_size = 1; int64_t outer_dim_size = 1; - for (int64_t i = 0; i < index_dims.size(); ++i) { + for (int64_t i = 0; i < dim; ++i) { inner_dim_size *= index_dims[i]; } @@ -127,11 +127,8 @@ struct gpu_gather_scatter_functor { outer_dim_size *= index_dims[i]; } - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; - int block = 512; - int64_t n = slice_size * index_size; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); @@ -215,11 +212,8 @@ void gpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, outer_dim_size *= index_dims[i]; } - int64_t slice_size = 1; - for (int i = 1; i < grad_dims.size(); ++i) slice_size *= grad_dims[i]; - int block = 512; - int64_t n = slice_size * index_size; + int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); From c37af19c963ab1b4c65ac4f7ca83e31f864c76d3 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Fri, 15 Apr 2022 21:03:11 +0800 Subject: [PATCH 188/211] Moe ref (#41836) * moe ref * ref commit; test=document_fix * update; test=document_fix * update test=document_fix --- paddle/fluid/operators/assign_pos_op.cu | 11 ++++++++++- paddle/fluid/operators/limit_by_capacity_op.cu | 8 ++++++++ paddle/fluid/operators/number_count_op.cu | 8 ++++++++ paddle/fluid/operators/prune_gate_by_capacity_op.cu | 8 ++++++++ .../incubate/distributed/models/moe/gate/base_gate.py | 7 +++++++ .../distributed/models/moe/gate/gshard_gate.py | 7 +++++++ .../distributed/models/moe/gate/naive_gate.py | 9 ++++++++- .../distributed/models/moe/gate/switch_gate.py | 9 ++++++++- .../incubate/distributed/models/moe/grad_clip.py | 5 +++++ .../incubate/distributed/models/moe/moe_layer.py | 7 +++++++ .../paddle/incubate/distributed/models/moe/utils.py | 10 +++++++++- 11 files changed, 85 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 5fa159b94f983..d96d36931b323 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -10,7 +10,16 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. */ +limitations under the License. + +The file has been adapted from the two files: + https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cu + https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cuh + Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +We retain the following license from the original files: + Copyright 2021, Jiaao He + Licensed under the Apache License, Version 2.0 (the "License"). +*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/assign_pos_op.h" diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 253ae8162c9b4..c77adf2200cbe 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/limit_by_capacity_op.h" diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 0106c70d8eb53..923d89c24853f 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/number_count_op.h" diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 953847512bc1a..7228bdbf3805a 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/operators/prune_gate_by_capacity_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py index 100d201d4b3d1..f527e82f043c7 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"). import paddle.nn as nn diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py index 3ab3cf6901402..3618ec56e96c9 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import math import paddle diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py index 785d2e971bb36..491d1f95e10cb 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). from .base_gate import BaseGate diff --git a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py index 54bf3ab148ab2..776516989e5a1 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). 
import math import paddle diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index cde5455d27168..b620253b9f26f 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -55,6 +55,11 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Reference: + https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch + Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 + + Args: clip_norm (float): The maximum norm value. is_expert_param_func (function): a function to decide whether a param should be put into moe_params_grads diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index 99cc38d04bdda..eebb635e3ead7 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/layers.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import collections import math diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index 0e87fe3e31360..25c76c9753035 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). 
+ from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos import paddle From ce72690caa96b0a0431f4deccd6127b5fff808cc Mon Sep 17 00:00:00 2001 From: seemingwang Date: Fri, 15 Apr 2022 22:43:10 +0800 Subject: [PATCH 189/211] gpu_graph engine optimization+ (#41455) * extract sub-graph * graph-engine merging * fix * fix * fix heter-ps config * test performance * test performance * test performance * test * test * update bfs * change cmake * test * test gpu speed * gpu_graph_engine optimization * add ssd layer to graph_engine * fix allocation * fix syntax error * fix syntax error * fix pscore class * fix * recover test * recover test * fix spelling * recover * fix --- .../ps/table/common_graph_table.cc | 179 +++++- .../distributed/ps/table/common_graph_table.h | 54 +- .../ps/table/depends/rocksdb_warpper.h | 1 + paddle/fluid/distributed/ps/table/table.cc | 4 - paddle/fluid/distributed/test/CMakeLists.txt | 2 +- .../fluid/distributed/test/graph_node_test.cc | 2 +- .../test/graph_table_sample_test.cc | 72 +-- paddle/fluid/distributed/the_one_ps.proto | 20 +- .../framework/fleet/heter_ps/CMakeLists.txt | 15 +- .../framework/fleet/heter_ps/gpu_graph_node.h | 25 +- .../fleet/heter_ps/graph_gpu_ps_table.h | 89 ++- .../fleet/heter_ps/graph_gpu_ps_table_inl.h | 579 +++++++++--------- .../framework/fleet/heter_ps/graph_sampler.h | 112 ++++ .../fleet/heter_ps/graph_sampler_inl.h | 161 +++++ .../framework/fleet/heter_ps/heter_comm.h | 2 +- .../fleet/heter_ps/test_cpu_graph_sample.cu | 1 - .../fleet/heter_ps/test_sample_rate.cu | 144 +++-- 17 files changed, 978 insertions(+), 484 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/graph_sampler.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 7aab679954709..df55fe93be3d8 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -28,7 +28,112 @@ namespace paddle { namespace distributed { #ifdef PADDLE_WITH_HETERPS +paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( + std::vector ids) { + std::vector> bags(task_pool_size_); + for (auto x : ids) { + int location = x % shard_num % task_pool_size_; + bags[location].push_back(x); + } + std::vector> tasks; + std::vector edge_array[task_pool_size_]; + std::vector node_array[task_pool_size_]; + for (int i = 0; i < (int)bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { + paddle::framework::GpuPsGraphNode x; + for (int j = 0; j < (int)bags[i].size(); j++) { + Node *v = find_node(bags[i][j]); + x.node_id = bags[i][j]; + if (v == NULL) { + x.neighbor_size = 0; + x.neighbor_offset = 0; + node_array[i].push_back(x); + } else { + x.neighbor_size = v->get_neighbor_size(); + x.neighbor_offset = edge_array[i].size(); + node_array[i].push_back(x); + for (int k = 0; k < x.neighbor_size; k++) { + edge_array[i].push_back(v->get_neighbor_id(k)); + } + } + } + return 0; + })); + } + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + paddle::framework::GpuPsCommGraph res; + int tot_len = 0; + for (int i = 0; i < task_pool_size_; i++) { + tot_len += (int)edge_array[i].size(); + } + res.neighbor_size = tot_len; + res.node_size = ids.size(); + res.neighbor_list = new int64_t[tot_len]; + res.node_list = new 
paddle::framework::GpuPsGraphNode[ids.size()]; + int offset = 0, ind = 0; + for (int i = 0; i < task_pool_size_; i++) { + for (int j = 0; j < (int)node_array[i].size(); j++) { + res.node_list[ind] = node_array[i][j]; + res.node_list[ind++].neighbor_offset += offset; + } + for (int j = 0; j < (int)edge_array[i].size(); j++) { + res.neighbor_list[offset + j] = edge_array[i][j]; + } + offset += edge_array[i].size(); + } + return res; +} +int32_t GraphTable::add_node_to_ssd(int64_t src_id, char *data, int len) { + if (_db != NULL) + _db->put(src_id % shard_num % task_pool_size_, (char *)&src_id, + sizeof(uint64_t), (char *)data, sizeof(int64_t) * len); + return 0; +} +char *GraphTable::random_sample_neighbor_from_ssd( + int64_t id, int sample_size, const std::shared_ptr rng, + int &actual_size) { + if (_db == NULL) { + actual_size = 0; + return NULL; + } + std::string str; + if (_db->get(id % shard_num % task_pool_size_, (char *)&id, sizeof(uint64_t), + str) == 0) { + int64_t *data = ((int64_t *)str.c_str()); + int n = str.size() / sizeof(int64_t); + std::unordered_map m; + // std::vector res; + int sm_size = std::min(n, sample_size); + actual_size = sm_size * Node::id_size; + char *buff = new char[actual_size]; + for (int i = 0; i < sm_size; i++) { + std::uniform_int_distribution distrib(0, n - i - 1); + int t = distrib(*rng); + // int t = rand() % (n-i); + int pos = 0; + auto iter = m.find(t); + if (iter != m.end()) { + pos = iter->second; + } else { + pos = t; + } + auto iter2 = m.find(n - i - 1); + int key2 = iter2 == m.end() ? n - i - 1 : iter2->second; + m[t] = key2; + m.erase(n - i - 1); + memcpy(buff + i * Node::id_size, &data[pos], Node::id_size); + // res.push_back(data[pos]); + } + return buff; + } + actual_size = 0; + return NULL; +} +#endif +/* int CompleteGraphSampler::run_graph_sampling() { pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); pthread_rwlock_rdlock(rw_lock); @@ -136,7 +241,8 @@ int BasicBfsGraphSampler::run_graph_sampling() { int task_size = 0; std::vector> tasks; int init_size = 0; - std::function bfs = [&, this](int i, int64_t id) -> int { + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { if (this->status == GraphSamplerStatus::terminating) { int task_left = __sync_sub_and_fetch(&task_size, 1); if (task_left == 0) { @@ -289,6 +395,7 @@ int BasicBfsGraphSampler::run_graph_sampling() { std::this_thread::sleep_for(std::chrono::seconds(1)); } } + VLOG(0)<<"bfs returning"; } return 0; } @@ -304,7 +411,7 @@ void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, } #endif - +*/ std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -316,6 +423,17 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } +int32_t GraphTable::add_comm_edge(int64_t src_id, int64_t dst_id) { + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + return -1; + } + size_t index = src_shard_id - shard_start; + extra_shards[index]->add_graph_node(src_id)->build_edges(false); + extra_shards[index]->add_neighbor(src_id, dst_id, 1.0); + return 0; +} int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); @@ -554,9 +672,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { -#ifdef 
PADDLE_WITH_HETERPS - if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); -#endif + // #ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + // #endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -633,9 +751,9 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { relocate the duplicate nodes to make them distributed evenly among threads. */ if (!use_duplicate_nodes) { -#ifdef PADDLE_WITH_HETERPS - if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); -#endif + // #ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); + // #endif return 0; } @@ -712,9 +830,9 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } -#ifdef PADDLE_WITH_HETERPS - if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); -#endif + // #ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); + // #endif return 0; } @@ -878,6 +996,17 @@ int32_t GraphTable::random_sample_neighbors( idx = seq_id[i][k]; int &actual_size = actual_sizes[idx]; if (node == nullptr) { +#ifdef PADDLE_WITH_HETERPS + if (search_level == 2) { + char *buffer_addr = random_sample_neighbor_from_ssd( + node_id, sample_size, rng, actual_size); + if (actual_size != 0) { + std::shared_ptr &buffer = buffers[idx]; + buffer.reset(buffer_addr, char_del); + } + continue; + } +#endif actual_size = 0; continue; } @@ -1085,25 +1214,29 @@ int32_t GraphTable::Initialize(const TableParameter &config, return Initialize(graph); } int32_t GraphTable::Initialize(const GraphParameter &graph) { + task_pool_size_ = graph.task_pool_size(); #ifdef PADDLE_WITH_HETERPS - if (graph.gpups_mode()) { - gpups_mode = true; - auto *sampler = - CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); - auto slices = - string::split_string(graph.gpups_graph_sample_args(), ","); - std::cout << "slices" << std::endl; - for (auto x : slices) std::cout << x << std::endl; - sampler->init(graph.gpu_num(), this, slices); - graph_sampler.reset(sampler); - } + _db = NULL; + search_level = graph.search_level(); + if (search_level >= 2) { + _db = paddle::distributed::RocksDBHandler::GetInstance(); + _db->initialize("./temp_gpups_db", task_pool_size_); + } +// gpups_mode = true; +// auto *sampler = +// CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); +// auto slices = +// string::split_string(graph.gpups_graph_sample_args(), ","); +// std::cout << "slices" << std::endl; +// for (auto x : slices) std::cout << x << std::endl; +// sampler->init(graph.gpu_num(), this, slices); +// graph_sampler.reset(sampler); #endif if (shard_num == 0) { server_num = 1; _shard_idx = 0; shard_num = graph.shard_num(); } - task_pool_size_ = graph.task_pool_size(); use_cache = graph.use_cache(); if (use_cache) { cache_size_limit = graph.cache_size_limit(); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index acc484e6098d4..863c397b08ad2 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,6 +38,7 @@ #include #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h" #include "paddle/fluid/distributed/ps/table/graph/class_macro.h" 
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" @@ -351,6 +352,7 @@ class ScaledLRU { friend class RandomSampleLRU; }; +/* #ifdef PADDLE_WITH_HETERPS enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; class GraphTable; @@ -363,6 +365,9 @@ class GraphSampler { return; }; } + virtual int loadData(const std::string &path){ + return 0; + } virtual int run_graph_sampling() = 0; virtual int start_graph_sampling() { if (status != GraphSamplerStatus::waiting) { @@ -403,15 +408,13 @@ class GraphSampler { std::vector sample_res; }; #endif +*/ class GraphTable : public Table { public: GraphTable() { use_cache = false; shard_num = 0; -#ifdef PADDLE_WITH_HETERPS - gpups_mode = false; -#endif rw_lock.reset(new pthread_rwlock_t()); } virtual ~GraphTable(); @@ -516,21 +519,28 @@ class GraphTable : public Table { return 0; } #ifdef PADDLE_WITH_HETERPS - virtual int32_t start_graph_sampling() { - return this->graph_sampler->start_graph_sampling(); - } - virtual int32_t end_graph_sampling() { - return this->graph_sampler->end_graph_sampling(); - } - virtual int32_t set_graph_sample_callback( - std::function &)> - callback) { - graph_sampler->set_graph_sample_callback(callback); - return 0; - } -// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } + // virtual int32_t start_graph_sampling() { + // return this->graph_sampler->start_graph_sampling(); + // } + // virtual int32_t end_graph_sampling() { + // return this->graph_sampler->end_graph_sampling(); + // } + // virtual int32_t set_graph_sample_callback( + // std::function &)> + // callback) { + // graph_sampler->set_graph_sample_callback(callback); + // return 0; + // } + virtual char *random_sample_neighbor_from_ssd( + int64_t id, int sample_size, const std::shared_ptr rng, + int &actual_size); + virtual int32_t add_node_to_ssd(int64_t id, char *data, int len); + virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( + std::vector ids); + // virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } + int search_level; #endif - protected: + virtual int32_t add_comm_edge(int64_t src_id, int64_t dst_id); std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; int task_pool_size_ = 24; @@ -555,13 +565,14 @@ class GraphTable : public Table { std::shared_ptr rw_lock; #ifdef PADDLE_WITH_HETERPS // paddle::framework::GpuPsGraphTable gpu_graph_table; - bool gpups_mode; - // std::shared_ptr<::ThreadPool> graph_sample_pool; - std::shared_ptr graph_sampler; - REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) + paddle::distributed::RocksDBHandler *_db; +// std::shared_ptr<::ThreadPool> graph_sample_pool; +// std::shared_ptr graph_sampler; +// REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) #endif }; +/* #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_REGISTERER(GraphSampler); class CompleteGraphSampler : public GraphSampler { @@ -603,6 +614,7 @@ class BasicBfsGraphSampler : public GraphSampler { sample_neighbors_map; }; #endif +*/ } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h index 0e25a89cb14d7..ff2271d468e39 100644 --- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h +++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h @@ -12,6 +12,7 @@ // See the License for the specific 
language governing permissions and // limitations under the License. +#pragma once #ifdef PADDLE_WITH_HETERPS #include #include diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index 0fbdfb6fcce77..307abbdf51e4a 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -31,10 +31,6 @@ namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, MemoryDenseTable); -#ifdef PADDLE_WITH_HETERPS -REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); -REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); -#endif REGISTER_PSCORE_CLASS(Table, BarrierTable); REGISTER_PSCORE_CLASS(Table, TensorTable); REGISTER_PSCORE_CLASS(Table, DenseTensorTable); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index cb46c38d4de4b..ff0ff26b9579f 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -25,7 +25,7 @@ set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${ cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS table ps_framework_proto ${COMMON_DEPS}) set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index b2c741df7a5dd..bde284b20e73c 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -679,7 +679,7 @@ void testCache() { st.query(0, &skey, 1, r); ASSERT_EQ((int)r.size(), 1); char* p = (char*)r[0].second.buffer.get(); - for (size_t j = 0; j < r[0].second.actual_size; j++) + for (int j = 0; j < (int)r[0].second.actual_size; j++) ASSERT_EQ(p[j], str[j]); r.clear(); } diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index 2866bd0bda025..d7f6f2f34d77a 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -25,18 +25,7 @@ #include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/ps/service/env.h" -#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" -#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace 
operators = paddle::operators; @@ -83,66 +72,11 @@ void prepare_file(char file_name[], std::vector data) { } void testGraphSample() { -#ifdef PADDLE_WITH_HETERPS ::paddle::distributed::GraphParameter table_proto; - table_proto.set_gpups_mode(true); - table_proto.set_shard_num(127); - table_proto.set_gpu_num(2); + // table_proto.set_gpu_num(2); - distributed::GraphTable graph_table, graph_table1; - graph_table.initialize(table_proto); - prepare_file(edge_file_name, edges); - graph_table.load(std::string(edge_file_name), std::string("e>")); - std::vector res; - std::promise prom; - std::future fut = prom.get_future(); - graph_table.set_graph_sample_callback( - [&res, &prom](std::vector &res0) { - res = res0; - prom.set_value(0); - }); - graph_table.start_graph_sampling(); - fut.get(); - graph_table.end_graph_sampling(); - ASSERT_EQ(2, res.size()); - // 37 59 97 - for (int i = 0; i < (int)res[1].node_size; i++) { - std::cout << res[1].node_list[i].node_id << std::endl; - } - ASSERT_EQ(3, res[1].node_size); - - ::paddle::distributed::GraphParameter table_proto1; - table_proto1.set_gpups_mode(true); - table_proto1.set_shard_num(127); - table_proto1.set_gpu_num(2); - table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); - table_proto1.set_gpups_graph_sample_args("5,5,1,1"); - graph_table1.initialize(table_proto1); - graph_table1.load(std::string(edge_file_name), std::string("e>")); - std::vector res1; - std::promise prom1; - std::future fut1 = prom1.get_future(); - graph_table1.set_graph_sample_callback( - [&res1, &prom1](std::vector &res0) { - res1 = res0; - prom1.set_value(0); - }); - graph_table1.start_graph_sampling(); - fut1.get(); - graph_table1.end_graph_sampling(); - // distributed::BasicBfsGraphSampler *sampler1 = - // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); - // sampler1->start_graph_sampling(); - // std::this_thread::sleep_for (std::chrono::seconds(1)); - // std::vector res1;// = - // sampler1->fetch_sample_res(); - ASSERT_EQ(2, res1.size()); - // odd id:96 48 122 112 - for (int i = 0; i < (int)res1[0].node_size; i++) { - std::cout << res1[0].node_list[i].node_id << std::endl; - } - ASSERT_EQ(4, res1[0].node_size); -#endif + distributed::GraphTable graph_table; + graph_table.Initialize(table_proto); } TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/distributed/the_one_ps.proto b/paddle/fluid/distributed/the_one_ps.proto index 34b11dfd1c5b7..197acc1824217 100644 --- a/paddle/fluid/distributed/the_one_ps.proto +++ b/paddle/fluid/distributed/the_one_ps.proto @@ -215,18 +215,16 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule message GraphParameter { optional int32 task_pool_size = 1 [ default = 24 ]; - optional bool gpups_mode = 2 [ default = false ]; - optional string gpups_graph_sample_class = 3 + optional string gpups_graph_sample_class = 2 [ default = "CompleteGraphSampler" ]; - optional string gpups_graph_sample_args = 4 [ default = "" ]; - optional bool use_cache = 5 [ default = false ]; - optional int32 cache_size_limit = 6 [ default = 100000 ]; - optional int32 cache_ttl = 7 [ default = 5 ]; - optional GraphFeature graph_feature = 8; - optional string table_name = 9 [ default = "" ]; - optional string table_type = 10 [ default = "" ]; - optional int32 shard_num = 11 [ default = 127 ]; - optional int32 gpu_num = 12 [ default = 1 ]; + optional bool use_cache = 3 [ default = false ]; + optional int32 cache_size_limit = 4 [ default = 100000 ]; + optional int32 cache_ttl = 5 [ default = 5 ]; + optional 
GraphFeature graph_feature = 6; + optional string table_name = 7 [ default = "" ]; + optional string table_type = 8 [ default = "" ]; + optional int32 shard_num = 9 [ default = 127 ]; + optional int32 search_level = 10 [ default = 1 ]; } message GraphFeature { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index cac366d6b22a1..e90d864fa1ab7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -13,13 +13,16 @@ IF(WITH_GPU) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) - nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) - nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) - #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) - # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) - # target_link_libraries(test_sample_rate graph_gpu_ps) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) + nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps) + #nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + #nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) + #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) + # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) + # target_link_libraries(test_sample_rate graph_gpu_ps graph_sampler) + # nv_test(test_graph_xx SRCS test_xx.cu DEPS graph_gpu_ps graph_sampler) endif() + ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index f18fa47fffd9a..27f14e8726d9c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -14,6 +14,11 @@ #pragma once #ifdef PADDLE_WITH_HETERPS +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { namespace framework { struct GpuPsGraphNode { @@ -94,16 +99,24 @@ struct NeighborSampleResult { int64_t *val; int *actual_sample_size, sample_size, key_size; int *offset; - NeighborSampleResult(int _sample_size, int _key_size) + std::shared_ptr val_mem, actual_sample_size_mem; + + NeighborSampleResult(int _sample_size, int _key_size, int dev_id) : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; + platform::CUDADeviceGuard guard(dev_id); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(int64_t)); + val = (int64_t *)val_mem->ptr(); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int)); + actual_sample_size = (int *)actual_sample_size_mem->ptr(); offset = NULL; }; ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - if (offset != NULL) cudaFree(offset); + // if (val != NULL) cudaFree(val); + // if (actual_sample_size != NULL) cudaFree(actual_sample_size); + // if (offset != NULL) cudaFree(offset); } }; diff --git 
a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 3d1599a76e8eb..cd55d09608f54 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include "heter_comm.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" @@ -21,19 +22,64 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { - class GpuPsGraphTable : public HeterComm { public: - GpuPsGraphTable(std::shared_ptr resource) + GpuPsGraphTable(std::shared_ptr resource, int topo_aware) : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); + gpu_num = resource_->total_gpu(); cpu_table_status = -1; + if (topo_aware) { + int total_gpu = resource_->total_gpu(); + std::map device_map; + for (int i = 0; i < total_gpu; i++) { + device_map[resource_->dev_id(i)] = i; + VLOG(1) << " device " << resource_->dev_id(i) << " is stored on " << i; + } + path_.clear(); + path_.resize(total_gpu); + VLOG(1) << "topo aware overide"; + for (int i = 0; i < total_gpu; ++i) { + path_[i].resize(total_gpu); + for (int j = 0; j < total_gpu; ++j) { + auto &nodes = path_[i][j].nodes_; + nodes.clear(); + int from = resource_->dev_id(i); + int to = resource_->dev_id(j); + int transfer_id = i; + if (need_transfer(from, to) && + (device_map.find((from + 4) % 8) != device_map.end() || + device_map.find((to + 4) % 8) != device_map.end())) { + transfer_id = (device_map.find((from + 4) % 8) != device_map.end()) + ? ((from + 4) % 8) + : ((to + 4) % 8); + transfer_id = device_map[transfer_id]; + nodes.push_back(Node()); + Node &node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 0; + node.gpu_num = transfer_id; + } + nodes.push_back(Node()); + Node &node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 0; + node.gpu_num = j; + } + } + } } ~GpuPsGraphTable() { - if (cpu_table_status != -1) { - end_graph_sampling(); - } + // if (cpu_table_status != -1) { + // end_graph_sampling(); + // } } void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -41,21 +87,28 @@ class GpuPsGraphTable : public HeterComm { int sample_size, int len); NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); - void move_neighbor_sample_result_to_source_gpu( - int gpu_id, int gpu_num, int *h_left, int *h_right, - int64_t *src_sample_res, thrust::host_vector &total_sample_size); - void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, - int *h_left, int *h_right, - int *actual_sample_size, - int *total_sample_size); + void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + int sample_size, int *h_left, + int *h_right, + int64_t *src_sample_res, + int *actual_sample_size); + // void move_neighbor_sample_result_to_source_gpu( + // int gpu_id, int gpu_num, int *h_left, int *h_right, + // int64_t *src_sample_res, thrust::host_vector &total_sample_size); + // void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, + // int 
*h_left, int *h_right, + // int *actual_sample_size, + // int *total_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); - int load(const std::string &path, const std::string ¶m); - virtual int32_t end_graph_sampling() { - return cpu_graph_table->end_graph_sampling(); - } - - private: + // int load(const std::string &path, const std::string ¶m); + // virtual int32_t end_graph_sampling() { + // return cpu_graph_table->end_graph_sampling(); + // } + int gpu_num; std::vector gpu_graph_list; + std::vector sample_status; + const int parallel_sample_size = 1; + const int dim_y = 256; std::shared_ptr cpu_graph_table; std::shared_ptr rw_lock; mutable std::mutex mutex_; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index acd3f0a290d0b..c235378def51f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,23 +13,10 @@ // limitations under the License. #pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - #ifdef PADDLE_WITH_HETERPS //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { - -constexpr int WARP_SIZE = 32; - /* comment 0 this kernel just serves as an example of how to sample nodes' neighbors. @@ -42,116 +29,113 @@ sample_size; */ -struct MaxFunctor { - int sample_size; - HOSTDEVICE explicit inline MaxFunctor(int sample_size) { - this->sample_size = sample_size; - } - HOSTDEVICE inline int operator()(int x) const { - if (x > sample_size) { - return sample_size; +__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* node_index, + int* actual_size, int64_t* res, + int sample_len, int* sample_status, + int n, int from) { + // printf("%d %d %d\n",blockIdx.x,threadIdx.x,threadIdx.y); + int id = blockIdx.x * blockDim.y + threadIdx.y; + if (id < n) { + curandState rng; + curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); + int index = threadIdx.x; + int offset = id * sample_len; + int64_t* data = graph.neighbor_list; + int data_offset = graph.node_list[node_index[id]].neighbor_offset; + int neighbor_len = graph.node_list[node_index[id]].neighbor_size; + int ac_len; + if (sample_len > neighbor_len) + ac_len = neighbor_len; + else { + ac_len = sample_len; } - return x; - } -}; - -struct DegreeFunctor { - GpuPsCommGraph graph; - HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) { - this->graph = graph; - } - HOSTDEVICE inline int operator()(int i) const { - return graph.node_list[i].neighbor_size; - } -}; - -template -__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph, - int sample_size, int* index, int len, - int64_t* sample_result, int* output_idx, - int* output_offset) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); - - int i = blockIdx.x * TILE_SIZE + threadIdx.y; - const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); - curandState rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); - - while (i < last_idx) { - auto node_index = index[i]; - int degree = graph.node_list[node_index].neighbor_size; - const int offset = graph.node_list[node_index].neighbor_offset; - int output_start = output_offset[i]; - - if (degree <= sample_size) { - // Just copy - for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { - 
sample_result[output_start + j] = graph.neighbor_list[offset + j]; - } - } else { - for (int j = threadIdx.x; j < degree; j += WARP_SIZE) { - output_idx[output_start + j] = j; + if (4 * ac_len >= 3 * neighbor_len) { + if (index == 0) { + res[offset] = curand(&rng) % (neighbor_len - ac_len + 1); } - __syncwarp(); - - for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) { - const int num = curand(&rng) % (j + 1); - if (num < sample_size) { - atomicMax( - reinterpret_cast(output_idx + output_start + num), - static_cast(j)); + int start = res[offset]; + while (index < ac_len) { + res[offset + index] = data[data_offset + start + index]; + index += blockDim.x; + } + actual_size[id] = ac_len; + } else { + while (index < ac_len) { + int num = curand(&rng) % neighbor_len; + int* addr = sample_status + data_offset + num; + int expected = *addr; + if (!(expected & (1 << from))) { + int old = atomicCAS(addr, expected, expected | (1 << from)); + if (old == expected) { + res[offset + index] = num; + index += blockDim.x; + } } } - __syncwarp(); - - for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { - const int perm_idx = output_idx[output_start + j] + offset; - sample_result[output_start + j] = graph.neighbor_list[perm_idx]; + index = threadIdx.x; + while (index < ac_len) { + int* addr = sample_status + data_offset + res[offset + index]; + int expected, old = *addr; + do { + expected = old; + old = atomicCAS(addr, expected, expected & (~(1 << from))); + } while (old != expected); + res[offset + index] = data[data_offset + res[offset + index]]; + index += blockDim.x; } + actual_size[id] = ac_len; } - - i += BLOCK_WARPS; } + // const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // if (i < n) { + // auto node_index = index[i]; + // actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size + // ? 
graph.node_list[node_index].neighbor_size + // : sample_size; + // int offset = graph.node_list[node_index].neighbor_offset; + // for (int j = 0; j < actual_size[i]; j++) { + // sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + // } + // } } int GpuPsGraphTable::init_cpu_table( const paddle::distributed::GraphParameter& graph) { cpu_graph_table.reset(new paddle::distributed::GraphTable); - cpu_table_status = cpu_graph_table->initialize(graph); - if (cpu_table_status != 0) return cpu_table_status; - std::function&)> callback = - [this](std::vector& res) { - pthread_rwlock_wrlock(this->rw_lock.get()); - this->clear_graph_info(); - this->build_graph_from_cpu(res); - pthread_rwlock_unlock(this->rw_lock.get()); - cv_.notify_one(); - }; - cpu_graph_table->set_graph_sample_callback(callback); + cpu_table_status = cpu_graph_table->Initialize(graph); + // if (cpu_table_status != 0) return cpu_table_status; + // std::function&)> callback = + // [this](std::vector& res) { + // pthread_rwlock_wrlock(this->rw_lock.get()); + // this->clear_graph_info(); + // this->build_graph_from_cpu(res); + // pthread_rwlock_unlock(this->rw_lock.get()); + // cv_.notify_one(); + // }; + // cpu_graph_table->set_graph_sample_callback(callback); return cpu_table_status; } -int GpuPsGraphTable::load(const std::string& path, const std::string& param) { - int status = cpu_graph_table->load(path, param); - if (status != 0) { - return status; - } - std::unique_lock lock(mutex_); - cpu_graph_table->start_graph_sampling(); - cv_.wait(lock); - return 0; -} +// int GpuPsGraphTable::load(const std::string& path, const std::string& param) +// { +// int status = cpu_graph_table->load(path, param); +// if (status != 0) { +// return status; +// } +// std::unique_lock lock(mutex_); +// cpu_graph_table->start_graph_sampling(); +// cv_.wait(lock); +// return 0; +// } /* comment 1 gpu i triggers a neighbor_sample task, when this task is done, this function is called to move the sample result on other gpu back - to gpu i and aggragate the result. + to gpu i and aggregate the result. the sample_result is saved on src_sample_res and the actual sample size for each node is saved on actual_sample_size. the number of actual sample_result for @@ -168,106 +152,163 @@ int GpuPsGraphTable::load(const std::string& path, const std::string& param) { -void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu( - int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size, - int* total_sample_size) { - // This function copyed actual_sample_size to source_gpu, - // and calculate total_sample_size of each gpu sample number. 
+ +void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + int start_index, int gpu_num, int sample_size, int* h_left, int* h_right, + int64_t* src_sample_res, int* actual_sample_size) { + int shard_len[gpu_num]; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - auto shard_len = h_right[i] - h_left[i] + 1; - auto& node = path_[gpu_id][i].nodes_.front(); + shard_len[i] = h_right[i] - h_left[i] + 1; + int cur_step = path_[start_index][i].nodes_.size() - 1; + for (int j = cur_step; j > 0; j--) { + cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage, + path_[start_index][i].nodes_[j].val_storage, + path_[start_index][i].nodes_[j - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[j - 1].out_stream); + } + auto& node = path_[start_index][i].nodes_.front(); + cudaMemcpyAsync( + reinterpret_cast(src_sample_res + h_left[i] * sample_size), + node.val_storage + sizeof(int64_t) * shard_len[i], + node.val_bytes_len - sizeof(int64_t) * shard_len[i], cudaMemcpyDefault, + node.out_stream); + // resource_->remote_stream(i, start_index)); cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len, - sizeof(int) * shard_len, cudaMemcpyDefault, + node.val_storage + sizeof(int) * shard_len[i], + sizeof(int) * shard_len[i], cudaMemcpyDefault, node.out_stream); } for (int i = 0; i < gpu_num; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { - total_sample_size[i] = 0; continue; } - auto& node = path_[gpu_id][i].nodes_.front(); + auto& node = path_[start_index][i].nodes_.front(); cudaStreamSynchronize(node.out_stream); - - auto shard_len = h_right[i] - h_left[i] + 1; - thrust::device_vector t_actual_sample_size(shard_len); - thrust::copy(actual_sample_size + h_left[i], - actual_sample_size + h_left[i] + shard_len, - t_actual_sample_size.begin()); - total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(), - t_actual_sample_size.end()); + // cudaStreamSynchronize(resource_->remote_stream(i, start_index)); } -} - -void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( - int gpu_id, int gpu_num, int* h_left, int* h_right, int64_t* src_sample_res, - thrust::host_vector& total_sample_size) { /* - if total_sample_size is [4, 5, 1, 6], - then cumsum_total_sample_size is [0, 4, 9, 10]; - */ - thrust::host_vector cumsum_total_sample_size(gpu_num, 0); - thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(), - cumsum_total_sample_size.begin(), 0); - for (int i = 0; i < gpu_num; i++) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; + std::queue que; + // auto& node = path_[gpu_id][i].nodes_.front(); + // cudaMemcpyAsync( + // reinterpret_cast(src_sample_res + h_left[i] * sample_size), + // node.val_storage + sizeof(int64_t) * shard_len, + // node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + // node.out_stream); + // cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + // node.val_storage + sizeof(int) * shard_len, + // sizeof(int) * shard_len, cudaMemcpyDefault, + // node.out_stream); + int cur_step = path_[start_index][i].nodes_.size() - 1; + auto& node = path_[start_index][i].nodes_[cur_step]; + if (cur_step == 0) { + // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), + // node.val_storage, node.val_bytes_len, + // cudaMemcpyDefault, + // node.out_stream); + // VLOG(0)<<"copy "<(src_sample_res + h_left[i] * sample_size), + node.val_storage + sizeof(int64_t) * shard_len[i], + node.val_bytes_len - 
sizeof(int64_t) * shard_len[i], + cudaMemcpyDefault, + node.out_stream); + //resource_->remote_stream(i, start_index)); + cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len[i], + sizeof(int) * shard_len[i], cudaMemcpyDefault, + node.out_stream); + //resource_->remote_stream(i, start_index)); + } else { + CopyTask t(&path_[start_index][i], cur_step - 1); + que.push(t); + // VLOG(0)<<"copy "<remote_stream(i, start_index)); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + int cur_step = cur_task.step; + if (cur_task.path->nodes_[cur_step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); + //cudaStreamSynchronize(resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, + start_index)); + } + if (cur_step > 0) { + CopyTask c(cur_task.path, cur_step - 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); + //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, + start_index)); + } else if (cur_step == 0) { + int end_index = cur_task.path->nodes_.back().gpu_num; + // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), + // cur_task.path->nodes_[cur_step].val_storage, + // cur_task.path->nodes_[cur_step].val_bytes_len, + // cudaMemcpyDefault, + // cur_task.path->nodes_[cur_step].out_stream); + //VLOG(0)<<"copy "<nodes_[cur_step].gpu_num<< " to + "<(src_sample_res + + h_left[end_index] * sample_size), + cur_task.path->nodes_[cur_step].val_storage + + sizeof(int64_t) * shard_len[end_index], + cur_task.path->nodes_[cur_step].val_bytes_len - + sizeof(int64_t) * shard_len[end_index], + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); + //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, + start_index)); + cudaMemcpyAsync( + reinterpret_cast(actual_sample_size + h_left[end_index]), + cur_task.path->nodes_[cur_step].val_storage + + sizeof(int) * shard_len[end_index], + sizeof(int) * shard_len[end_index], cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); + //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, + start_index)); } - auto shard_len = h_right[i] - h_left[i] + 1; - // int cur_step = path_[gpu_id][i].nodes_.size() - 1; - // auto& node = path_[gpu_id][i].nodes_[cur_step]; - auto& node = path_[gpu_id][i].nodes_.front(); - cudaMemcpyAsync( - reinterpret_cast(src_sample_res + cumsum_total_sample_size[i]), - node.val_storage + sizeof(int64_t) * shard_len, - sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault, - node.out_stream); } for (int i = 0; i < gpu_num; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - auto& node = path_[gpu_id][i].nodes_.front(); + auto& node = path_[start_index][i].nodes_.front(); cudaStreamSynchronize(node.out_stream); + //cudaStreamSynchronize(resource_->remote_stream(i, start_index)); } + */ } /* TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size, - int* d_actual_sample_size, - int* idx, int len) { +__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, + int* d_shard_actual_sample_size, + int* d_actual_sample_size, int* idx, + int sample_size, int len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_actual_sample_size[idx[i]] 
= d_shard_actual_sample_size[i]; - } -} - -template -__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals, - int64_t* d_vals, - int* d_actual_sample_size, int* idx, - int* offset, int* d_offset, - int len) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); - - int i = blockIdx.x * TILE_SIZE + threadIdx.y; - const int last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, len); - while (i < last_idx) { - const int sample_size = d_actual_sample_size[idx[i]]; - for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) { - d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j]; + // d_vals[idx[i]] = d_shard_vals[i]; + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; } -#ifdef PADDLE_WITH_CUDA - __syncwarp(); -#endif - i += BLOCK_WARPS; } } @@ -307,6 +348,8 @@ gpu i saves the ith graph from cpu_graph_list void GpuPsGraphTable::build_graph_from_cpu( std::vector& cpu_graph_list) { + VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " + << cpu_graph_list.size(); PADDLE_ENFORCE_EQ( cpu_graph_list.size(), resource_->total_gpu(), platform::errors::InvalidArgument("the cpu node list size doesn't match " @@ -314,7 +357,9 @@ void GpuPsGraphTable::build_graph_from_cpu( clear_graph_info(); for (int i = 0; i < cpu_graph_list.size(); i++) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // platform::CUDADeviceGuard guard(i); gpu_graph_list.push_back(GpuPsCommGraph()); + sample_status.push_back(NULL); auto table = new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); tables_.push_back(table); @@ -337,6 +382,10 @@ void GpuPsGraphTable::build_graph_from_cpu( gpu_graph_list[i].node_size = 0; } if (cpu_graph_list[i].neighbor_size) { + int* addr; + cudaMalloc((void**)&addr, cpu_graph_list[i].neighbor_size * sizeof(int)); + cudaMemset(addr, 0, cpu_graph_list[i].neighbor_size * sizeof(int)); + sample_status[i] = addr; cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_size * sizeof(int64_t)); cudaMemcpy(gpu_graph_list[i].neighbor_list, @@ -382,15 +431,19 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, */ - NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); + NeighborSampleResult* result = + new NeighborSampleResult(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { return result; } - + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + // cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); + // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + int* actual_sample_size = result->actual_sample_size; + int64_t* val = result->val; int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_id); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); + // int dev_id = resource_->dev_id(gpu_id); auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; @@ -411,6 +464,11 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_sample_size = 
memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast(d_shard_actual_sample_size->ptr()); split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); @@ -423,7 +481,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, cudaMemcpyDeviceToHost); cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); - + // auto start1 = std::chrono::steady_clock::now(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -450,138 +508,107 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ - create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } + // auto end1 = std::chrono::steady_clock::now(); + // auto tt = std::chrono::duration_cast(end1 - + // start1); + // VLOG(0)<< "create storage time " << tt.count() << " us"; walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; } - // auto& node = path_[gpu_id][i].nodes_.back(); - auto& node = path_[gpu_id][i].nodes_.front(); + auto& node = path_[gpu_id][i].nodes_.back(); cudaStreamSynchronize(node.in_stream); platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // platform::CUDADeviceGuard guard(i); // use the key-value map to update alloc_mem_i[0,shard_len) - tables_[i]->rwlock_->RDLock(); + // tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); + // node.in_stream); + auto shard_len = h_right[i] - h_left[i] + 1; + auto graph = gpu_graph_list[i]; + int* id_array = reinterpret_cast(node.val_storage); + int* actual_size_array = id_array + shard_len; + int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + int sample_grid_size = (shard_len - 1) / dim_y + 1; + dim3 block(parallel_sample_size, dim_y); + dim3 grid(sample_grid_size); + // int sample_grid_size = shard_len / block_size_ + 1; + // VLOG(0)<<"in sample grid_size = "<val, query_size * sizeof(int64_t)); int64_t* val = result->val; - int dev_id = resource_->dev_id(gpu_id); - platform::CUDADeviceGuard guard(dev_id); + // int dev_id = resource_->dev_id(gpu_id); + // platform::CUDADeviceGuard guard(dev_id); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; int size = 0; /* @@ -647,6 +675,7 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, for (int i = 0; i < idx.size(); i++) { int dev_id_i = resource_->dev_id(idx[i]); platform::CUDADeviceGuard guard(dev_id_i); + // platform::CUDADeviceGuard guard(i); auto& node = path_[gpu_id][idx[i]].nodes_.front(); int grid_size = (sample_size[i] - 1) / block_size_ + 1; node_query_example<< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 
2 }; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + } + virtual int start_service(std::string path) { + load_from_ssd(path); + VLOG(0) << "load from ssd over"; + std::promise prom; + std::future fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + VLOG(0) << " promise set "; + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + return 0; + } + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + ~GraphSampler() { end_graph_sampling(); } + virtual int load_from_ssd(std::string path) = 0; + ; + virtual int run_graph_sampling() = 0; + ; + virtual void init(GpuPsGraphTable *gpu_table, + std::vector args_) = 0; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future graph_sample_task_over; +}; + +class CommonGraphSampler : public GraphSampler { + public: + CommonGraphSampler() {} + virtual ~CommonGraphSampler() {} + GpuPsGraphTable *g_table; + virtual int load_from_ssd(std::string path); + virtual int run_graph_sampling(); + virtual void init(GpuPsGraphTable *g, std::vector args); + GpuPsGraphTable *gpu_table; + paddle::distributed::GraphTable *table; + std::vector gpu_edges_count; + int64_t cpu_edges_count; + int64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; + std::vector> gpu_set; + int gpu_num; +}; + +class AllInGpuGraphSampler : public GraphSampler { + public: + AllInGpuGraphSampler() {} + virtual ~AllInGpuGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual int load_from_ssd(std::string path); + virtual void init(GpuPsGraphTable *g, std::vector args_); + + protected: + paddle::distributed::GraphTable *graph_table; + GpuPsGraphTable *gpu_table; + std::vector> sample_nodes; + std::vector> sample_neighbors; + std::vector sample_res; + // std::shared_ptr random; + int gpu_num; +}; +} +}; +#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h" +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h new file mode 100644 index 0000000000000..ad4b00b11aa39 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h @@ -0,0 +1,161 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +int CommonGraphSampler::load_from_ssd(std::string path) { + std::ifstream file(path); + auto _db = table->_db; + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + std::cout << values.size(); + if (values.size() < 2) continue; + auto neighbors = paddle::string::split_string(values[1], ";"); + std::vector neighbor_data; + for (auto x : neighbors) { + neighbor_data.push_back(std::stoll(x)); + } + auto src_id = std::stoll(values[0]); + _db->put(0, (char *)&src_id, sizeof(uint64_t), (char *)neighbor_data.data(), + sizeof(int64_t) * neighbor_data.size()); + int gpu_shard = src_id % gpu_num; + if (gpu_edges_count[gpu_shard] + neighbor_data.size() <= + gpu_edges_each_limit) { + gpu_edges_count[gpu_shard] += neighbor_data.size(); + gpu_set[gpu_shard].insert(src_id); + } + if (cpu_edges_count + neighbor_data.size() <= cpu_edges_limit) { + cpu_edges_count += neighbor_data.size(); + for (auto x : neighbor_data) { + // table->add_neighbor(src_id, x); + table->shards[src_id % table->shard_num] + ->add_graph_node(src_id) + ->build_edges(false); + table->shards[src_id % table->shard_num]->add_neighbor(src_id, x, 1.0); + } + } + std::vector graph_list; + for (int i = 0; i < gpu_num; i++) { + std::vector ids(gpu_set[i].begin(), gpu_set[i].end()); + graph_list.push_back(table->make_gpu_ps_graph(ids)); + } + gpu_table->build_graph_from_cpu(graph_list); + for (int i = 0; i < graph_list.size(); i++) { + delete[] graph_list[i].node_list; + delete[] graph_list[i].neighbor_list; + } + } +} +int CommonGraphSampler::run_graph_sampling() { return 0; } +void CommonGraphSampler::init(GpuPsGraphTable *g, + std::vector args) { + this->gpu_table = g; + gpu_num = g->gpu_num; + gpu_edges_limit = args.size() > 0 ? std::stoll(args[0]) : 1000000000LL; + cpu_edges_limit = args.size() > 1 ? 
std::stoll(args[1]) : 1000000000LL; + gpu_edges_each_limit = gpu_edges_limit / gpu_num; + if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX; + table = g->cpu_graph_table.get(); + gpu_edges_count = std::vector(gpu_num, 0); + cpu_edges_count = 0; + gpu_set = std::vector>(gpu_num); +} + +int AllInGpuGraphSampler::run_graph_sampling() { return 0; } +int AllInGpuGraphSampler::load_from_ssd(std::string path) { + graph_table->load_edges(path, false); + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + + gpu_table->build_graph_from_cpu(sample_res); + return 0; +} +void AllInGpuGraphSampler::init(GpuPsGraphTable *g, + std::vector args_) { + this->gpu_table = g; + this->gpu_num = g->gpu_num; + graph_table = g->cpu_graph_table.get(); +} +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 419bd716eb304..5e4be02962ea9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -210,11 +210,11 @@ class HeterComm { std::vector> 
path_; float load_factor_{0.75}; int block_size_{256}; + int topo_aware_{0}; private: std::unique_ptr heter_comm_kernel_; std::vector storage_; - int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu index 0f7e38ac95e1b..62a0df9430002 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -66,7 +66,6 @@ TEST(TEST_FLEET, graph_sample) { 1,4,7 gpu 2: 2,5,8 - query(2,6) returns nodes [6,9,1,4,7,2] */ ::paddle::distributed::GraphParameter table_proto; diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index a4b1a6a7aee1e..887bda4be4a89 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -40,6 +40,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" @@ -52,9 +53,13 @@ namespace memory = paddle::memory; namespace distributed = paddle::distributed; std::string input_file; -int fixed_key_size = 100, sample_size = 100, +int exe_count = 100; +int use_nv = 1; +int fixed_key_size = 50000, sample_size = 32, bfs_sample_nodes_in_each_shard = 10000, init_search_size = 1, - bfs_sample_edges = 20; + bfs_sample_edges = 20, gpu_num1 = 8, gpu_num = 8; +std::string gpu_str = "0,1,2,3,4,5,6,7"; +int64_t *key[8]; std::vector edges = { std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), @@ -83,7 +88,7 @@ void testSampleRate() { pthread_rwlock_init(&rwlock, NULL); { ::paddle::distributed::GraphParameter table_proto; - table_proto.set_gpups_mode(false); + // table_proto.set_gpups_mode(false); table_proto.set_shard_num(127); table_proto.set_task_pool_size(24); std::cerr << "initializing begin"; @@ -163,25 +168,48 @@ void testSampleRate() { std::chrono::duration_cast(end1 - start1); std::cerr << "total time cost without cache is " << tt.count() << " us" << std::endl; + int64_t tot = 0; + for (int i = 0; i < 10; i++) { + for (auto x : sample_id[i]) tot += x; + } + VLOG(0) << "sum = " << tot; } - const int gpu_num = 8; - ::paddle::distributed::GraphParameter table_proto; - table_proto.set_gpups_mode(true); - table_proto.set_shard_num(127); - table_proto.set_gpu_num(gpu_num); - table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); - table_proto.set_gpups_graph_sample_args(std::to_string(init_search_size) + - ",100000000,10000000,1,1"); - std::vector dev_ids; - for (int i = 0; i < gpu_num; i++) { - dev_ids.push_back(i); + gpu_num = 0; + int st = 0, u = 0; + std::vector device_id_mapping; + while (u < gpu_str.size()) { + VLOG(0) << u << " " << gpu_str[u]; + if (gpu_str[u] == ',') { + auto p = gpu_str.substr(st, u - st); + int id = std::stoi(p); + VLOG(0) << "got a new device id" << id; + device_id_mapping.push_back(id); + st = u + 1; + } + u++; } + auto p = gpu_str.substr(st, gpu_str.size() - st); + int id = std::stoi(p); + VLOG(0) << "got a new device id" << id; + 
device_id_mapping.push_back(id); + gpu_num = device_id_mapping.size(); + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_shard_num(24); + // table_proto.set_gpups_graph_sample_class("CompleteGraphSampler"); + std::shared_ptr resource = - std::make_shared(dev_ids); + std::make_shared(device_id_mapping); resource->enable_p2p(); - GpuPsGraphTable g(resource); + GpuPsGraphTable g(resource, use_nv); g.init_cpu_table(table_proto); - g.load(std::string(input_file), std::string("e>")); + std::vector arg; + AllInGpuGraphSampler sampler; + sampler.init(&g, arg); + // g.load(std::string(input_file), std::string("e>")); + // sampler.start(std::string(input_file)); + // sampler.load_from_ssd(std::string(input_file)); + sampler.start_service(input_file); + /* NodeQueryResult *query_node_res; query_node_res = g.query_node_list(0, 0, ids.size() + 10000); @@ -209,52 +237,65 @@ void testSampleRate() { auto q = g.query_node_list(0, st, ids.size() / 20); VLOG(0) << " the " << i << "th iteration size = " << q->actual_sample_size; } -// NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); - -/* - void *key; + // NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); +*/ + for (int i = 0; i < gpu_num1; i++) { + platform::CUDADeviceGuard guard(device_id_mapping[i]); + cudaMalloc((void **)&key[i], ids.size() * sizeof(int64_t)); + cudaMemcpy(key[i], ids.data(), ids.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + } + /* cudaMalloc((void **)&key, ids.size() * sizeof(int64_t)); cudaMemcpy(key, ids.data(), ids.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - std::vector res[gpu_num]; + */ + /* + std::vector> res(gpu_num1); + for (int i = 0; i < gpu_num1; i++) { + int st = 0; + int size = ids.size(); + NeighborSampleResult *result = new NeighborSampleResult(sample_size, size); + platform::CUDAPlace place = platform::CUDAPlace(device_id_mapping[i]); + platform::CUDADeviceGuard guard(device_id_mapping[i]); + cudaMalloc((void **)&result->val, size * sample_size * sizeof(int64_t)); + cudaMalloc((void **)&result->actual_sample_size, size * sizeof(int)); + res[i].push_back(result); + } + */ start = 0; - auto func = [&rwlock, &g, &res, &start, - &gpu_num, &ids, &key](int i) { - while (true) { - int s, sn; - bool exit = false; - pthread_rwlock_wrlock(&rwlock); - if (start < ids.size()) { - s = start; - sn = ids.size() - start; - sn = min(sn, fixed_key_size); - start += sn; - } else { - exit = true; + auto func = [&rwlock, &g, &start, &ids](int i) { + int st = 0; + int size = ids.size(); + for (int k = 0; k < exe_count; k++) { + st = 0; + while (st < size) { + int len = std::min(fixed_key_size, (int)ids.size() - st); + auto r = g.graph_neighbor_sample(i, (int64_t *)(key[i] + st), + sample_size, len); + st += len; + delete r; } - pthread_rwlock_unlock(&rwlock); - if (exit) break; - auto r = - g.graph_neighbor_sample(i, (int64_t *)(key + s), sample_size, sn); - res[i].push_back(r); } }; auto start1 = std::chrono::steady_clock::now(); - std::thread thr[gpu_num]; - for (int i = 0; i < gpu_num; i++) { + std::thread thr[gpu_num1]; + for (int i = 0; i < gpu_num1; i++) { thr[i] = std::thread(func, i); } - for (int i = 0; i < gpu_num; i++) thr[i].join(); + for (int i = 0; i < gpu_num1; i++) thr[i].join(); auto end1 = std::chrono::steady_clock::now(); auto tt = std::chrono::duration_cast(end1 - start1); - std::cerr << "total time cost without cache is " << tt.count() << " us" - << std::endl; -*/ + std::cerr << "total time cost without cache is " + << tt.count() / 
exe_count / gpu_num1 << " us" << std::endl; + for (int i = 0; i < gpu_num1; i++) { + cudaFree(key[i]); + } #endif } -// TEST(testSampleRate, Run) { testSampleRate(); } +TEST(TEST_FLEET, sample_rate) { testSampleRate(); } int main(int argc, char *argv[]) { for (int i = 0; i < argc; i++) @@ -276,5 +317,14 @@ int main(int argc, char *argv[]) { VLOG(0) << "sample_size neighbor_size is " << sample_size; if (argc > 4) init_search_size = std::stoi(argv[4]); VLOG(0) << " init_search_size " << init_search_size; + if (argc > 5) { + gpu_str = argv[5]; + } + VLOG(0) << " gpu_str= " << gpu_str; + gpu_num = 0; + if (argc > 6) gpu_num1 = std::stoi(argv[6]); + VLOG(0) << " gpu_thread_num= " << gpu_num1; + if (argc > 7) use_nv = std::stoi(argv[7]); + VLOG(0) << " use_nv " << use_nv; testSampleRate(); } From 56dafc4feae665ea9a0ff24b19d23576cf61d3c5 Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Fri, 15 Apr 2022 23:31:34 +0800 Subject: [PATCH 190/211] solve brpc compile in arm-ubantu18 (#41649) * back fl * delete ssl cert * . * make warning * . * unittest paral degree * solve unittest * heter & multi cloud commm ready * . * . * arm_brpc compile * . * . * . * . * . * . * . * . * . * . * . * . * . * . * only output is ok * base is ok * . * . * . * . * . * . * . * . * add switch server bin * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * adapt brpc ssl * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . --- CMakeLists.txt | 1 + cmake/configure.cmake | 4 + cmake/external/arm_brpc.cmake | 76 +++++++++++++++ cmake/external/brpc.cmake | 2 +- cmake/external/gflags.cmake | 86 +++++++++++------ cmake/external/glog.cmake | 89 +++++++++++------ cmake/external/protobuf.cmake | 94 ++++++++++++------ cmake/third_party.cmake | 18 +++- paddle/fluid/distributed/CMakeLists.txt | 0 .../distributed/fleet_executor/CMakeLists.txt | 4 +- .../distributed/ps/service/heter_client.h | 8 ++ .../distributed/ps/service/heter_server.cc | 10 ++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/operators/pscore/CMakeLists.txt | 9 +- .../operators/pscore/switch_server_test.cc | 95 +++++++++++++++++++ paddle/fluid/pybind/CMakeLists.txt | 6 +- 16 files changed, 408 insertions(+), 97 deletions(-) mode change 100644 => 100755 cmake/configure.cmake create mode 100755 cmake/external/arm_brpc.cmake mode change 100644 => 100755 cmake/external/brpc.cmake mode change 100644 => 100755 cmake/external/gflags.cmake mode change 100644 => 100755 cmake/external/glog.cmake mode change 100644 => 100755 cmake/external/protobuf.cmake mode change 100644 => 100755 cmake/third_party.cmake mode change 100644 => 100755 paddle/fluid/distributed/CMakeLists.txt mode change 100644 => 100755 paddle/fluid/distributed/fleet_executor/CMakeLists.txt mode change 100644 => 100755 paddle/fluid/distributed/ps/service/heter_server.cc create mode 100755 paddle/fluid/operators/pscore/switch_server_test.cc mode change 100644 => 100755 paddle/fluid/pybind/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 6988434996bcc..b0680a782cf7f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,7 @@ option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) 
option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) +option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) if(WITH_RECORD_BUILDTIME) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") diff --git a/cmake/configure.cmake b/cmake/configure.cmake old mode 100644 new mode 100755 index 20a35c91bdde1..5608b6f6f348b --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -74,6 +74,10 @@ if(WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB) endif() +if(WITH_ARM_BRPC) + add_definitions(-DPADDLE_WITH_ARM_BRPC) +endif() + if(WITH_GLOO) add_definitions(-DPADDLE_WITH_GLOO) endif() diff --git a/cmake/external/arm_brpc.cmake b/cmake/external/arm_brpc.cmake new file mode 100755 index 0000000000000..83935ae0c6346 --- /dev/null +++ b/cmake/external/arm_brpc.cmake @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +#find_package(OpenSSL REQUIRED) + +#message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +#message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + +#ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +#SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) + +#ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +#SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) + +IF((NOT DEFINED ARM_BRPC_NAME) OR (NOT DEFINED ARM_BRPC_URL)) + SET(ARM_BRPC_VER "1.1.0" CACHE STRING "" FORCE) + SET(ARM_BRPC_NAME "arm_brpc" CACHE STRING "" FORCE) +ENDIF() + +MESSAGE(STATUS "ARM_BRPC_NAME: ${ARM_BRPC_NAME}, ARM_BRPC_URL: ${ARM_BRPC_URL}") +SET(ARM_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/arm_brpc") +SET(ARM_BRPC_PROJECT "extern_arm_brpc") +SET(ARM_BRPC_DOWNLOAD_DIR "${ARM_BRPC_PREFIX_DIR}/src/${ARM_BRPC_PROJECT}") +SET(ARM_BRPC_DST_DIR "output") +SET(ARM_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(ARM_BRPC_INSTALL_DIR ${ARM_BRPC_INSTALL_ROOT}/arm_brpc/output) +SET(ARM_BRPC_ROOT ${ARM_BRPC_INSTALL_DIR}) +SET(ARM_BRPC_INC_DIR ${ARM_BRPC_ROOT}/include) +SET(ARM_BRPC_LIB_DIR ${ARM_BRPC_ROOT}/lib) +SET(ARM_BRPC_LIB ${ARM_BRPC_LIB_DIR}/libbrpc.a) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ARM_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${ARM_BRPC_INSTALL_ROOT}/${ARM_BRPC_NAME}/output/include) + +FILE(WRITE ${ARM_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(ARM_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${ARM_BRPC_DST_DIR} ${ARM_BRPC_DST_DIR} \n" + " DESTINATION ${ARM_BRPC_NAME})\n") + +SET(ARM_BRPC_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/output.tar.gz" CACHE STRING "" FORCE) +ExternalProject_Add( + ${ARM_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ARM_BRPC_PREFIX_DIR} + DOWNLOAD_DIR ${ARM_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND rm -rf output.tar.gz + && wget --no-check-certificate ${ARM_BRPC_URL} + && tar zxvf output.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/output.tar.gz . 
+ # && tar zxvf output.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ARM_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ARM_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${ARM_BRPC_LIB} +) + +ADD_LIBRARY(arm_brpc STATIC IMPORTED GLOBAL) # 直接导入已经生成的库 +SET_PROPERTY(TARGET arm_brpc PROPERTY IMPORTED_LOCATION ${ARM_BRPC_LIB}) +ADD_DEPENDENCIES(arm_brpc ${ARM_BRPC_PROJECT}) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake old mode 100644 new mode 100755 index 93bd26655d83a..c891708751aa8 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -41,6 +41,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" + #GIT_REPOSITORY "https://github.com/ziyoujiyi/brpc" # ssl error in the previous repo(can be mannual fixed) GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_PREFIX_DIR} UPDATE_COMMAND "" @@ -74,4 +75,3 @@ ADD_DEPENDENCIES(brpc extern_brpc) add_definitions(-DBRPC_WITH_GLOG) LIST(APPEND external_project_dependencies brpc) - diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake old mode 100644 new mode 100755 index a445633f989cf..056ff32c8c0d9 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -29,35 +29,63 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) -ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GFLAGS_REPOSITORY} - GIT_TAG ${GFLAGS_TAG} - PREFIX ${GFLAGS_PREFIX_DIR} - UPDATE_COMMAND "" - BUILD_COMMAND ${BUILD_COMMAND} - INSTALL_COMMAND ${INSTALL_COMMAND} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DBUILD_STATIC_LIBS=ON - -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} -) +if(WITH_ARM_BRPC) + SET(ARM_GFLAGS_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_gflags.tar.gz" CACHE STRING "" FORCE) + set(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags) + FILE(WRITE ${GFLAGS_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_GFLAGS)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY arm_gflags/bin arm_gflags/include arm_gflags/lib \n" + " DESTINATION . USE_SOURCE_PERMISSIONS)\n") + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + PREFIX ${GFLAGS_PREFIX_DIR} + DOWNLOAD_DIR ${GFLAGS_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_gflags.tar.gz && + wget --no-check-certificate ${ARM_GFLAGS_URL} + && tar zxvf arm_gflags.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_gflags.tar.gz . 
+ # && tar zxvf arm_gflags.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} + ) +else() + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${GFLAGS_REPOSITORY} + GIT_TAG ${GFLAGS_TAG} + PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${BUILD_COMMAND} + INSTALL_COMMAND ${INSTALL_COMMAND} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} + ) +endif() ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake old mode 100644 new mode 100755 index 455f81b041ea6..b2f3afdabf415 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -31,36 +31,65 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) -ExternalProject_Add( - extern_glog - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} - DEPENDS gflags - PREFIX ${GLOG_PREFIX_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DWITH_GFLAGS=OFF - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GLOG_LIBRARIES} -) +if(WITH_ARM_BRPC) + SET(ARM_GLOG_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_glog.tar.gz" CACHE STRING "" FORCE) + set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) + FILE(WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_GLOGS)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY arm_glog/include arm_glog/lib \n" + " DESTINATION . 
USE_SOURCE_PERMISSIONS)\n") + ExternalProject_Add( + extern_glog + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + DEPENDS gflags + PREFIX ${GLOG_PREFIX_DIR} + DOWNLOAD_DIR ${GLOG_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_glog.tar.gz && + wget --no-check-certificate ${ARM_GLOG_URL} + && tar zxvf arm_glog.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_glog.tar.gz . + # && tar zxvf arm_glog.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GLOG_LIBRARIES} + ) +else() + ExternalProject_Add( + extern_glog + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} + DEPENDS gflags + PREFIX ${GLOG_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=OFF + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GLOG_LIBRARIES} + ) +endif() ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake old mode 100644 new mode 100755 index 58ff5f0d2b715..3a59ea6bc92a2 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -219,37 +219,67 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) endif() - - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${PROTOBUF_REPOSITORY} - GIT_TAG ${PROTOBUF_TAG} - PREFIX ${PROTOBUF_PREFIX_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} - BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} - ) + if(WITH_ARM_BRPC) + SET(ARM_PROTOBUF_URL 
"https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_protobuf.tar.gz" CACHE STRING "" FORCE) + FILE(WRITE ${PROTOBUF_SOURCE_DIR}/CMakeLists.txt + "PROJECT(ARM_PROTOBUF)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY arm_protobuf/bin arm_protobuf/include arm_protobuf/lib \n" + " DESTINATION . USE_SOURCE_PERMISSIONS)\n") + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + PREFIX ${PROTOBUF_PREFIX_DIR} + DOWNLOAD_DIR ${PROTOBUF_SOURCE_DIR} + DOWNLOAD_COMMAND rm -rf arm_protobuf.tar.gz + && wget --no-check-certificate ${ARM_PROTOBUF_URL} + && tar zxvf arm_protobuf.tar.gz + #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz . + # && tar zxvf arm_protobuf.tar.gz + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} + ) + else() + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${PROTOBUF_REPOSITORY} + GIT_TAG ${PROTOBUF_TAG} + PREFIX ${PROTOBUF_PREFIX_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake + ${OPTIONAL_ARGS} + -Dprotobuf_BUILD_TESTS=OFF + -DCMAKE_SKIP_RPATH=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + -DBUILD_SHARED_LIBS=OFF + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + ${OPTIONAL_CACHE_ARGS} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} + ) + endif() ENDFUNCTION() if(WITH_ONNXRUNTIME) @@ -258,6 +288,8 @@ elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) +elseif(WITH_ARM_BRPC) + SET(PROTOBUF_VERSION 3.7.1-baidu-ee-common) else() SET(PROTOBUF_VERSION 3.1.0) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake old mode 100644 new mode 100755 index 7df095c6c2ec0..f8a841fecbc0a --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -167,6 +167,13 @@ if(WIN32 OR APPLE) SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) endif() + if(WITH_ARM_BRPC) + MESSAGE(WARNING + "Windows or Mac is not supported with ARM_BRPC in Paddle yet." + "Force WITH_ARM_BRPC=OFF") + SET(WITH_ARM_BRPC OFF CACHE STRING "Disable ARM_BRPC package in Windows and MacOS" FORCE) + endif() + if(WITH_LIBMCT) MESSAGE(WARNING "Windows or Mac is not supported with LIBMCT in Paddle yet." 
@@ -338,9 +345,14 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - - include(external/brpc) - list(APPEND third_party_deps extern_brpc) + + if (WITH_ARM_BRPC) + include(external/arm_brpc) + list(APPEND third_party_deps extern_arm_brpc) + else() + include(external/brpc) + list(APPEND third_party_deps extern_brpc) + endif() include(external/libmct) # download, build, install libmct list(APPEND third_party_deps extern_libmct) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt old mode 100644 new mode 100755 index 977a125627ba5..a36e8e648b193 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,9 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE) +if(WITH_ARM_BRPC) + set(BRPC_DEPS arm_brpc snappy gflags glog) +elseif(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index 006f87ddf5b06..d1e0f21c7dd84 100755 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -93,7 +93,15 @@ class HeterClient { options.timeout_ms = FLAGS_pserver_timeout_ms; std::vector>* client_channels = nullptr; if (peer_role == PEER_ROLE_IS_SWITCH) { +#ifdef PADDLE_WITH_ARM_BRPC + if (need_encrypt) { + options.mutable_ssl_options(); + } + options.connection_type = ""; + VLOG(4) << "ssl enabled in arm"; +#else options.ssl_options.enable = need_encrypt; +#endif client_channels = &peer_switch_channels_; } else if (peer_role == PEER_ROLE_IS_WORKER) { client_channels = &peer_worker_channels_; diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc old mode 100644 new mode 100755 index e21bf093f1915..292b12611c494 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -32,8 +32,13 @@ void HeterServer::StartHeterService(bool neeed_encrypt) { server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE); brpc::ServerOptions options; if (neeed_encrypt) { +#ifdef PADDLE_WITH_ARM_BRPC + options.mutable_ssl_options()->default_cert.certificate = "/cert.pem"; + options.mutable_ssl_options()->default_cert.private_key = "/key.pem"; +#else options.ssl_options.default_cert.certificate = "/cert.pem"; options.ssl_options.default_cert.private_key = "/key.pem"; +#endif } if (server_.Start(endpoint_.c_str(), &options) != 0) { VLOG(0) << "HeterServer start fail. 
Try again."; @@ -67,8 +72,13 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) { server_inter_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE); brpc::ServerOptions options; if (neeed_encrypt) { +#ifdef PADDLE_WITH_ARM_BRPC + options.mutable_ssl_options()->default_cert.certificate = "/cert.pem"; + options.mutable_ssl_options()->default_cert.private_key = "/key.pem"; +#else options.ssl_options.default_cert.certificate = "/cert.pem"; options.ssl_options.default_cert.private_key = "/key.pem"; +#endif } if (server_inter_.Start(endpoint_inter_.c_str(), &options) != 0) { VLOG(4) << "switch inter server start fail. Try again."; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ad9f37b98bd3d..b6a7aea4f9cd7 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -126,6 +126,9 @@ if(WITH_PSLIB) elseif(NOT WITH_HETERPS) set(BRPC_DEPS brpc ssl crypto) endif() + if (WITH_ARM_BRPC) + set(BRPC_DEPS arm_brpc) + endif() endif() cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 863370540da82..de0ee481aa6e7 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -6,7 +6,11 @@ include(operators) set(DISTRIBUTE_DEPS "") -list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context) +if (WITH_ARM_BRPC) + list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc arm_brpc gflags glog snappy device_context) +else() + list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context) +endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") @@ -40,3 +44,6 @@ cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS e #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) + +set_source_files_properties(switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_binary(switch_server_test SRCS switch_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) diff --git a/paddle/fluid/operators/pscore/switch_server_test.cc b/paddle/fluid/operators/pscore/switch_server_test.cc new file mode 100755 index 0000000000000..2286b99bb8da5 --- /dev/null +++ b/paddle/fluid/operators/pscore/switch_server_test.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined PADDLE_WITH_PSCORE +#include + +#include +#include +#include +#include +#include // NOLINT + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::distributed; + +DEFINE_string(switch_addr_inner, "127.0.0.1:6000", "addr of inner cluster"); +DEFINE_string(switch_addr_heter, "127.0.0.1:6100", "add of inter cluster"); +DEFINE_string(peer_switch_addr, "127.0.0.1:7100", "add of inter cluster"); + +void StartSwitchServer( + std::shared_ptr& switch_server_ptr, // NOLINT + std::vector endpoints, + std::vector peer_endpoints) { + switch_server_ptr->SetPeerEndPoints(peer_endpoints); + switch_server_ptr->SetEndPoint(endpoints[0]); + switch_server_ptr->StartHeterService(false); +} + +void StartSwitchInterServer( + std::shared_ptr& switch_server_ptr, // NOLINT + std::vector endpoints, + std::vector peer_endpoints) { + LOG(INFO) << "switch heter service started"; + switch_server_ptr->SetPeerEndPoints(peer_endpoints); + switch_server_ptr->SetInterEndpoint(endpoints[0]); + switch_server_ptr->StartHeterInterService(false); +} + +int main(int argc, char* argv[]) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor exe(place); + + framework::ProgramDesc program; + exe.Prepare(program, 0); // solve undefined symbol: tensor_table.cc + + google::ParseCommandLineFlags(&argc, &argv, true); + + std::string switch_a_endpoint(FLAGS_switch_addr_inner); + std::string switch_a_endpoint_inter(FLAGS_switch_addr_heter); + std::string switch_b_endpoint_inter(FLAGS_peer_switch_addr); + + std::shared_ptr switch_server_ptr_a = + std::make_shared(); + + std::vector end_points{switch_a_endpoint}; + std::vector peer_endpoints{switch_b_endpoint_inter}; + std::thread switch_server_a_thread(StartSwitchServer, + std::ref(switch_server_ptr_a), end_points, + peer_endpoints); + switch_server_ptr_a->WaitServerReady(); + + end_points = {switch_a_endpoint_inter}; + peer_endpoints = {switch_b_endpoint_inter}; + std::thread switch_server_a_thread_inter(StartSwitchInterServer, + std::ref(switch_server_ptr_a), + end_points, peer_endpoints); + switch_server_ptr_a->WaitServerReady(); + + switch_server_a_thread.join(); + LOG(INFO) << "switch_server_a_thread joined"; + + switch_server_a_thread_inter.join(); + LOG(INFO) << "switch_server_a_thread_inter joined"; + + return 0; +} +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt old mode 100644 new mode 100755 index 42eb79d75f857..b0ebe5026b5d4 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -133,7 +133,11 @@ if (WITH_PSLIB) set_source_files_properties(heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endif(WITH_PSLIB) if (WITH_PSCORE) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable 
-Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + if (WITH_ARM_BRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + else() + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + endif() set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler) list(APPEND PYBIND_SRCS fleet_py.cc) From ebf4fe6e5b849f74decbe0327c0c989adbab2e0d Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Sat, 16 Apr 2022 10:36:53 +0800 Subject: [PATCH 191/211] Lml/prim op pywrapper (#41813) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * format python code * support multi input in triple gradient checker * Add matmul triple grad kernel * Updated comments of TODO * Supported some special tests * Change code-format to follow CI std * Updated gradient_checker.py * Fix conflicts * Removed unnecessary printing log * Change code style to follow CI std * merge upstream * add priops.py * add_p * rm useless files * add sub_p mul_p div_p * add sqrt_p and tanh_p * add reshape_p * add broadcast_p * Add python primitive wrappers. * Jvp rules updated. * JVP rules done for all the 17 primops. * quick check and fixes. * add jvp(op, *args) * add broadcast_p fill_constant_p matmul_p reduce_p reshape_p transpose_p * add split_p and concat_p * add gather_p and scatter_add_p * add slice_select_p and slice_assign_p * Add transpose rules. * add multi input check for add_p, sub_p, mul_p, div_p * update concat_p * Linearize and transpose in progress.. * refine gather_p and scatter_add_p * updated. * update transpose. * refine slice_assign_p and slice_select_p * init commit for lower * Merged with primitive ops. * small update * add rules for orig2prim and prim2orig * add 9 test for prim ops * add more test and fix some bug * add more test * register proto * Adding primops test. 
* add shape valid check for broadcast_p op, and add keepdim attr into reduce_p op proto * support multi input and multi output for split_p and concat_p * Test updated. * update * fix slice bug for slice_select_p and slice_assign_p * updated. * Ops updated. * Refactor and bug fixes. * updated. * finish orig2prim and prim2orig rules * dtype for axis attr should be long int * update dtype for axis attr int64_t * update for iscan CI * Update primx. * Refactor vars in primx. * update for lower transform * update primx.py * update * Fix linearize and transpose. * Update is_dot * Update is_dot * Update is_dot * add gradient aggregation, fix add_transpose. * pass first linearize+transpose test. * update test * add_prim_op_pywrapper * Add primops UT * Fix set_value and update * Fix code format and PR-CI-Coverage Co-authored-by: veyron95 Co-authored-by: Jiabin Yang <360788950@qq.com> Co-authored-by: Tongxin Bai Co-authored-by: 0x45f --- python/paddle/autograd/primops.py | 267 ++++++++++++++++++ python/paddle/autograd/primreg.py | 54 ++++ .../fluid/tests/unittests/test_primops.py | 147 ++++++++++ 3 files changed, 468 insertions(+) create mode 100644 python/paddle/autograd/primops.py create mode 100644 python/paddle/autograd/primreg.py create mode 100644 python/paddle/fluid/tests/unittests/test_primops.py diff --git a/python/paddle/autograd/primops.py b/python/paddle/autograd/primops.py new file mode 100644 index 0000000000000..66f641e54467c --- /dev/null +++ b/python/paddle/autograd/primops.py @@ -0,0 +1,267 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid import unique_name, core +from paddle.fluid.framework import default_main_program, default_startup_program +from paddle.fluid.layer_helper import LayerHelper +from .primreg import REGISTER_FN + + +def _simple_unop(helper): + optype = helper.layer_type + x, out = tuple(map(helper.kwargs.get, ('x', 'out'))) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op(type=optype, inputs={'X': x}, outputs={'Y': out}, attrs={}) + return out + + +def _simple_binop(helper): + optype = helper.layer_type + x, y, out = tuple(map(helper.kwargs.get, ('x', 'y', 'out'))) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type=optype, inputs={'X': x, + 'Y': y}, outputs={'Z': out}, attrs={}) + return out + + +def _manipulation_unop(helper): + optype = helper.layer_type + x, out = tuple(map(helper.kwargs.get, ('x', 'out'))) + + attrs = { + k: helper.kwargs[k] + for k in ('shape', 'axis', 'index') if k in helper.kwargs + } + + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type=optype, inputs={'X': x}, outputs={'Y': out}, attrs=attrs) + return out + + +# Each primitive op is given a Python constructor for sake of convenience. 
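# A minimal usage sketch of these constructors (adapted from the
# test_primops.py unit test added later in this patch; the variable
# names below are illustrative): each wrapper builds a LayerHelper,
# creates the output variable if none is given, and appends the
# corresponding *_p op to the current static program.
#
#   import paddle
#   from paddle.autograd.primops import add, mul, fill_const
#
#   paddle.enable_static()
#   a = paddle.static.data(name='A', shape=[2, 3], dtype='float32')
#   one = fill_const(1.0, a.shape, a.dtype)   # appends a fill_constant_p op
#   out = mul(add(a, one), a)                 # appends add_p and mul_p ops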
+def fill_const(value, shape, dtype, out=None): + attrs = {'value': value, 'shape': shape, 'dtype': dtype} + helper = LayerHelper('fill_constant_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type=helper.layer_type, outputs={'Y': out}, attrs=attrs) + return out + + +def neg(x, out=None): + zero = fill_const(0.0, x.shape, x.dtype) + return sub(zero, x) + + +def set_value(x, y, axis, starts, ends, strides, out): + assert x is out, "x and out should be the same Tensor in set_value" + attrs = {'axes': axis, 'starts': starts, 'ends': ends, 'steps': strides} + helper = LayerHelper('set_value', **locals()) + helper.append_op( + type=helper.layer_type, + inputs={'Input': x, + 'ValueTensor': y}, + outputs={'Out': out}, + attrs=attrs) + return out + + +@REGISTER_FN('add_p', 'X', 'Y', 'Z') +def add(x, y, out=None): + return _simple_binop(LayerHelper('add_p', **locals())) + + +@REGISTER_FN('sub_p', 'X', 'Y', 'Z') +def sub(x, y, out=None): + return _simple_binop(LayerHelper('sub_p', **locals())) + + +@REGISTER_FN('mul_p', 'X', 'Y', 'Z') +def mul(x, y, out=None): + return _simple_binop(LayerHelper('mul_p', **locals())) + + +@REGISTER_FN('div_p', 'X', 'Y', 'Z') +def div(x, y, out=None): + return _simple_binop(LayerHelper('div_p', **locals())) + + +@REGISTER_FN('sqrt_p', 'X', 'Y') +def sqrt(x, out=None): + return _simple_unop(LayerHelper('sqrt_p', **locals())) + + +@REGISTER_FN('tanh_p', 'X', 'Y') +def tanh(x, out=None): + return _simple_unop(LayerHelper('tanh_p', **locals())) + + +@REGISTER_FN('reshape_p', 'X', 'Y') +def reshape(x, shape, out=None): + return _manipulation_unop(LayerHelper('reshape_p', **locals())) + + +@REGISTER_FN('broadcast_p', 'X', 'Y') +def broadcast(x, shape, out=None): + return _manipulation_unop(LayerHelper('broadcast_p', **locals())) + + +@REGISTER_FN('transpose_p', 'X', 'Y') +def transpose(x, axis=None, out=None): + return _manipulation_unop(LayerHelper('transpose_p', **locals())) + + +@REGISTER_FN('split_p', 'X', 'YS') +def split(x, num_or_sections, axis=0, outs=None): + if isinstance(num_or_sections, (list, tuple)): + n = len(num_or_sections) + else: + assert isinstance(num_or_sections, int) + n = num_or_sections + + attrs = {'num_or_sections': num_or_sections, 'axis': axis} + + helper = LayerHelper('split_p', **locals()) + if outs is None: + outs = [ + helper.create_variable_for_type_inference(dtype=x.dtype) + for i in range(n) + ] + helper.append_op( + type=helper.layer_type, + inputs={'X': x}, + outputs={'YS': outs}, + attrs=attrs) + return outs + + +@REGISTER_FN('concat_p', 'XS', 'Y') +def concat(xs, axis=0, out=None): + assert isinstance(xs, (list, tuple)) and len(xs) > 0 + attrs = {'axis': axis} + helper = LayerHelper('concat_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=xs[0].dtype) + helper.append_op( + type=helper.layer_type, + inputs={'XS': xs}, + outputs={'Y': out}, + attrs=attrs) + return out + + +@REGISTER_FN('reduce_p', 'X', 'Y') +def reduce(x, axis, keepdim=False, out=None): + assert isinstance(axis, (tuple, list)) + assert isinstance(keepdim, bool) + + attrs = {'axis': axis, 'keepdim': keepdim} + + helper = LayerHelper('reduce_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type=helper.layer_type, + inputs={'X': x}, + outputs={'Y': out}, + attrs=attrs) + return out + + +@REGISTER_FN('matmul_p', 'X', 'Y', 'Z') +def matmul(x, y, out=None): + return 
_simple_binop(LayerHelper('matmul_p', **locals())) + + +@REGISTER_FN('slice_select_p', 'X', 'Y') +def slice_select(x, axis, starts, ends, strides, out=None): + assert isinstance(axis, (list, tuple)), ( + f'Argument type error. `axis` is supposed to be int, list or' + f' tuple but found {type(axis)}.') + assert isinstance(starts, (list, tuple)) + assert isinstance(ends, (list, tuple)) + assert len(axis) == len(starts) == len(ends) == len(strides) + + attrs = {'axis': axis, 'starts': starts, 'ends': ends, 'strides': strides} + helper = LayerHelper('slice_select_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type=helper.layer_type, + inputs={'X': x}, + outputs={'Y': out}, + attrs=attrs) + return out + + +@REGISTER_FN('slice_assign_p', 'X', 'Y', 'Z') +def slice_assign(x, y, axis, starts, ends, strides, out=None): + assert len(starts) == len(ends) == len(strides) == len(axis) + assert len(y.shape) == len(x.shape) + + attrs = {'axis': axis, 'starts': starts, 'ends': ends, 'strides': strides} + helper = LayerHelper('slice_assign_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type=helper.layer_type, + inputs={'X': x, + 'Y': y}, + outputs={'Z': out}, + attrs=attrs) + return out + + +@REGISTER_FN('gather_p', 'X', 'Y') +def gather(x, indextensor, axis, out=None): + attrs = {'axis': axis} + helper = LayerHelper('gather_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type=helper.layer_type, + inputs={'X': x, + 'IndexTensor': indextensor}, + outputs={'Y': out}, + attrs=attrs) + return out + + +@REGISTER_FN('scatter_add_p', 'X', 'Y', 'IndexTensor', 'Z') +def scatter_add(x, y, indextensor, axis, out=None): + assert len(x.shape) == len(y.shape) + assert len(indextensor.shape) == 1 + assert y.shape[axis] == indextensor.shape[0] + attrs = {'axis': axis} + helper = LayerHelper('scatter_add_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type=helper.layer_type, + inputs={'X': x, + 'Y': y, + 'IndexTensor': indextensor}, + outputs={'Z': out}, + attrs=attrs) + return out diff --git a/python/paddle/autograd/primreg.py b/python/paddle/autograd/primreg.py new file mode 100644 index 0000000000000..cffb4bc050b4b --- /dev/null +++ b/python/paddle/autograd/primreg.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + + +class Registry(object): + """ A general registry object. 
""" + __slots__ = ['name', 'tab'] + + def __init__(self, name): + self.name = name + self.tab = {} + + def register(self, name, value): + assert name not in self.tab + self.tab[name] = value + + def lookup(self, name): + assert name in self.tab, f'No registry entry is found with name: {name}' + return self.tab[name] + + +_primop_fn = Registry('primop_fn') +_orig2prim = Registry('orig2prim') +_prim2orig = Registry('prim2orig') +_primop_jvp = Registry('primop_jvp') +_primop_transpose = Registry('primop_transpose') +_primop_position_argnames = Registry('primop_position_argnames') + + +def REGISTER_FN(op_type, *position_argnames): + """Decorator for registering the Python function for a primitive op.""" + + assert isinstance(op_type, str) + + _primop_position_argnames.register(op_type, position_argnames) + + def wrapper(f): + _primop_fn.register(op_type, f) + return f + + return wrapper diff --git a/python/paddle/fluid/tests/unittests/test_primops.py b/python/paddle/fluid/tests/unittests/test_primops.py new file mode 100644 index 0000000000000..cbf77c2666611 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_primops.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.autograd.primops import ( + neg, set_value, add, sub, mul, div, sqrt, tanh, reshape, broadcast, + transpose, split, concat, reduce, matmul, slice_select, slice_assign, + gather, scatter_add, fill_const) + + +class TestPyPrimOps(unittest.TestCase): + """ Test Python wrappers of primitive ops. 
""" + + def setUp(self): + paddle.enable_static() + + def test_ops(self): + A = np.random.rand(1) + B = np.random.rand(2) + C = np.random.rand(2, 3) + D = np.random.rand(2, 3) + E = np.random.rand(3, 2) + + a = paddle.static.data(name='A', shape=A.shape, dtype='float32') + b = paddle.static.data(name='B', shape=B.shape, dtype='float32') + c = paddle.static.data(name='C', shape=C.shape, dtype='float32') + d = paddle.static.data(name='D', shape=D.shape, dtype='float32') + e = paddle.static.data(name='E', shape=E.shape, dtype='float32') + + add_1 = add(a, a) + self.assertEqual(add_1.dtype, a.dtype) + self.assertEqual(add_1.shape, a.shape) + + add_2 = add(c, d) + self.assertEqual(add_2.dtype, c.dtype) + self.assertEqual(add_2.shape, c.shape) + + sub_1 = sub(c, d) + self.assertEqual(sub_1.dtype, c.dtype) + self.assertEqual(sub_1.shape, c.shape) + + mul_1 = mul(c, d) + self.assertEqual(mul_1.dtype, c.dtype) + self.assertEqual(mul_1.shape, c.shape) + + div_1 = div(c, d) + self.assertEqual(div_1.dtype, c.dtype) + self.assertEqual(div_1.shape, c.shape) + + sqrt_1 = sqrt(b) + self.assertEqual(sqrt_1.dtype, b.dtype) + self.assertEqual(sqrt_1.shape, b.shape) + + tanh_1 = tanh(d) + self.assertEqual(tanh_1.dtype, d.dtype) + self.assertEqual(tanh_1.shape, d.shape) + + reshape_1 = reshape(c, d.shape) + self.assertEqual(reshape_1.dtype, c.dtype) + self.assertEqual(reshape_1.shape, d.shape) + + broadcast_1 = broadcast(b, e.shape) + self.assertEqual(broadcast_1.dtype, b.dtype) + self.assertEqual(broadcast_1.shape, e.shape) + + transpose_1 = transpose(c, axis=[1, 0]) + self.assertEqual(transpose_1.dtype, c.dtype) + self.assertEqual(transpose_1.shape, e.shape) + + split_1_0, split_1_1 = split(c, num_or_sections=[1, 2], axis=1) + self.assertEqual(split_1_0.dtype, c.dtype) + self.assertEqual(split_1_0.shape, (2, 1)) + self.assertEqual(split_1_1.shape, (2, 2)) + + concat_1 = concat([c, d], axis=0) + self.assertEqual(concat_1.dtype, c.dtype) + self.assertEqual(concat_1.shape, (4, 3)) + + reduce_1 = reduce(d, axis=[1]) + self.assertEqual(reduce_1.dtype, d.dtype) + self.assertEqual(reduce_1.shape, (2, )) + + reduce_2 = reduce(c, axis=[0, 1]) + self.assertEqual(reduce_2.dtype, c.dtype) + self.assertEqual(reduce_2.shape, (1, )) + # TODO: reduce + keepdim + + matmul_1 = matmul(d, e) + self.assertEqual(matmul_1.dtype, d.dtype) + self.assertEqual(matmul_1.shape, (2, 2)) + + slice_select_1 = slice_select( + e, axis=[0], starts=[0], ends=[2], strides=[1]) + self.assertEqual(slice_select_1.dtype, e.dtype) + self.assertEqual(slice_select_1.shape, (2, 2)) + + slice_select_2 = slice_select( + d, axis=[0, 1], starts=[0, 1], ends=[2, 3], strides=[1, 2]) + self.assertEqual(slice_select_2.dtype, d.dtype) + self.assertEqual(slice_select_2.shape, (2, 1)) + + y = broadcast(b, [2, 2]) + slice_assign_1 = slice_assign( + d, y, axis=[1], starts=[1], ends=[3], strides=[1]) + self.assertEqual(slice_assign_1.dtype, d.dtype) + self.assertEqual(slice_assign_1.shape, d.shape) + + index = paddle.static.data('index', shape=[5], dtype='int32') + gather_1 = gather(e, index, axis=0) + self.assertEqual(gather_1.dtype, e.dtype) + self.assertEqual(gather_1.shape, (5, 2)) + + y = paddle.rand([5, 2], dtype='float32') + scatter_add_1 = scatter_add(e, y, index, axis=0) + self.assertEqual(scatter_add_1.dtype, e.dtype) + self.assertEqual(scatter_add_1.shape, e.shape) + + fill_const_1 = fill_const(value=10, shape=a.shape, dtype=a.dtype) + self.assertEqual(fill_const_1.shape, a.shape) + self.assertEqual(fill_const_1.dtype, a.dtype) + + neg_1 = neg(x=b) + 
self.assertEqual(neg_1.shape, b.shape) + self.assertEqual(neg_1.dtype, b.dtype) + + set_value_1 = set_value( + d, a, axis=[1], starts=[1], ends=[3], strides=[1], out=d) + self.assertEqual(set_value_1.shape, d.shape) + self.assertEqual(set_value_1.dtype, d.dtype) + + +if __name__ == '__main__': + unittest.main() From e9a632375f7859a6a3e5b2a1f6b98515aa5af9b0 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Sat, 16 Apr 2022 12:09:36 +0800 Subject: [PATCH 192/211] Moe ref (#41864) * moe ref * ref commit; test=document_fix * update; test=document_fix * update test=document_fix * update; test=document_fix --- .../paddle/incubate/distributed/models/moe/gate/naive_gate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py index 491d1f95e10cb..c3c68685445c8 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py @@ -13,7 +13,7 @@ # limitations under the License. # # The file has been adapted from the file: -# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/naive_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. From 5e5ae0a004d64e21ae61c75456bc8da81ac392f3 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Sat, 16 Apr 2022 13:50:29 +0800 Subject: [PATCH 193/211] fix_sharding_copy_right (#41849) --- .../dygraph_optimizer/sharding_optimizer_stage2.py | 13 ++++++++++--- .../sharding/group_sharded_optimizer_stage2.py | 13 ++++++++++--- .../meta_parallel/sharding/group_sharded_stage2.py | 13 ++++++++++--- .../meta_parallel/sharding/group_sharded_storage.py | 13 ++++++++++--- .../fleet/meta_parallel/sharding/sharding_stage2.py | 13 ++++++++++--- .../distributed/fleet/utils/internal_storage.py | 13 ++++++++++--- 6 files changed, 60 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index a2c741667ed77..fb43b89e1a623 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
import copy import logging diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index 9df68dc419efa..70d2d2a1930c9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. import copy import logging diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 5f39ea0fd900f..0c045c45fd599 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. import logging import time diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py index 7a57fb29b9472..4d706870a91e9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. import os import time diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index c6f05023e6138..b09d256d9bb60 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. import os import contextlib diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py index 469da22366748..80d8d8562d48f 100644 --- a/python/paddle/distributed/fleet/utils/internal_storage.py +++ b/python/paddle/distributed/fleet/utils/internal_storage.py @@ -11,9 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Taken and modified for fairscale from: -# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py -#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +# The file has been adapted from fairscale file: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py +# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e +# We retain the following license from the original files: + +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
import os import time From f3753b7f6705f5f56a38f82886ab053efdd47634 Mon Sep 17 00:00:00 2001 From: helen88 Date: Sat, 16 Apr 2022 16:37:03 +0800 Subject: [PATCH 194/211] modify xpu.cmake,*test=kunlun (#41832) * modify xpu.cmake,*test=kunlun * modify xpu.cmake,*test=kunlun * modify xpu.cmake,*test=kunlun * modify xpu.cmake,*test=kunlun --- cmake/external/xpu.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index eb2fea91a6290..cda8029bfe4e4 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -24,9 +24,9 @@ endif() IF(WITH_AARCH64) SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") - SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") + SET(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64") SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") - SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) + SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) ELSEIF(WITH_SUNWAY) SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") From 21aa3adc706f3460f97aa4f5699bbea2034b259f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Sat, 16 Apr 2022 22:27:35 +0800 Subject: [PATCH 195/211] move fc_functor from fluid to phi.test=develop (#41856) --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/attention_lstm_op.cc | 4 +- paddle/fluid/operators/fc_op.h | 4 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 6 +- .../fluid/operators/fused/fusion_lstm_op.cc | 6 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 4 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 4 +- paddle/fluid/operators/fused/multi_gru_op.cc | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/fc.cc | 88 ----------- paddle/fluid/operators/math/fc.cu | 133 ---------------- paddle/phi/kernels/funcs/CMakeLists.txt | 1 + paddle/phi/kernels/funcs/fc_functor.cc | 106 +++++++++++++ paddle/phi/kernels/funcs/fc_functor.cu | 149 ++++++++++++++++++ .../fc.h => phi/kernels/funcs/fc_functor.h} | 24 +-- 15 files changed, 286 insertions(+), 248 deletions(-) delete mode 100644 paddle/fluid/operators/math/fc.cc delete mode 100644 paddle/fluid/operators/math/fc.cu create mode 100644 paddle/phi/kernels/funcs/fc_functor.cc create mode 100644 paddle/phi/kernels/funcs/fc_functor.cu rename paddle/{fluid/operators/math/fc.h => phi/kernels/funcs/fc_functor.h} (62%) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 63bf3ab6a0382..3112d0d8205a8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,7 +166,7 @@ lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS 
${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 78ea8b6b6fbeb..bf7d609370a8d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" namespace paddle { namespace operators { @@ -377,7 +377,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, 1, M, x_data, atten_w_data, atted_x_data, atten_b_data); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index dfa10e6de72e8..6d3b531ce0aa6 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/fc.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" namespace paddle { namespace operators { @@ -80,7 +80,7 @@ class FCOpKernel : public framework::OpKernel { T* output_data = output->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, M, w_dims1, w_dims0, input_data, w_data, output_data, bias ? bias->data() : NULL, with_relu, padding_weights); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 3311e3b4ebc9e..afbd5380a8301 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/fc.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" #include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -298,7 +298,7 @@ class FusionGRUKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data, bias ? bias->data() : nullptr); @@ -370,7 +370,7 @@ class FusionGRUKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(dev_ctx); phi::funcs::LoDTensor2BatchFunctor to_batch; - math::FCFunctor fc; + phi::funcs::FCFunctor fc; if (M > D3) { fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data, bias ? bias->data() : nullptr); diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 00be8b09d1296..3dada660aeffe 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/fc.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" #include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -346,7 +346,7 @@ class FuisonLSTMKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data()); int xx_offset = D4; @@ -424,7 +424,7 @@ class FuisonLSTMKernel : public framework::OpKernel { phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; if (M > D4) { fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data()); to_batch(dev_ctx, *xx, batched_input, true, is_reverse); diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index f71cf1fd43374..ee28a54805653 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h" #include // for min, max #include -#include "paddle/fluid/operators/math/fc.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" namespace paddle { namespace operators { @@ -244,7 +244,7 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { } } auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, x_dims[0], w_dims[1], w_dims[0], col_data, w_data, y_data, b_data, true); } diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 1000d0488dc3f..58613173ad212 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include -#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" namespace paddle { namespace operators { @@ -212,7 +212,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); auto& dev_ctx = ctx.template device_context(); - math::FCFunctor fc; + phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, D, M0, ref_in_data, w_data, out_data, b ? b->data() : NULL); w_data = w_data + M0 * D; diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index c2260c53b2edd..e7d697767fcac 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/fc.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" #include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index df8150b192b6c..913ce07ec673c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -36,7 +36,6 @@ if (WITH_ASCEND_CL) else() math_library(beam_search DEPS math_function) endif() -math_library(fc DEPS blas jit_kernel_helper) math_library(matrix_bit_code) diff --git a/paddle/fluid/operators/math/fc.cc b/paddle/fluid/operators/math/fc.cc deleted file mode 100644 index 4599177fc13aa..0000000000000 --- a/paddle/fluid/operators/math/fc.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/fc.h" - -#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class FCFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, const int M, - const int N, const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false, - bool padding_weights = false) { - auto blas = phi::funcs::GetBlas(context); - framework::Tensor Y1; - T* Y1_data = nullptr; - if (padding_weights) { - const int NN = N + 4; - const int KK = K + 4; - framework::Tensor X1; - T* X1_data = X1.mutable_data({M * KK}, platform::CPUPlace()); - Y1_data = Y1.mutable_data({M * (N + 4)}, platform::CPUPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < M; i++) { - memcpy(X1_data + i * KK, X + i * K, K * sizeof(T)); - } - blas.GEMM(false, false, M, N, K, static_cast(1.0), X1_data, KK, W, NN, - static_cast(0.0), Y1_data, NN); - } else { - blas.MatMul(M, N, K, X, W, Y); - } - if (B == NULL) { - if (padding_weights) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < M; i++) { - memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(T)); - } - } - PADDLE_ENFORCE_EQ(relu, false, - platform::errors::PermissionDenied( - "When bias is NULL, relu can not be true.")); - return; - } - auto compute = - relu - ? jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N) - : jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(N); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < M; i++) { - T* dst = Y + i * N; - T* src = (padding_weights) ? 
Y1_data + i * (N + 4) : dst; - compute(B, src, dst, N); - } - } -}; - -template class FCFunctor; -template class FCFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/fc.cu b/paddle/fluid/operators/math/fc.cu deleted file mode 100644 index 2f94eef34a320..0000000000000 --- a/paddle/fluid/operators/math/fc.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/operators/math/fc.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct FcTypeTraits; - -template <> -struct FcTypeTraits { - typedef float4 Type; -}; - -template <> -struct FcTypeTraits { - typedef double4 Type; -}; - -template -__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int bias_idx = tid % K; - const T bias_ptr = bias[bias_idx]; - const T in_ptr = data[tid]; - T packed_val; - packed_val.x = in_ptr.x + bias_ptr.x; - packed_val.y = in_ptr.y + bias_ptr.y; - packed_val.z = in_ptr.z + bias_ptr.z; - packed_val.w = in_ptr.w + bias_ptr.w; - if (DoRelu) { - packed_val.x = fmaxf(0.f, packed_val.x); - packed_val.y = fmaxf(0.f, packed_val.y); - packed_val.z = fmaxf(0.f, packed_val.z); - packed_val.w = fmaxf(0.f, packed_val.w); - } - data[tid] = packed_val; - } -} - -template -__global__ void InplaceAddReluKernel(const int N, const T* bias, T* data) { - int offset = blockIdx.x * N; - - for (int i = threadIdx.x; i < N; i += BlockDim) { - T temp; -#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 - temp = __ldg(data + offset + i) + __ldg(bias + i); -#else - temp = data[offset + i] + bias[i]; -#endif - if (DoRelu) { - data[offset + i] = static_cast(temp > 0) * temp; - } else { - data[offset + i] = temp; - } - } -} - -template -class FCFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const int M, - const int N, const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false, - bool padding_weights = false) { - PADDLE_ENFORCE_EQ( - padding_weights, false, - platform::errors::PermissionDenied( - "Weight padding in fc can not be used in GPU scope.")); - auto blas = phi::funcs::GetBlas(context); - blas.GEMM(false, false, M, N, K, static_cast(1.0), X, K, W, N, - static_cast(0.0), Y, N); - if (B == NULL) { - return; - } - - // M * N - if (N % 4 == 0) { - const int threads = 256; - const int num = M * N / 4; - const int blocks = (num + threads - 1) / threads; - typedef typename FcTypeTraits::Type trans_type; - auto* bias_ptr_v4 = reinterpret_cast(B); - auto* data_ptr_v4 = reinterpret_cast(Y); - if (relu) { - bias_relu_v4<<>>( - num, bias_ptr_v4, data_ptr_v4, N / 4); - } else { - bias_relu_v4<<>>( - num, bias_ptr_v4, data_ptr_v4, N / 4); - } - } else { - const int threads = 256; - const int blocks = M; - if (relu) { - InplaceAddReluKernel<<>>( - N, 
B, Y); - } else { - InplaceAddReluKernel<<>>( - N, B, Y); - } - } - } -}; - -template class FCFunctor; -template class FCFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index b1f010cdff103..6d16fc8f81895 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(detail) math_library(deformable_conv_functor DEPS dense_tensor) math_library(concat_and_split_functor DEPS dense_tensor) +math_library(fc_functor DEPS blas jit_kernel_helper) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) math_library(math_function DEPS blas dense_tensor tensor) diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc new file mode 100644 index 0000000000000..e14f8522c969a --- /dev/null +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/fc_functor.h" + +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { + +template +void FCFunctor::operator()(const DeviceContext& context, + const int M, + const int N, + const int K, + const T* X, + const T* W, + T* Y, + const T* B, + bool relu, + bool padding_weights) { + auto blas = GetBlas(context); + paddle::framework::Tensor Y1; + T* Y1_data = nullptr; + if (padding_weights) { + const int NN = N + 4; + const int KK = K + 4; + paddle::framework::Tensor X1; + T* X1_data = X1.mutable_data({M * KK}, paddle::platform::CPUPlace()); + Y1_data = Y1.mutable_data({M * (N + 4)}, paddle::platform::CPUPlace()); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + memcpy(X1_data + i * KK, X + i * K, K * sizeof(T)); + } + blas.GEMM(false, + false, + M, + N, + K, + static_cast(1.0), + X1_data, + KK, + W, + NN, + static_cast(0.0), + Y1_data, + NN); + } else { + blas.MatMul(M, N, K, X, W, Y); + } + if (B == NULL) { + if (padding_weights) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(T)); + } + } + PADDLE_ENFORCE_EQ( + relu, + false, + errors::PermissionDenied("When bias is NULL, relu can not be true.")); + return; + } + auto compute = relu + ? paddle::operators::jit::KernelFuncs< + paddle::operators::jit::VAddReluTuple, + paddle::platform::CPUPlace>::Cache() + .At(N) + : paddle::operators::jit::KernelFuncs< + paddle::operators::jit::VAddTuple, + paddle::platform::CPUPlace>::Cache() + .At(N); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + T* dst = Y + i * N; + T* src = (padding_weights) ? 
Y1_data + i * (N + 4) : dst; + compute(B, src, dst, N); + } +} + +template class FCFunctor; +template class FCFunctor; +template class FCFunctor; +template class FCFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu new file mode 100644 index 0000000000000..a26f0edcab272 --- /dev/null +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -0,0 +1,149 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/fc_functor.h" + +namespace phi { +namespace funcs { + +template +struct FcTypeTraits; + +template <> +struct FcTypeTraits { + typedef float4 Type; +}; + +template <> +struct FcTypeTraits { + typedef double4 Type; +}; + +template +__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < num) { + int bias_idx = tid % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[tid]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + packed_val.z = in_ptr.z + bias_ptr.z; + packed_val.w = in_ptr.w + bias_ptr.w; + if (DoRelu) { + packed_val.x = fmaxf(0.f, packed_val.x); + packed_val.y = fmaxf(0.f, packed_val.y); + packed_val.z = fmaxf(0.f, packed_val.z); + packed_val.w = fmaxf(0.f, packed_val.w); + } + data[tid] = packed_val; + } +} + +template +__global__ void InplaceAddReluKernel(const int N, const T* bias, T* data) { + int offset = blockIdx.x * N; + + for (int i = threadIdx.x; i < N; i += BlockDim) { + T temp; +#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + if (DoRelu) { + data[offset + i] = static_cast(temp > 0) * temp; + } else { + data[offset + i] = temp; + } + } +} + +template +void FCFunctor::operator()(const DeviceContext& context, + const int M, + const int N, + const int K, + const T* X, + const T* W, + T* Y, + const T* B, + bool relu, + bool padding_weights) { + PADDLE_ENFORCE_EQ(padding_weights, + false, + errors::PermissionDenied( + "Weight padding in fc can not be used in GPU scope.")); + auto blas = phi::funcs::GetBlas(context); + blas.GEMM(false, + false, + M, + N, + K, + static_cast(1.0), + X, + K, + W, + N, + static_cast(0.0), + Y, + N); + if (B == NULL) { + return; + } + + // M * N + if (N % 4 == 0) { + const int threads = 256; + const int num = M * N / 4; + const int blocks = (num + threads - 1) / threads; + typedef typename FcTypeTraits::Type trans_type; + auto* bias_ptr_v4 = reinterpret_cast(B); + auto* data_ptr_v4 = reinterpret_cast(Y); + if (relu) { + bias_relu_v4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else { + bias_relu_v4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } + } else { + const int threads = 256; + const int blocks = M; + if (relu) { + 
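// Assumed full launch form for these two epilogue branches (the template and
// <<<...>>> arguments are not visible in this listing): the packed path launches
//   bias_relu_v4<trans_type, DoRelu><<<blocks, threads, 0, stream>>>(num, B4, Y4, N / 4);
// and this per-row fallback launches
//   InplaceAddReluKernel<DoRelu, threads><<<blocks, threads, 0, stream>>>(N, B, Y);
// with the DoRelu template flag chosen by the runtime `relu` argument.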
InplaceAddReluKernel<<>>( + N, B, Y); + } else { + InplaceAddReluKernel<<>>( + N, B, Y); + } + } +} + +template class FCFunctor; +template class FCFunctor; + +template class FCFunctor; +template class FCFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/fc.h b/paddle/phi/kernels/funcs/fc_functor.h similarity index 62% rename from paddle/fluid/operators/math/fc.h rename to paddle/phi/kernels/funcs/fc_functor.h index 02f81587c739f..3c759acb194b0 100644 --- a/paddle/fluid/operators/math/fc.h +++ b/paddle/phi/kernels/funcs/fc_functor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,19 +17,23 @@ limitations under the License. */ #include #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template class FCFunctor { public: - void operator()(const DeviceContext& context, const int M, const int N, - const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false, + void operator()(const DeviceContext& context, + const int M, + const int N, + const int K, + const T* X, + const T* W, + T* Y, + const T* B = nullptr, + bool relu = false, bool weight_pass = false); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi From b5d9c31c70b1a1eab959302464f22aec5cc27812 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 17 Apr 2022 10:41:21 +0800 Subject: [PATCH 196/211] [CustomOp] Fix PlaceType related compat error (#41826) * fix place type related compat error * fix test failed * remove dll decl * revert place type change * add dll decl --- paddle/phi/api/lib/tensor_method.cc | 11 ++--- paddle/phi/common/place.cc | 35 ++++++++++++---- paddle/phi/common/place.h | 40 ++++++++++++++----- paddle/phi/tests/api/CMakeLists.txt | 4 +- .../fluid/tests/custom_op/custom_relu_op.cc | 1 - .../fluid/tests/custom_op/custom_relu_op.cu | 4 ++ 6 files changed, 70 insertions(+), 25 deletions(-) diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 51d4ec28200cb..463b72d0dbf5b 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -39,11 +39,12 @@ Tensor Tensor::copy_to(Place place, bool blocking) const { template Tensor Tensor::copy_to(const Place &target_place) const { - LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " - "2.3, and will be removed in version 2.4, please use " - "`copy_to` method without template argument instead. " - "reason: copying a Tensor to another device does not need " - "to specify the data type template argument."; + LOG_FIRST_N(WARNING, 1) + << "The Tensor's `copy_to` method is deprecated since version " + "2.3, and will be removed in version 2.4, please use " + "`copy_to` method without template argument instead. " + "reason: copying a Tensor to another device does not need " + "to specify the data type template argument."; return copy_to(target_place, /*blocking=*/false); } diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index a77042757c7ba..667d0a32b93da 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/phi/api/ext/exception.h" namespace phi { @@ -108,17 +110,34 @@ uint32_t Place::Hash::operator()(const Place &place) const { return hash_value; } +Place::Place(paddle::PlaceType type) + : device(0), + alloc_type_(static_cast(type)), + device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) { + LOG_FIRST_N(WARNING, 1) + << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`paddle::CPUPlace()/GPUPlace()` to represent the place type."; +} + } // namespace phi namespace paddle { -phi::Place PlaceType::kUNK = phi::Place(); -phi::Place PlaceType::kCPU = phi::Place(phi::AllocationType::CPU); -// GPU Place contains device id, here we use default value 0, so it cannot -// use for multi-casd cases, but because it is static variable, it is difficult -// to get the exact device id at all time. -// NOTE: Please DO NOT use this place in the framework!!! -// It only for external compatibility -phi::Place PlaceType::kGPU = phi::Place(phi::AllocationType::GPU); +bool operator==(const Place &place, PlaceType place_type) { + LOG_FIRST_N(WARNING, 1) + << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`Tensor::is_cpu()/is_gpu()` method to determine the type of place."; + return place.GetType() == static_cast(place_type); +} + +bool operator==(PlaceType place_type, const Place &place) { + LOG_FIRST_N(WARNING, 1) + << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " + "2.3, and will be removed in version 2.4! Please use " + "`Tensor::is_cpu()/is_gpu()` method to determine the type of place."; + return static_cast(place_type) == place.GetType(); +} } // namespace paddle diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index d43fc497277c5..ed9fb7876425d 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/phi/api/include/dll_decl.h" +namespace paddle { +enum class PlaceType; +} + namespace phi { enum class AllocationType : int8_t { @@ -57,6 +61,9 @@ class PADDLE_API Place { alloc_type_(type), device_type_id_(GetOrRegisterGlobalDeviceTypeId(dev_type)) {} + // See NOTE [ Why need to temporarily adapt to PlaceType? ] + Place(paddle::PlaceType type); // NOLINT + void Reset(AllocationType type, int8_t device_id = 0, const std::string& dev_type = "") noexcept { @@ -214,14 +221,26 @@ using XPUPlace = phi::XPUPlace; using NPUPlace = phi::NPUPlace; } // namespace experimental -/* NOTE: In order to remove and be compatible with the enumeration type -`PlaceType` of custom operator, we define a temporary type. +using AllocationType = phi::AllocationType; +using Place = phi::Place; +using CPUPlace = phi::CPUPlace; +using GPUPlace = phi::GPUPlace; + +/* NOTE [ Why need to temporarily adapt to PlaceType? ] -This type cannot add any new type!!! It is only used for compatibility with +`PlaceType` emum class is the place type used by custom operators since the +release of 2.0. Since 2.3, we have refactored the operator library and designed +a new external Place type. The original PlaceType is no longer suitable for use +as an internal type of the framework, but immediately delete the PlaceType, +it will cause the previous custom operators to be incompatible, so it cannot be +deleted in the short term. We'd better delete this abandoned data type in 2.4. + +Note: This type cannot add any new type!!! 
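A sketch of the replacement style the warnings above point to, using the
custom relu operator from this patch as the example (only the
Tensor::is_cpu()/is_gpu() checks and the paddle::CPUPlace()/GPUPlace() types
named in those warnings are assumed):

- if (x.is_cpu()) { return relu_cpu_forward(x); }
- else if (x.is_gpu()) { return relu_cuda_forward(x); }
- else { PD_THROW("Unsupported place."); }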
It is only used for compatibility +with historical writing and we will remove this temporary type in the future. This Type cannot be used in framework! only used for custom operator! -The historical PlaceType define: +The original PlaceType define: - enum class PlaceType { kUNK = -1, kCPU, kGPU }; @@ -230,13 +249,14 @@ The historical PlaceType using: - PD_CHECK(x.place() == paddle::PlaceType::kCPU) - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); -The new type cannot be used as int value! If you use as int, please modify -the implementation. */ -struct PADDLE_API PlaceType { - static phi::Place kUNK; - static phi::Place kCPU; - static phi::Place kGPU; +enum class PlaceType { + kUNK = static_cast(phi::AllocationType::UNDEFINED), + kCPU = static_cast(phi::AllocationType::CPU), + kGPU = static_cast(phi::AllocationType::GPU), }; +PADDLE_API bool operator==(const Place& place, PlaceType place_type); +PADDLE_API bool operator==(PlaceType place_type, const Place& place); + } // namespace paddle diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index dd4b7e62ec52f..5c1d0989629dc 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -1,4 +1,6 @@ -if(WITH_ROCM) +if(WITH_GPU) + nv_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog) +elseif(WITH_ROCM) hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog) else() cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog) diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index 4ff9adf4f8fec..121a855a18f84 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -108,7 +108,6 @@ std::vector relu_cuda_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx); std::vector ReluForward(const paddle::Tensor& x) { - // TODO(chenweihang): Check Input if (x.place() == paddle::PlaceType::kCPU) { return relu_cpu_forward(x); } else if (x.place() == paddle::PlaceType::kGPU) { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 8b9693054d1c4..364a2216b9e8e 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -53,6 +53,7 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data, } std::vector relu_cuda_forward(const paddle::Tensor& x) { + CHECK_GPU_INPUT(x); auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); int numel = x.size(); @@ -70,6 +71,9 @@ std::vector relu_cuda_forward(const paddle::Tensor& x) { std::vector relu_cuda_backward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out) { + CHECK_GPU_INPUT(x); + CHECK_GPU_INPUT(out); + CHECK_GPU_INPUT(grad_out); auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); int numel = out.size(); From 7ee31a96b436de4b0701de2ba56bd0b2a653994c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 17 Apr 2022 18:59:58 +0800 Subject: [PATCH 197/211] [Perf] Optimize dygraph scheduling performance (#41696) * split phi and fluid infermeta context * resolve conflict * fix type error * optimize scheduling perf * spec small vector size * replace all grad var name * fix test failed * move init defalut signature * polish details * polish details * fix no init bug * init sig for tests * add init sig for infer * fix infrt 
error * fix infrt failed * fix kunlun error * fix infrt failed --- paddle/fluid/framework/infershape_utils.cc | 160 ++++++++++++++---- paddle/fluid/framework/infershape_utils.h | 83 ++++++--- .../new_executor/new_executor_defs.cc | 9 +- .../new_executor/new_executor_defs.h | 8 +- paddle/fluid/framework/op_desc.cc | 12 +- paddle/fluid/framework/operator.cc | 24 +-- paddle/fluid/framework/operator.h | 1 + paddle/fluid/framework/phi_utils.cc | 73 ++++---- paddle/fluid/framework/phi_utils.h | 6 +- paddle/fluid/framework/shape_inference.h | 10 +- paddle/fluid/imperative/infer_shape_context.h | 15 +- paddle/fluid/imperative/prepared_operator.cc | 33 ++-- paddle/fluid/imperative/prepared_operator.h | 8 +- .../fluid/inference/api/analysis_predictor.cc | 1 + paddle/fluid/inference/api/api_impl.cc | 1 + .../fluid/operators/controlflow/while_op.cc | 6 +- .../detection/collect_fpn_proposals_op.cc | 6 +- paddle/fluid/pybind/imperative.cc | 50 +++--- paddle/fluid/pybind/pybind.cc | 2 + .../infrt/dialect/phi/pass/kernel_op_desc.cc | 9 +- paddle/infrt/host_context/value.h | 1 + paddle/phi/core/compat/arg_map_context.h | 40 ++--- paddle/phi/core/compat/convert_utils.cc | 2 +- paddle/phi/core/compat/convert_utils.h | 2 +- paddle/phi/core/compat/op_utils.h | 8 +- paddle/phi/core/infermeta_utils.cc | 40 ++--- paddle/phi/core/infermeta_utils.h | 57 ++++--- paddle/phi/core/kernel_context.cc | 4 +- paddle/phi/core/kernel_factory.cc | 24 ++- paddle/phi/core/kernel_factory.h | 39 +++-- paddle/phi/core/meta_tensor.cc | 2 + paddle/phi/core/meta_tensor.h | 8 +- paddle/phi/core/type_defs.h | 7 +- paddle/phi/infermeta/backward.cc | 6 +- paddle/phi/infermeta/backward.h | 6 +- paddle/phi/infermeta/multiary.cc | 24 +-- paddle/phi/infermeta/multiary.h | 24 +-- paddle/phi/kernels/concat_kernel.h | 2 +- paddle/phi/ops/compat/abs_sig.cc | 3 +- paddle/phi/ops/compat/activation_sig.cc | 39 ++--- paddle/phi/ops/compat/adam_sig.cc | 6 +- paddle/phi/ops/compat/adamw_sig.cc | 6 +- paddle/phi/ops/compat/addmm_sig.cc | 9 +- paddle/phi/ops/compat/argsort_sig.cc | 4 +- paddle/phi/ops/compat/atan2_sig.cc | 6 +- paddle/phi/ops/compat/batch_norm_sig.cc | 41 +++-- paddle/phi/ops/compat/bce_loss_sig.cc | 6 +- .../ops/compat/bilinear_tensor_product_sig.cc | 7 +- .../phi/ops/compat/broadcast_tensors_sig.cc | 2 +- paddle/phi/ops/compat/cholesky_sig.cc | 6 +- paddle/phi/ops/compat/cholesky_solve_sig.cc | 4 +- paddle/phi/ops/compat/clip_sig.cc | 26 +-- paddle/phi/ops/compat/complex_sig.cc | 6 +- paddle/phi/ops/compat/concat_sig.cc | 12 +- paddle/phi/ops/compat/conv2d_sig.cc | 4 +- paddle/phi/ops/compat/conv3d_sig.cc | 4 +- paddle/phi/ops/compat/conv_transpose_sig.cc | 12 +- paddle/phi/ops/compat/cross_sig.cc | 6 +- paddle/phi/ops/compat/cumprod_sig.cc | 6 +- paddle/phi/ops/compat/deformable_conv_sig.cc | 7 +- paddle/phi/ops/compat/depthwise_conv2d_sig.cc | 4 +- paddle/phi/ops/compat/determinant_sig.cc | 6 +- paddle/phi/ops/compat/diag_sig.cc | 2 +- paddle/phi/ops/compat/diagonal_sig.cc | 4 +- paddle/phi/ops/compat/digamma_sig.cc | 3 +- paddle/phi/ops/compat/dist_sig.cc | 6 +- paddle/phi/ops/compat/dot_sig.cc | 6 +- paddle/phi/ops/compat/dropout_sig.cc | 4 +- paddle/phi/ops/compat/eigh_sig.cc | 12 +- paddle/phi/ops/compat/elementwise_sig.cc | 54 +++--- paddle/phi/ops/compat/embedding_sig.cc | 16 +- paddle/phi/ops/compat/erf_sig.cc | 3 +- paddle/phi/ops/compat/erfinv_sig.cc | 3 +- paddle/phi/ops/compat/expand_as_sig.cc | 6 +- paddle/phi/ops/compat/expand_sig.cc | 18 +- paddle/phi/ops/compat/flatten_sig.cc | 2 +- 
paddle/phi/ops/compat/frobenius_norm_sig.cc | 4 +- paddle/phi/ops/compat/gather_scatter_sig.cc | 14 +- paddle/phi/ops/compat/gather_sig.cc | 8 +- paddle/phi/ops/compat/gelu_sig.cc | 6 +- paddle/phi/ops/compat/graph_send_recv_sig.cc | 4 +- paddle/phi/ops/compat/grid_sampler_sig.cc | 4 +- paddle/phi/ops/compat/gumbel_softmax_sig.cc | 6 +- .../ops/compat/hierarchical_sigmoid_sig.cc | 74 ++++---- paddle/phi/ops/compat/huber_loss_sig.cc | 4 +- paddle/phi/ops/compat/index_sample_sig.cc | 6 +- paddle/phi/ops/compat/index_select_sig.cc | 6 +- paddle/phi/ops/compat/interpolate_sig.cc | 115 ++++++------- paddle/phi/ops/compat/kldiv_loss_sig.cc | 4 +- paddle/phi/ops/compat/kron_sig.cc | 6 +- paddle/phi/ops/compat/kthvalue_sig.cc | 4 +- paddle/phi/ops/compat/label_smooth_sig.cc | 6 +- paddle/phi/ops/compat/layer_norm_sig.cc | 9 +- paddle/phi/ops/compat/lerp_sig.cc | 4 +- paddle/phi/ops/compat/lgamma_sig.cc | 3 +- paddle/phi/ops/compat/log_loss_sig.cc | 4 +- paddle/phi/ops/compat/log_softmax_sig.cc | 6 +- paddle/phi/ops/compat/logsumexp_sig.cc | 4 +- paddle/phi/ops/compat/masked_select_sig.cc | 6 +- paddle/phi/ops/compat/matmul_sig.cc | 8 +- paddle/phi/ops/compat/matrix_power_sig.cc | 6 +- paddle/phi/ops/compat/maxout_sig.cc | 6 +- paddle/phi/ops/compat/mean_sig.cc | 3 +- paddle/phi/ops/compat/meshgrid_sig.cc | 3 +- paddle/phi/ops/compat/mode_sig.cc | 4 +- paddle/phi/ops/compat/mul_sig.cc | 4 +- paddle/phi/ops/compat/multi_dot_sig.cc | 3 +- paddle/phi/ops/compat/multiplex_sig.cc | 3 +- paddle/phi/ops/compat/mv_sig.cc | 6 +- paddle/phi/ops/compat/nll_loss_sig.cc | 9 +- paddle/phi/ops/compat/norm_sig.cc | 4 +- paddle/phi/ops/compat/p_norm_sig.cc | 4 +- paddle/phi/ops/compat/pad3d_sig.cc | 8 +- paddle/phi/ops/compat/pad_sig.cc | 6 +- paddle/phi/ops/compat/pixel_shuffle_sig.cc | 4 +- paddle/phi/ops/compat/poisson_sig.cc | 3 +- paddle/phi/ops/compat/pool_sig.cc | 16 +- paddle/phi/ops/compat/prelu_sig.cc | 4 +- paddle/phi/ops/compat/psroi_pool_sig.cc | 4 +- paddle/phi/ops/compat/put_along_axis_sig.cc | 4 +- paddle/phi/ops/compat/reduce_sig.cc | 20 +-- paddle/phi/ops/compat/reshape_sig.cc | 3 +- paddle/phi/ops/compat/rnn_sig.cc | 8 +- paddle/phi/ops/compat/roi_align_sig.cc | 4 +- paddle/phi/ops/compat/roi_pool_sig.cc | 4 +- paddle/phi/ops/compat/roll_sig.cc | 6 +- paddle/phi/ops/compat/segment_pool_sig.cc | 13 +- paddle/phi/ops/compat/selu_sig.cc | 6 +- paddle/phi/ops/compat/set_value_sig.cc | 137 +++++++-------- .../sigmoid_cross_entropy_with_logits_sig.cc | 4 +- paddle/phi/ops/compat/slice_sig.cc | 36 ++-- paddle/phi/ops/compat/softmax_sig.cc | 6 +- .../compat/softmax_with_cross_entropy_sig.cc | 4 +- paddle/phi/ops/compat/squeeze_sig.cc | 6 +- paddle/phi/ops/compat/stack_sig.cc | 3 +- paddle/phi/ops/compat/strided_slice_sig.cc | 28 +-- paddle/phi/ops/compat/take_along_axis_sig.cc | 4 +- paddle/phi/ops/compat/temporal_shift_sig.cc | 4 +- paddle/phi/ops/compat/tile_sig.cc | 18 +- paddle/phi/ops/compat/top_k_sig.cc | 4 +- paddle/phi/ops/compat/trace_sig.cc | 4 +- paddle/phi/ops/compat/transpose_sig.cc | 3 +- paddle/phi/ops/compat/triangular_solve_sig.cc | 4 +- paddle/phi/ops/compat/tril_triu_sig.cc | 6 +- paddle/phi/ops/compat/trunc_sig.cc | 3 +- paddle/phi/ops/compat/unfold_sig.cc | 4 +- paddle/phi/ops/compat/unsqueeze_sig.cc | 2 +- paddle/phi/ops/compat/unstack_sig.cc | 3 +- paddle/phi/ops/compat/warpctc_sig.cc | 9 +- paddle/phi/ops/compat/where_grad_sig.cc | 4 +- paddle/phi/ops/compat/yolov3_loss_sig.cc | 36 ++-- paddle/phi/tests/core/test_meta_fn_utils.cc | 16 +- paddle/testing/CMakeLists.txt | 2 +- 
paddle/testing/paddle_gtest_main.cc | 2 + python/paddle/fluid/__init__.py | 1 + python/paddle/utils/code_gen/api_base.py | 2 +- 156 files changed, 1037 insertions(+), 985 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 17acbde2a09e7..bd71ade7e9311 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -308,10 +308,100 @@ void CompatMetaTensor::share_meta(const MetaTensor& meta_tensor) { share_lod(meta_tensor); } -phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, - const std::string& op_type) { +void CompatInferMetaContext::EmplaceBackInput(CompatMetaTensor input) { + int index = compat_inputs_.size(); + compat_inputs_.emplace_back(std::move(input)); + input_range_.emplace_back(std::pair(index, index + 1)); +} +void CompatInferMetaContext::EmplaceBackOutput(CompatMetaTensor output) { + int index = compat_outputs_.size(); + compat_outputs_.emplace_back(std::move(output)); + output_range_.emplace_back(std::pair(index, index + 1)); +} + +void CompatInferMetaContext::EmplaceBackInputs( + paddle::SmallVector inputs) { + int index = compat_inputs_.size(); + input_range_.emplace_back(std::pair(index, index + inputs.size())); + compat_inputs_.insert(compat_inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + +void CompatInferMetaContext::EmplaceBackOutputs( + paddle::SmallVector + outputs) { + int index = compat_outputs_.size(); + output_range_.emplace_back( + std::pair(index, index + outputs.size())); + compat_outputs_.insert(compat_outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + +const phi::MetaTensor& CompatInferMetaContext::InputAt(size_t idx) const { + return compat_inputs_.at(idx); +} + +paddle::optional +CompatInferMetaContext::OptionalInputAt(size_t idx) const { + const auto& input = compat_inputs_.at(idx); + return input.initialized() + ? paddle::optional{input} + : paddle::optional{paddle::none}; +} + +std::vector CompatInferMetaContext::InputsBetween( + size_t start, size_t end) const { + std::vector result; + result.reserve(end - start); + + for (size_t i = start; i < end; ++i) { + auto& in = compat_inputs_.at(i); + result.emplace_back(in.initialized() ? &in : nullptr); + } + + return result; +} + +paddle::optional> +CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { + const auto& first = compat_inputs_.at(start); + + if (first.initialized()) { + std::vector result; + result.reserve(end - start); + + for (size_t i = start; i < end; ++i) { + auto& in = compat_inputs_.at(i); + result.emplace_back(in.initialized() ? &in : nullptr); + } + + return paddle::optional>(result); + } + return paddle::optional>( + paddle::none); +} + +phi::MetaTensor* CompatInferMetaContext::MutableOutputAt(size_t idx) { + auto& out = compat_outputs_.at(idx); + return out.initialized() ? &out : nullptr; +} + +std::vector CompatInferMetaContext::MutableOutputBetween( + size_t start, size_t end) { + std::vector result; + result.reserve(end - start); + for (size_t i = start; i < end; ++i) { + auto& out = compat_outputs_.at(i); + result.emplace_back(out.initialized() ? &out : nullptr); + } + return result; +} + +CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, + const std::string& op_type) { // 1. 
get kernel args - InitDefaultKernelSignatureMap(); auto arg_map_fn = phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_type); PADDLE_ENFORCE_NOT_NULL( arg_map_fn, platform::errors::NotFound( @@ -321,52 +411,47 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. build infermeta context - phi::InferMetaContext infer_meta_context( + CompatInferMetaContext infer_meta_context( {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); auto& output_names = std::get<2>(signature.args); - auto kernels_map = - phi::KernelFactory::Instance().SelectKernelMap(signature.name); - if (kernels_map.size() == 0) { - PADDLE_THROW( - platform::errors::Unimplemented("Not find `%s` kernels when construct " - "InferMetaContext.", - signature.name)); - } - auto attr_defs = kernels_map.cbegin()->second.args_def().attribute_defs(); + const auto& args_def = + phi::KernelFactory::Instance().GetFirstKernelArgsDef(signature.name); + const auto& attr_defs = args_def.attribute_defs(); - // TODO(chenweihang): support multiple inputs and outputs later - phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { if (ctx->HasInputs(in_name)) { - auto input_var = ctx->GetInputVarPtrs(in_name); + auto input_var = std::move(ctx->GetInputVarPtrs(in_name)); if (input_var.size() == 1) { infer_meta_context.EmplaceBackInput( - std::make_shared(input_var[0], ctx->IsRuntime())); + std::move(CompatMetaTensor(input_var[0], ctx->IsRuntime()))); } else { - paddle::SmallVector> inputs; - inputs.reserve(input_var.size()); + paddle::SmallVector + inputs; for (const auto& in : input_var) { - inputs.push_back( - std::make_shared(in, ctx->IsRuntime())); + inputs.emplace_back( + std::move(CompatMetaTensor(in, ctx->IsRuntime()))); } infer_meta_context.EmplaceBackInputs(std::move(inputs)); } } else { - infer_meta_context.EmplaceBackInput({nullptr}); + infer_meta_context.EmplaceBackInput( + std::move(CompatMetaTensor(ctx->IsRuntime()))); } } + VLOG(6) << "BuildInferMetaContext: Done inputs"; + auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { - auto attr_name = attr_names[i]; + auto& attr_name = attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { // When attr is a vector_tensor or tensor, transform it to IntArray if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { - const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); if (ctx->IsRuntime()) { // If is in runtime, we will get tensor's value for IntArray // and push it into attrs @@ -456,7 +541,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name)); } } else if (ctx->HasInput(attr_name)) { - const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); + auto infershape_input = std::move(ctx->GetInputVarPtrs(attr_name)); if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -581,7 +666,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // convert from data if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { if (ctx->IsRuntime()) { - const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); auto var_temp = 
BOOST_GET_CONST(Variable*, infershape_inputs[i]); auto val = experimental::MakePhiScalarFromVar(*var_temp); int32_t val_int = val.template to(); @@ -596,36 +681,41 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } + VLOG(6) << "BuildInferMetaContext: Done attrs"; + for (auto& out_name : output_names) { if (ctx->HasOutputs(out_name, true)) { - auto output_var = ctx->GetOutputVarPtrs(out_name); + auto output_var = std::move(ctx->GetOutputVarPtrs(out_name)); if (output_var.size() == 1) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - output_var[0], ctx->IsRuntime())); + infer_meta_context.EmplaceBackOutput( + std::move(CompatMetaTensor(output_var[0], ctx->IsRuntime()))); } else { - paddle::SmallVector> outputs; - outputs.reserve(output_var.size()); + paddle::SmallVector + outputs; for (const auto& out : output_var) { if (ctx->IsRuntime()) { if (BOOST_GET_CONST(Variable*, out)) { outputs.emplace_back( - std::make_shared(out, ctx->IsRuntime())); + std::move(CompatMetaTensor(out, ctx->IsRuntime()))); continue; } } else if (BOOST_GET_CONST(VarDesc*, out)) { outputs.emplace_back( - std::make_shared(out, ctx->IsRuntime())); + std::move(CompatMetaTensor(out, ctx->IsRuntime()))); continue; } - outputs.emplace_back(nullptr); + outputs.emplace_back(std::move(CompatMetaTensor(ctx->IsRuntime()))); } infer_meta_context.EmplaceBackOutputs(std::move(outputs)); } } else { - infer_meta_context.EmplaceBackOutput({nullptr}); + infer_meta_context.EmplaceBackOutput( + std::move(CompatMetaTensor(ctx->IsRuntime()))); } } + VLOG(6) << "BuildInferMetaContext: Done outputs"; + return infer_meta_context; } diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 022f194b667eb..e54f2e81e7e9f 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -18,38 +18,24 @@ limitations under the License. 
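// Sketch of how an operator wires a phi InferMeta function into the
// BuildInferMetaContext path above, via the DECLARE_INFER_SHAPE_FUNCTOR macro
// kept in this header; the op name is illustrative and phi::UnchangedInferMeta
// merely stands in for whatever InferMeta the op really uses:
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"

DECLARE_INFER_SHAPE_FUNCTOR(my_relu, MyReluInferShapeFunctor,
                            PD_INFER_META(phi::UnchangedInferMeta));
// Later, in the (hypothetical) op registration:
//   REGISTER_OPERATOR(my_relu, ops::MyReluOp, ops::MyReluOpMaker,
//                     MyReluInferShapeFunctor);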
*/ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" -namespace phi { -class InferMetaContext; -} // namespace phi namespace paddle { namespace framework { -phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, - const std::string& op_type); - -#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ - struct functor_name : public paddle::framework::InferShapeBase { \ - void operator()( \ - paddle::framework::InferShapeContext* ctx) const override { \ - auto infer_meta_context = \ - paddle::framework::BuildInferMetaContext(ctx, #op_type); \ - fn(&infer_meta_context); \ - } \ - } - // TODO(chenweihang): Support TensorArray later class CompatMetaTensor : public phi::MetaTensor { public: + explicit CompatMetaTensor(bool is_runtime) + : is_runtime_(is_runtime), initialized_(false) {} CompatMetaTensor(InferShapeVarPtr var, bool is_runtime) : var_(std::move(var)), is_runtime_(is_runtime) {} - CompatMetaTensor() = default; - CompatMetaTensor(const CompatMetaTensor&) = default; CompatMetaTensor(CompatMetaTensor&&) = default; - CompatMetaTensor& operator=(const CompatMetaTensor&) = delete; - CompatMetaTensor& operator=(CompatMetaTensor&&) = delete; + CompatMetaTensor& operator=(CompatMetaTensor&&) = default; + CompatMetaTensor(const CompatMetaTensor&) = default; + CompatMetaTensor& operator=(const CompatMetaTensor&) = default; int64_t numel() const override; @@ -71,6 +57,8 @@ class CompatMetaTensor : public phi::MetaTensor { void share_meta(const MetaTensor& meta_tensor) override; + bool initialized() const override { return initialized_; }; + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -95,7 +83,62 @@ class CompatMetaTensor : public phi::MetaTensor { InferShapeVarPtr var_; bool is_runtime_; + bool initialized_{true}; +}; + +// Note: In order to avoid using shared_ptr to manage MetaTensor in +// InferMetaContext, inherit and implement InferMetaContext separately +// for compatibility with fluid, shared_ptr will cause significant decrease +// in scheduling performance +class CompatInferMetaContext : public phi::InferMetaContext { + public: + CompatInferMetaContext() = default; + explicit CompatInferMetaContext(phi::MetaConfig config) + : phi::InferMetaContext(config) {} + + void EmplaceBackInput(CompatMetaTensor input); + void EmplaceBackOutput(CompatMetaTensor output); + + void EmplaceBackInputs( + paddle::SmallVector inputs); + void EmplaceBackOutputs( + paddle::SmallVector + outputs); + + const phi::MetaTensor& InputAt(size_t idx) const override; + paddle::optional OptionalInputAt( + size_t idx) const override; + + std::vector InputsBetween(size_t start, + size_t end) const override; + paddle::optional> + OptionalInputsBetween(size_t start, size_t end) const override; + + phi::MetaTensor* MutableOutputAt(size_t idx) override; + std::vector MutableOutputBetween(size_t start, + size_t end) override; + + virtual ~CompatInferMetaContext() = default; + + private: + paddle::SmallVector + compat_inputs_; + paddle::SmallVector + compat_outputs_; }; +CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, + const std::string& op_type); + +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ + struct functor_name : public paddle::framework::InferShapeBase { \ + void operator()( \ + paddle::framework::InferShapeContext* ctx) const override { \ + auto infer_meta_context = \ + 
paddle::framework::BuildInferMetaContext(ctx, #op_type); \ + fn(&infer_meta_context); \ + } \ + } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 089e68fe48c52..8f6bac76e2a15 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -328,20 +328,21 @@ bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { } // TODO(paddle-dev): Can this be template? -std::vector InterpretercoreInferShapeContext::GetInputVarPtrs( +paddle::SmallVector +InterpretercoreInferShapeContext::GetInputVarPtrs( const std::string& name) const { const std::vector& vars = InputVars(name); - std::vector res; + paddle::SmallVector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; } -std::vector +paddle::SmallVector InterpretercoreInferShapeContext::GetOutputVarPtrs( const std::string& name) const { const std::vector& vars = OutputVars(name); - std::vector res; + paddle::SmallVector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index aab32cfa06d40..e257b71742400 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -90,11 +90,11 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool IsRunMKLDNNKernel() const override; // TODO(paddle-dev): Can this be template? - std::vector GetInputVarPtrs( - const std::string& name) const override; + paddle::SmallVector + GetInputVarPtrs(const std::string& name) const override; - std::vector GetOutputVarPtrs( - const std::string& name) const override; + paddle::SmallVector + GetOutputVarPtrs(const std::string& name) const override; DDim GetInputDim(const std::string& name) const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 15b979086d1eb..d27bf0e150f97 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -202,10 +202,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { } } - std::vector GetInputVarPtrs( - const std::string &name) const override { + paddle::SmallVector + GetInputVarPtrs(const std::string &name) const override { const std::vector arg_names = Inputs(name); - std::vector res; + paddle::SmallVector res; res.reserve(arg_names.size()); std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), [this](const std::string &name) { @@ -214,10 +214,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { return res; } - std::vector GetOutputVarPtrs( - const std::string &name) const override { + paddle::SmallVector + GetOutputVarPtrs(const std::string &name) const override { const std::vector arg_names = Outputs(name); - std::vector res; + paddle::SmallVector res; res.reserve(arg_names.size()); std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), [this](const std::string &name) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d9704d70b45ec..871c459c71764 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -945,19 +945,19 @@ class RuntimeInferShapeContext : public InferShapeContext { } // TODO(paddle-dev): Can 
this be template? - std::vector GetInputVarPtrs( - const std::string& name) const override { + paddle::SmallVector + GetInputVarPtrs(const std::string& name) const override { const std::vector& vars = InputVars(name); - std::vector res; + paddle::SmallVector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; } - std::vector GetOutputVarPtrs( - const std::string& name) const override { + paddle::SmallVector + GetOutputVarPtrs(const std::string& name) const override { const std::vector& vars = OutputVars(name); - std::vector res; + paddle::SmallVector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; @@ -1324,8 +1324,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, << ", using_kernel_key:" << *kernel_type_.get(); auto try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); - if (!phi::KernelFactory::Instance().IsSelectKernelValid( - pt_kernel_name, try_pt_kernel_key)) { + if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, + try_pt_kernel_key)) { kernel_type_->library_type_ = expected_kernel_key_library_type; VLOG(3) << "modify XPU KP kernel in static graph: " << type_ << " is failed " << *kernel_type_.get(); @@ -2113,10 +2113,12 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const { - InitDefaultKernelSignatureMap(); ExecutionArgumentMappingContext arg_mapping_ctx(ctx); - return phi::OpUtilsMap::Instance().GetArgumentMappingFn(Type())( - arg_mapping_ctx); + if (arg_map_fn_ == nullptr) { + arg_map_fn_.reset(new phi::ArgumentMappingFn( + phi::OpUtilsMap::Instance().GetArgumentMappingFn(Type()))); + } + return (*arg_map_fn_)(arg_mapping_ctx); } Scope* OperatorWithKernel::PreparePhiData( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f7fc83f1d6d30..f0887eb919c30 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -701,6 +701,7 @@ class OperatorWithKernel : public OperatorBase { mutable bool run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; + mutable std::unique_ptr arg_map_fn_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 8e6f082da1026..75bab0594758b 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -25,6 +25,7 @@ limitations under the License. 
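// The switch to const char* argument names below is mirrored by the
// paddle/phi/ops/compat/*_sig.cc files listed in this patch; a minimal sketch
// of one such mapping (op and kernel names are illustrative):
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {
KernelSignature MyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
  // String literals now travel as const char* without std::string copies.
  return KernelSignature("my_relu", {"X"}, {}, {"Out"});
}
}  // namespace phi
// PD_REGISTER_ARG_MAPPING_FN(my_relu, phi::MyReluOpArgumentMapping);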
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/type_defs.h" namespace paddle { namespace framework { @@ -40,9 +41,9 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { ~KernelArgsNameMakerByOpProto() {} - const paddle::SmallVector& GetInputArgsNames() override; - const paddle::SmallVector& GetOutputArgsNames() override; - const paddle::SmallVector& GetAttrsArgsNames() override; + const paddle::SmallVector& GetInputArgsNames() override; + const paddle::SmallVector& GetOutputArgsNames() override; + const paddle::SmallVector& GetAttrsArgsNames() override; KernelSignature GetKernelSignature(); @@ -52,9 +53,9 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { private: const framework::proto::OpProto* op_proto_; - paddle::SmallVector input_names_; - paddle::SmallVector output_names_; - paddle::SmallVector attr_names_; + paddle::SmallVector input_names_; + paddle::SmallVector output_names_; + paddle::SmallVector attr_names_; }; OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { @@ -102,7 +103,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, if (platform::is_xpu_place(expected_kernel_key.place_) || paddle::platform::is_in_xpu_black_list(op.Type())) { VLOG(3) << "phi missing XPU kernel: " << op.Type() - << ", phipected_kernel_key:" << expected_kernel_key + << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -111,7 +112,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing NPU kernel: " << op.Type() - << ", phipected_kernel_key:" << expected_kernel_key + << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -120,7 +121,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing MLU kernel: " << op.Type() - << ", phipected_kernel_key:" << expected_kernel_key + << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -129,7 +130,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_IPU if (platform::is_ipu_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing IPU kernel: " << op.Type() - << ", phipected_kernel_key:" << expected_kernel_key + << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -139,7 +140,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, if (platform::is_custom_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing " << expected_kernel_key.place_.GetDeviceType() << " kernel: " << op.Type() - << ", phipected_kernel_key:" << expected_kernel_key + << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -148,45 +149,52 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, 
return phi::KernelKey(); } -const paddle::SmallVector& +const paddle::SmallVector& KernelArgsNameMakerByOpProto::GetInputArgsNames() { for (int i = 0; i < op_proto_->inputs_size(); ++i) { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(6) << "Parse PhiKernel input: skip extra & quant input - " - << in_name; continue; } // If contains dispensable input, we should override the // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { - VLOG(6) << "Parse PhiKernel input: skip dispensable input - " << in_name; continue; } - VLOG(6) << "Parse PhiKernel input: " << in_name; - input_names_.emplace_back(in_name); + input_names_.emplace_back(in_name.c_str()); + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "PhiKernel inputs: "; + std::copy(input_names_.begin(), input_names_.end(), + std::ostream_iterator(sout, ", ")); + VLOG(10) << sout.str(); } return input_names_; } -const paddle::SmallVector& +const paddle::SmallVector& KernelArgsNameMakerByOpProto::GetOutputArgsNames() { for (int i = 0; i < op_proto_->outputs_size(); ++i) { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { - VLOG(6) << "Parse PhiKernel output: skip extra & quant output - " - << out_name; continue; } - VLOG(6) << "Parse PhiKernel output: " << out_name; - output_names_.emplace_back(out_name); + output_names_.emplace_back(out_name.c_str()); + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "PhiKernel outputs: "; + std::copy(output_names_.begin(), output_names_.end(), + std::ostream_iterator(sout, ", ")); + VLOG(10) << sout.str(); } return output_names_; } -const paddle::SmallVector& +const paddle::SmallVector& KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { for (int i = 0; i < op_proto_->attrs_size(); ++i) { auto& attr = op_proto_->attrs()[i]; @@ -195,25 +203,26 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(6) << "Parse PhiKernel attribute: skip needless attr - " - << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(6) << "Parse PhiKernel attribute: skip extra & quant attr - " - << attr_name; continue; } - VLOG(6) << "Parse PhiKernel attribute: " << attr_name; - attr_names_.emplace_back(attr_name); + attr_names_.emplace_back(attr_name.c_str()); + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "PhiKernel attributes: "; + std::copy(attr_names_.begin(), attr_names_.end(), + std::ostream_iterator(sout, ", ")); + VLOG(10) << sout.str(); } - return attr_names_; } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(phi::TransToPhiKernelName(op_proto_->type()), + return KernelSignature(phi::TransToPhiKernelName(op_proto_->type()).c_str(), GetInputArgsNames(), GetAttrsArgsNames(), GetOutputArgsNames()); } @@ -228,7 +237,7 @@ void InitDefaultKernelSignatureMap() { if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type) && op_proto) { paddle::framework::KernelArgsNameMakerByOpProto maker(op_proto); - VLOG(10) << "Register kernel signature for " << op_type; + VLOG(10) << "Register `" << op_type << "` kernel signature:"; phi::DefaultKernelSignatureMap::Instance().Insert( op_type, 
std::move(maker.GetKernelSignature())); } diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index a17578816921b..392a3f9b06b3c 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -55,9 +55,9 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, class KernelArgsNameMaker { public: virtual ~KernelArgsNameMaker() {} - virtual const paddle::SmallVector& GetInputArgsNames() = 0; - virtual const paddle::SmallVector& GetOutputArgsNames() = 0; - virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; + virtual const paddle::SmallVector& GetInputArgsNames() = 0; + virtual const paddle::SmallVector& GetOutputArgsNames() = 0; + virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; }; void InitDefaultKernelSignatureMap(); diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 6ba60590cf8f3..bf9731bafce64 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/variable.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/type_defs.h" +#include "paddle/utils/small_vector.h" namespace paddle { namespace framework { @@ -106,10 +108,10 @@ class InferShapeContext { virtual bool IsRunMKLDNNKernel() const = 0; - virtual std::vector GetInputVarPtrs( - const std::string &name) const = 0; - virtual std::vector GetOutputVarPtrs( - const std::string &name) const = 0; + virtual paddle::SmallVector + GetInputVarPtrs(const std::string &name) const = 0; + virtual paddle::SmallVector + GetOutputVarPtrs(const std::string &name) const = 0; protected: virtual std::vector GetRepeatedDims(const std::string &name) const = 0; diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 1e5b112ece21f..5b63334c9ea99 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -235,9 +235,10 @@ class DygraphInferShapeContext : public framework::InferShapeContext { (op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN)); } - std::vector GetInputVarPtrs( - const std::string& name) const override { - std::vector res; + paddle::SmallVector + GetInputVarPtrs(const std::string& name) const override { + paddle::SmallVector + res; auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( it, var_map_in_->end(), @@ -248,9 +249,11 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return res; } - std::vector GetOutputVarPtrs( - const std::string& name) const override { - std::vector res; + paddle::SmallVector + GetOutputVarPtrs(const std::string& name) const override { + paddle::SmallVector + res; auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( it, var_map_out_->end(), diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 0ad5e808b1d1a..cef7417ea4195 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -36,6 +36,8 @@ DECLARE_bool(run_kp_kernel); namespace paddle { namespace imperative { +static const phi::Kernel empty_kernel; + const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var) { return var->SharedVar(); @@ -108,12 +110,13 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, ctx_(ctx), kernel_type_(kernel_type), func_(func), - 
dev_ctx_(dev_ctx) {} + dev_ctx_(dev_ctx), + pt_kernel_(empty_kernel) {} PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, - const framework::KernelSignature& kernel_signature, + framework::KernelSignature&& kernel_signature, const phi::Kernel& pt_kernel, platform::DeviceContext* dev_ctx) : op_(op), @@ -122,7 +125,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, func_(nullptr), dev_ctx_(dev_ctx), run_phi_kernel_(true), - pt_kernel_signature_(kernel_signature), + pt_kernel_signature_(std::move(kernel_signature)), pt_kernel_(pt_kernel) {} template @@ -170,7 +173,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { - pt_kernel_signature = op.GetExpectedPhiKernelArgs(dygraph_exe_ctx); + pt_kernel_signature = + std::move(op.GetExpectedPhiKernelArgs(dygraph_exe_ctx)); VLOG(6) << pt_kernel_signature; pt_kernel_name = pt_kernel_signature.name; @@ -200,8 +204,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, << ", using_kernel_key:" << expected_kernel_key; phi::KernelKey try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); - if (!phi::KernelFactory::Instance().IsSelectKernelValid( - pt_kernel_name, try_pt_kernel_key)) { + if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, + try_pt_kernel_key)) { expected_kernel_key.library_type_ = expected_kernel_key_library_type; VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " << expected_kernel_key; @@ -211,8 +215,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); - auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, - pt_kernel_key); + auto& pt_kernel = phi::KernelFactory::Instance().SelectKernel( + pt_kernel_name, pt_kernel_key); if (pt_kernel.IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) @@ -227,9 +231,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, dev_ctx = pool.Get(expected_kernel_key.place_); } - // TODO(chenweihang): using CPUKernel when miss device kernel case - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, dev_ctx); + return PreparedOp(op, ctx, expected_kernel_key, + std::move(pt_kernel_signature), pt_kernel, dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; @@ -270,15 +273,16 @@ PreparedOp PrepareImpl(const NameVarMap& ins, if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); - auto pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( + auto& pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_cpu_kernel_key); if (pt_cpu_kernel.IsValid()) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_cpu_kernel, cpu_ctx); + return PreparedOp(op, ctx, expected_kernel_key, + std::move(pt_kernel_signature), pt_cpu_kernel, + cpu_ctx); } } } @@ -505,7 +509,6 @@ static void PreparedOpRunPtImpl( #endif } - // TODO(chenweihang): add debug flags later if (framework::IsComplexType(kernel_type.data_type_)) { HandleComplexGradToRealGrad(outs); } diff --git 
a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 04d0b4ca7a5db..b3c5a6b5fa220 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -154,7 +154,7 @@ class PreparedOp { PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, - const framework::KernelSignature& kernel_signature, + framework::KernelSignature&& kernel_signature, const phi::Kernel& pt_kernel, platform::DeviceContext* dev_ctx); static PreparedOp Prepare(const NameVarMap& ins, @@ -206,7 +206,7 @@ class PreparedOp { bool run_phi_kernel_{false}; bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; - phi::Kernel pt_kernel_; + const phi::Kernel& pt_kernel_; }; const inline framework::Attribute& GetAttr( @@ -289,7 +289,7 @@ void BuildDygraphPhiKernelContext( } } - auto ins_vector = it->second; + auto& ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); for (size_t offset = 0; offset < ins_vector.size(); ++offset) { @@ -587,7 +587,7 @@ void PreparePhiData(const phi::Kernel& pt_kernel, auto& ins_vector = ins.at(input_names[i]); for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - auto var = ins_vector[offset]; + auto& var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); if (tensor_in && tensor_in->IsInitialized()) { if (in_def.backend == phi::Backend::ALL_BACKEND) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f1d56000b03ca..7badcb395ea70 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -226,6 +226,7 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); + paddle::framework::InitDefaultKernelSignatureMap(); // TODO(wilber): we need to release memory occupied by weights. 
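In the prepared_operator hunks above, PreparedOp now keeps a const phi::Kernel& into factory-owned storage (with a static empty_kernel standing in for a miss) and takes the KernelSignature by rvalue reference, so kernel selection no longer copies Kernel objects on every call. The following self-contained sketch shows the same lookup-by-const-reference pattern with stand-in types; KernelRegistry, Register, and Select are illustrative names, not the phi API.

```cpp
// Stand-in types only: lookup that returns a const reference into
// registry-owned storage, with a static "empty" sentinel for misses.
#include <iostream>
#include <string>
#include <unordered_map>

struct Kernel {
  void* fn = nullptr;
  bool IsValid() const { return fn != nullptr; }
};

class KernelRegistry {
 public:
  static KernelRegistry& Instance() {
    static KernelRegistry registry;
    return registry;
  }
  void Register(const std::string& name, Kernel kernel) {
    kernels_[name] = kernel;
  }
  // Hit: a reference into the map. Miss: the shared empty sentinel.
  // Either way the caller copies nothing.
  const Kernel& Select(const std::string& name) const {
    auto it = kernels_.find(name);
    return it == kernels_.end() ? empty_kernel_ : it->second;
  }

 private:
  static const Kernel empty_kernel_;
  std::unordered_map<std::string, Kernel> kernels_;
};

const Kernel KernelRegistry::empty_kernel_{};

int main() {
  int dummy = 0;
  KernelRegistry::Instance().Register("concat", Kernel{&dummy});
  const Kernel& hit = KernelRegistry::Instance().Select("concat");
  const Kernel& miss = KernelRegistry::Instance().Select("not_registered");
  std::cout << hit.IsValid() << " " << miss.IsValid() << "\n";  // prints: 1 0
  return 0;
}
```

Because a reference is returned, the miss sentinel must have static storage duration; returning a reference to a local temporary would dangle.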
scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 73d14f215e2ab..1c4369af646af 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -92,6 +92,7 @@ bool NativePaddlePredictor::Init( "The sub_scope should not be nullptr.")); } else { paddle::framework::InitDevices(); + paddle::framework::InitDefaultKernelSignatureMap(); scope_.reset(new paddle::framework::Scope()); } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 03a244a457cd0..eb44655c88f18 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -517,10 +517,8 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ctx->HasInputs(kOutputs); ctx->HasInputs(framework::GradVarName(kOutputs)); auto pg_ig_names = ctx->Outputs(kXGRAD); - std::vector in_var_ptrs = - ctx->GetInputVarPtrs(kX); - std::vector out_var_ptrs = - ctx->GetOutputVarPtrs(kXGRAD); + auto in_var_ptrs = ctx->GetInputVarPtrs(kX); + auto out_var_ptrs = ctx->GetOutputVarPtrs(kXGRAD); PADDLE_ENFORCE_EQ(in_var_ptrs.size(), out_var_ptrs.size(), platform::errors::InvalidArgument( "The size of Inputs(X) must be the same as " diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 44f602237da2e..92c9ab34aa454 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -63,10 +63,8 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel { context->ShareLoD("MultiLevelRois", "FpnRois"); } if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) { - std::vector roi_inputs = - context->GetInputVarPtrs("MultiLevelRois"); - std::vector score_inputs = - context->GetInputVarPtrs("MultiLevelScores"); + auto roi_inputs = context->GetInputVarPtrs("MultiLevelRois"); + auto score_inputs = context->GetInputVarPtrs("MultiLevelScores"); for (size_t i = 0; i < roi_inputs.size(); ++i) { framework::Variable *roi_var = BOOST_GET(framework::Variable *, roi_inputs[i]); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index e09c205db14e7..4caf51ecc4bf8 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -60,6 +60,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/uva_utils.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/type_defs.h" +#include "paddle/phi/core/type_defs.h" namespace paddle { namespace pybind { @@ -2027,26 +2028,35 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) - .def("_get_kernel_signature", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs) { - // TODO(xiongkun): move this function outside of tracer. 
- auto ins_map = ConvertToNameTensorMap(ins); - auto outs_map = ConvertToNameTensorMap(outs); - { - auto to_vector = [](paddle::SmallVector &vec) { - return std::vector(vec.begin(), vec.end()); - }; - auto ret = self.GetExpectedKernelSignature(type, ins_map, - outs_map, attrs); - auto kernelsig_ins = to_vector(std::get<0>(ret.args)); - auto kernelsig_attrs = to_vector(std::get<1>(ret.args)); - auto kernelsig_outs = to_vector(std::get<2>(ret.args)); - return std::make_tuple(kernelsig_ins, kernelsig_attrs, - kernelsig_outs); - } - }) + .def( + "_get_kernel_signature", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs) { + // TODO(xiongkun): move this function outside of tracer. + auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); + { + auto input_to_vector = + [](paddle::SmallVector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto output_to_vector = + [](paddle::SmallVector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto attr_to_vector = [](paddle::SmallVector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto ret = self.GetExpectedKernelSignature(type, ins_map, + outs_map, attrs); + auto kernelsig_ins = input_to_vector(std::get<0>(ret.args)); + auto kernelsig_attrs = attr_to_vector(std::get<1>(ret.args)); + auto kernelsig_outs = output_to_vector(std::get<2>(ret.args)); + return std::make_tuple(kernelsig_ins, kernelsig_attrs, + kernelsig_outs); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d6071617224c2..a303951d8596d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2941,6 +2941,8 @@ All parameter, weight, gradient are variables in Paddle. 
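The _get_kernel_signature binding above now converts three differently typed paddle::SmallVector ranges of const char* names into std::vector<std::string> before handing them to Python, one small lambda per range. A minimal stand-alone version of that conversion step, with plain std::vector standing in for SmallVector and made-up sample names:

```cpp
// Stand-alone conversion sketch; the container type and names are
// assumptions, not the pybind code itself.
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Pretend these were pulled out of a KernelSignature's args tuple.
  std::vector<const char*> kernelsig_ins = {"X", "Label"};

  auto to_string_vector = [](const std::vector<const char*>& vec) {
    // A std::string is constructed from each const char* on the way out,
    // which is what the Python-facing layer expects.
    return std::vector<std::string>(vec.begin(), vec.end());
  };

  std::vector<std::string> ins = to_string_vector(kernelsig_ins);
  for (const auto& name : ins) {
    std::cout << name << "\n";
  }
  return 0;
}
```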
framework::LoadOpMetaInfoAndRegisterOp(dso_name)); }); m.def("init_devices", []() { framework::InitDevices(); }); + m.def("init_default_kernel_signatures", + []() { framework::InitDefaultKernelSignatureMap(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index a26e8e2dca570..b1aa81260968f 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include #include "paddle/infrt/dialect/phi/data_type.h" +#include "paddle/phi/core/type_defs.h" #include "paddle/phi/kernels/declarations.h" namespace infrt { @@ -92,10 +93,10 @@ std::vector GetCandidateKernels( phi_kernel_desc.input_types.clear(); phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); - const paddle::SmallVector& input_arg = - args_def.input_defs(); - const paddle::SmallVector& output_arg = - args_def.output_defs(); + const paddle::SmallVector& + input_arg = args_def.input_defs(); + const paddle::SmallVector& + output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index ecd118818099d..1834cb4c0db05 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -91,6 +91,7 @@ using ValueVariantType = std::vector<::phi::DenseTensor*>, paddle::experimental::ScalarBase<::phi::DenseTensor>, paddle::experimental::IntArrayBase<::phi::DenseTensor>, + std::vector, std::vector<::phi::MetaTensor*>, ::phi::MetaConfig, paddle::experimental::Backend, diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 71cec01141164..122ebed21942a 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -19,45 +19,33 @@ limitations under the License. 
*/ #include #include "paddle/phi/common/place.h" +#include "paddle/phi/core/type_defs.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" namespace phi { -constexpr char kGradVarSuffix[] = "@GRAD"; - -constexpr size_t kGradVarSuffixSize = 5U; - -inline std::string GradVarName(const std::string& var_name) { - std::string result; - result.reserve(var_name.size() + kGradVarSuffixSize); - result += var_name; - result += kGradVarSuffix; - return result; -} - // tuple(input_names, attr_names, output_names) -using KernelArgsTuple = std::tuple, - paddle::SmallVector, - paddle::SmallVector>; +using KernelArgsTuple = std::tuple, + paddle::SmallVector, + paddle::SmallVector>; struct KernelSignature { - std::string name; + const char* name; KernelArgsTuple args; KernelSignature() = default; - KernelSignature(std::string&& kernel_name, - paddle::SmallVector&& inputs, - paddle::SmallVector&& attrs, - paddle::SmallVector&& outputs) - : name(std::move(kernel_name)), - args(std::make_tuple(inputs, attrs, outputs)) {} - KernelSignature(const std::string& kernel_name, - const paddle::SmallVector& inputs, - const paddle::SmallVector& attrs, - const paddle::SmallVector& outputs) + KernelSignature(const char* kernel_name, + paddle::SmallVector&& inputs, + paddle::SmallVector&& attrs, + paddle::SmallVector&& outputs) + : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {} + KernelSignature(const char* kernel_name, + const paddle::SmallVector& inputs, + const paddle::SmallVector& attrs, + const paddle::SmallVector& outputs) : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {} // TODO(chenweihang): add assign constructor to solve windows compile diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 43febb2ac0430..4fa11ac7860ef 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -102,7 +102,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { } } -std::string TransToPhiKernelName(const std::string& fluid_op_name) { +const std::string& TransToPhiKernelName(const std::string& fluid_op_name) { return OpUtilsMap::Instance().GetBaseKernelName(fluid_op_name); } diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index 621459764873e..5982ab0deff83 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace phi { -std::string TransToPhiKernelName(const std::string& fluid_op_name); +const std::string& TransToPhiKernelName(const std::string& fluid_op_name); const std::string& TransToFluidOpName(const std::string& phi_kernel_name); Backend TransToPhiBackend(const phi::Place& place); diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 6716f4791803d..9c926fa871b67 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -26,6 +26,8 @@ limitations under the License. 
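The arg_map_context change above stores the kernel name as a const char* and, together with the *_sig.cc hunks later in the patch, replaces GradVarName("Out") calls with literal "Out@GRAD" names, so no std::string is built while mapping arguments. A small sketch of the difference, where GradVarName and MiniKernelSignature are local stand-ins rather than the real fluid/phi definitions:

```cpp
// Stand-in definitions for illustration only.
#include <iostream>
#include <string>

inline std::string GradVarName(const std::string& var_name) {
  return var_name + "@GRAD";  // builds a fresh std::string on every call
}

struct MiniKernelSignature {
  const char* name;         // points at a string literal, no allocation
  const char* grad_output;  // e.g. "Out@GRAD", the suffix is in the literal
};

int main() {
  // Old style: concatenate the suffix at runtime.
  std::cout << GradVarName("Out") << "\n";
  // New style: the "@GRAD" name is a compile-time literal.
  MiniKernelSignature sig{"abs_grad", "Out@GRAD"};
  std::cout << sig.name << " reads " << sig.grad_output << "\n";
  return 0;
}
```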
*/ namespace phi { +const static std::string deprecated_kernel_name = "deprecated"; // NOLINT + const std::unordered_set standard_kernel_suffixs({ "sr", // SelectedRows kernel "raw" // fallback kernel of origfinal fluid op @@ -134,9 +136,9 @@ class OpUtilsMap { arg_mapping_fn_map_.insert({std::move(op_type), std::move(fn)}); } - std::string GetBaseKernelName(const std::string& op_type) const { + const std::string& GetBaseKernelName(const std::string& op_type) const { if (deprecated_op_names.find(op_type) != deprecated_op_names.end()) { - return "deprecated"; + return deprecated_kernel_name; } auto it = base_kernel_name_map_.find(op_type); if (it == base_kernel_name_map_.end()) { @@ -150,7 +152,7 @@ class OpUtilsMap { auto it = arg_mapping_fn_map_.find(op_type); if (it == arg_mapping_fn_map_.end()) { auto func = - [op_type](const ArgumentMappingContext& ctx) -> KernelSignature { + [&op_type](const ArgumentMappingContext& ctx) -> KernelSignature { return DefaultKernelSignatureMap::Instance().Get(op_type); }; return func; diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index 0496d727e8d3b..70f26102cbad1 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -20,14 +20,12 @@ void InferMetaContext::SetMetaConfig(MetaConfig config) { config_ = std::move(config); } -void InferMetaContext::EmplaceBackInput( - std::shared_ptr input) { +void InferMetaContext::EmplaceBackInput(MetaTensor input) { int index = inputs_.size(); inputs_.emplace_back(std::move(input)); input_range_.emplace_back(std::pair(index, index + 1)); } -void InferMetaContext::EmplaceBackOutput( - std::shared_ptr output) { +void InferMetaContext::EmplaceBackOutput(MetaTensor output) { int index = outputs_.size(); outputs_.emplace_back(std::move(output)); output_range_.emplace_back(std::pair(index, index + 1)); @@ -37,7 +35,7 @@ void InferMetaContext::EmplaceBackAttr(paddle::any attr) { } void InferMetaContext::EmplaceBackInputs( - paddle::SmallVector> inputs) { + paddle::SmallVector inputs) { int index = inputs_.size(); input_range_.emplace_back(std::pair(index, index + inputs.size())); inputs_.insert(inputs_.end(), @@ -45,7 +43,7 @@ void InferMetaContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } void InferMetaContext::EmplaceBackOutputs( - paddle::SmallVector> outputs) { + paddle::SmallVector outputs) { int index = outputs_.size(); output_range_.emplace_back( std::pair(index, index + outputs.size())); @@ -64,24 +62,25 @@ const std::pair& InferMetaContext::OutputRangeAt(size_t idx) const { const MetaConfig& InferMetaContext::GetMetaConfig() const { return config_; } const MetaTensor& InferMetaContext::InputAt(size_t idx) const { - return *inputs_.at(idx); + return inputs_.at(idx); } -paddle::optional InferMetaContext::OptionalInputAt( +paddle::optional InferMetaContext::OptionalInputAt( size_t idx) const { const auto& input = inputs_.at(idx); - return input ? paddle::optional{static_cast< - const phi::MetaTensor&>(*input)} - : paddle::optional{paddle::none}; + return input.initialized() + ? paddle::optional{input} + : paddle::optional{paddle::none}; } -std::vector InferMetaContext::InputsBetween(size_t start, - size_t end) const { - std::vector result; +std::vector InferMetaContext::InputsBetween( + size_t start, size_t end) const { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.push_back(inputs_.at(i).get()); + auto& in = inputs_.at(i); + result.emplace_back(in.initialized() ? 
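GetBaseKernelName above now returns const std::string& and therefore needs the file-scope deprecated_kernel_name object: a reference to a temporary built from the "deprecated" literal would dangle as soon as the function returned. A compact stand-alone illustration of that constraint, where the lookup table and names are assumptions rather than the OpUtilsMap internals:

```cpp
// The point of the sketch is the lifetime of the fallback value.
#include <iostream>
#include <string>
#include <unordered_map>

static const std::string kDeprecated = "deprecated";

const std::string& GetBaseKernelName(
    const std::unordered_map<std::string, std::string>& table,
    const std::string& op_type) {
  auto it = table.find(op_type);
  if (it == table.end()) {
    return kDeprecated;  // static storage duration, safe to reference
  }
  return it->second;  // reference into the caller-owned table
}

int main() {
  std::unordered_map<std::string, std::string> table{{"reduce_sum", "sum"}};
  std::cout << GetBaseKernelName(table, "reduce_sum") << "\n";   // sum
  std::cout << GetBaseKernelName(table, "some_old_op") << "\n";  // deprecated
  return 0;
}
```

The same reasoning applies to TransToPhiKernelName, which now forwards that reference instead of returning a copy of the string.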
&in : nullptr); } return result; @@ -91,12 +90,13 @@ paddle::optional> InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { const auto& first = inputs_.at(start); - if (first) { + if (first.initialized()) { std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.push_back(inputs_.at(i).get()); + auto& in = inputs_.at(i); + result.emplace_back(in.initialized() ? &in : nullptr); } return paddle::optional>(result); @@ -105,7 +105,8 @@ InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { } MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { - return outputs_.at(idx).get(); + auto& out = outputs_.at(idx); + return out.initialized() ? &out : nullptr; } std::vector InferMetaContext::MutableOutputBetween(size_t start, @@ -113,7 +114,8 @@ std::vector InferMetaContext::MutableOutputBetween(size_t start, std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(outputs_.at(i).get()); + auto& out = outputs_.at(i); + result.emplace_back(out.initialized() ? &out : nullptr); } return result; } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index fad437f82c331..699c38ebd4702 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -37,28 +37,28 @@ class InferMetaContext { explicit InferMetaContext(MetaConfig config) : config_(config) {} void SetMetaConfig(MetaConfig config); - void EmplaceBackInput(std::shared_ptr input); - void EmplaceBackOutput(std::shared_ptr output); + const MetaConfig& GetMetaConfig() const; + + void EmplaceBackInput(MetaTensor input); + void EmplaceBackOutput(MetaTensor output); void EmplaceBackAttr(paddle::any attr); void EmplaceBackInputs( - paddle::SmallVector> inputs); + paddle::SmallVector inputs); void EmplaceBackOutputs( - paddle::SmallVector> outputs); + paddle::SmallVector outputs); - const std::pair& InputRangeAt(size_t idx) const; - const std::pair& OutputRangeAt(size_t idx) const; + virtual const MetaTensor& InputAt(size_t idx) const; + virtual paddle::optional OptionalInputAt(size_t idx) const; - const MetaConfig& GetMetaConfig() const; - - const MetaTensor& InputAt(size_t idx) const; - paddle::optional OptionalInputAt(size_t idx) const; - std::vector InputsBetween(size_t start, size_t end) const; - paddle::optional> + virtual std::vector InputsBetween(size_t start, + size_t end) const; + virtual paddle::optional> OptionalInputsBetween(size_t start, size_t end) const; - MetaTensor* MutableOutputAt(size_t idx); - std::vector MutableOutputBetween(size_t start, size_t end); + virtual MetaTensor* MutableOutputAt(size_t idx); + virtual std::vector MutableOutputBetween(size_t start, + size_t end); template AttrType AttrAt(size_t idx) { @@ -73,19 +73,24 @@ class InferMetaContext { } } - private: + const std::pair& InputRangeAt(size_t idx) const; + const std::pair& OutputRangeAt(size_t idx) const; + + virtual ~InferMetaContext() = default; + + protected: MetaConfig config_; - // NOTE(chenweihang): Because the MetaTensor is a base class, and MetaTensor - // objects are all created in each round, so we have to use smart pointer - // here, maybe we can implemented a new InferMetaContext and a series utils - // specifically for fluid to avoid using shared_ptr - paddle::SmallVector> inputs_; - paddle::SmallVector> outputs_; - paddle::SmallVector attrs_; + paddle::SmallVector attrs_; - paddle::SmallVector> input_range_; - paddle::SmallVector> output_range_; + 
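The infermeta_utils changes above stop wrapping every MetaTensor in a shared_ptr: the context stores MetaTensor objects by value and uses initialized() to represent an absent optional input or output. The sketch below mimics that ownership model with stand-in types; FakeTensor and FakeMetaTensor are not phi classes, they only copy the "value handle over a non-owning pointer" layout.

```cpp
// Stand-in types mirroring a by-value meta-tensor handle.
#include <iostream>
#include <vector>

struct FakeTensor {
  int rank = 0;
};

class FakeMetaTensor {
 public:
  FakeMetaTensor() = default;  // an empty slot, e.g. an optional input
  explicit FakeMetaTensor(FakeTensor* t) : tensor_(t) {}
  bool initialized() const { return tensor_ != nullptr; }
  int rank() const { return tensor_->rank; }

 private:
  FakeTensor* tensor_ = nullptr;  // non-owning, cheap to copy
};

int main() {
  FakeTensor x{3};
  std::vector<FakeMetaTensor> inputs;  // stored by value, no shared_ptr
  inputs.emplace_back(&x);
  inputs.emplace_back();               // optional input that was not fed
  for (const auto& in : inputs) {
    if (in.initialized()) {
      std::cout << "rank " << in.rank() << "\n";
    } else {
      std::cout << "missing optional input\n";
    }
  }
  return 0;
}
```

Storing the handles by value is what the accompanying meta_tensor.h change enables by restoring copy and move assignment.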
paddle::SmallVector, phi::kInputSmallVectorSize> + input_range_; + paddle::SmallVector, phi::kOutputSmallVectorSize> + output_range_; + + private: + paddle::SmallVector inputs_; + paddle::SmallVector outputs_; }; #define PD_INFER_META(...) \ @@ -159,7 +164,7 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper&, Tail...> { + struct InferMetaFnCallHelper&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static_assert(attr_idx == 0, @@ -167,7 +172,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - std::vector arg = + std::vector arg = ctx->InputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index 234e3528c363b..cf862cbde18f9 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -79,7 +79,7 @@ void KernelContext::EmplaceBackAttr(paddle::any attr) { void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { if (idx < input_range_.size()) { - input_range_[idx] = range; + input_range_[idx] = std::move(range); } else if (idx == input_range_.size()) { input_range_.emplace_back(range); } else { @@ -93,7 +93,7 @@ void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { void KernelContext::AssignOutputRange(std::pair&& range, size_t idx) { if (idx < output_range_.size()) { - output_range_[idx] = range; + output_range_[idx] = std::move(range); } else if (idx == output_range_.size()) { output_range_.emplace_back(range); } else { diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index a1ce90c2c78ae..d3fd2e0204e54 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -19,6 +19,8 @@ namespace phi { +const static Kernel empty_kernel; // NOLINT + uint32_t KernelKey::Hash::operator()(const KernelKey& key) const { uint32_t hash_value = 0; // |----31-20------|---19-12---|---11-8----|---7-0---| @@ -37,15 +39,15 @@ KernelFactory& KernelFactory::Instance() { return g_op_kernel_factory; } -Kernel KernelFactory::SelectKernel(const std::string& kernel_name, - const KernelKey& kernel_key) const { +const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name, + const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); if (iter == kernels_.end()) { - return Kernel(); + return empty_kernel; } auto kernel_iter = iter->second.find(kernel_key); if (kernel_iter == iter->second.end()) { - return Kernel(); + return empty_kernel; } return kernel_iter->second; } @@ -59,8 +61,8 @@ KernelKeyMap KernelFactory::SelectKernelMap( return iter->second; } -bool KernelFactory::IsSelectKernelValid(const std::string& kernel_name, - const KernelKey& kernel_key) const { +bool KernelFactory::HasKernel(const std::string& kernel_name, + const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); PADDLE_ENFORCE_NE( iter, @@ -128,6 +130,16 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( KernelKey(backend, layout, dtype)); } +const KernelArgsDef& KernelFactory::GetFirstKernelArgsDef( + const std::string& kernel_name) const { + auto iter = kernels_.find(kernel_name); + PADDLE_ENFORCE_NE( + iter, + kernels_.end(), + phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); + return iter->second.cbegin()->second.args_def(); +} + // print kernel info with json 
format: // { // "(CPU, Undefined(AnyLayout), complex64)": { diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 8fd25b691bdeb..812b6222cb5e2 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -151,30 +151,38 @@ class KernelArgsDef { attribute_defs_.emplace_back(AttributeArgDef(type_index)); } - const paddle::SmallVector& input_defs() const { + const paddle::SmallVector& input_defs() + const { return input_defs_; } - const paddle::SmallVector& output_defs() const { + const paddle::SmallVector& output_defs() + const { return output_defs_; } - const paddle::SmallVector& attribute_defs() const { + const paddle::SmallVector& + attribute_defs() const { return attribute_defs_; } - paddle::SmallVector& input_defs() { return input_defs_; } + paddle::SmallVector& input_defs() { + return input_defs_; + } - paddle::SmallVector& output_defs() { return output_defs_; } + paddle::SmallVector& output_defs() { + return output_defs_; + } - paddle::SmallVector& attribute_defs() { + paddle::SmallVector& attribute_defs() { return attribute_defs_; } private: - paddle::SmallVector input_defs_{{}}; - paddle::SmallVector output_defs_{{}}; - paddle::SmallVector attribute_defs_{{}}; + paddle::SmallVector input_defs_{{}}; + paddle::SmallVector output_defs_{{}}; + paddle::SmallVector attribute_defs_{ + {}}; }; class Kernel { @@ -209,7 +217,7 @@ class Kernel { TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } - bool IsValid() { return fn_ != nullptr; } + bool IsValid() const { return fn_ != nullptr; } private: KernelFn fn_{nullptr}; @@ -246,14 +254,17 @@ class KernelFactory { DataLayout layout, DataType dtype) const; - bool IsSelectKernelValid(const std::string& kernel_name, - const KernelKey& kernel_key) const; + bool HasKernel(const std::string& kernel_name, + const KernelKey& kernel_key) const; - Kernel SelectKernel(const std::string& kernel_name, - const KernelKey& kernel_key) const; + const Kernel& SelectKernel(const std::string& kernel_name, + const KernelKey& kernel_key) const; KernelKeyMap SelectKernelMap(const std::string& kernel_name) const; + const KernelArgsDef& GetFirstKernelArgsDef( + const std::string& kernel_name) const; + private: KernelFactory() = default; diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 04dfbf96031c2..2178855aa0fee 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -148,4 +148,6 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { } } +bool MetaTensor::initialized() const { return tensor_ != nullptr; } + } // namespace phi diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 10c3a7c1a3de3..3cdbfda61d69c 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -45,10 +45,10 @@ class MetaTensor { : tensor_(const_cast(&tensor)) {} MetaTensor(TensorBase& tensor) : tensor_(&tensor) {} // NOLINT - MetaTensor(const MetaTensor&) = default; MetaTensor(MetaTensor&&) = default; - MetaTensor& operator=(const MetaTensor&) = delete; - MetaTensor& operator=(MetaTensor&&) = delete; + MetaTensor& operator=(MetaTensor&&) = default; + MetaTensor(const MetaTensor&) = default; + MetaTensor& operator=(const MetaTensor&) = default; virtual ~MetaTensor() = default; @@ -64,6 +64,8 @@ class MetaTensor { virtual void share_meta(const MetaTensor& meta_tensor); virtual void share_dims(const MetaTensor& meta_tensor); + virtual bool initialized() const; + private: // Because the lod in 
compiletime and runtime is different, // so `LoD` cannot in public methods diff --git a/paddle/phi/core/type_defs.h b/paddle/phi/core/type_defs.h index 3c879267bb844..a1e7836088389 100644 --- a/paddle/phi/core/type_defs.h +++ b/paddle/phi/core/type_defs.h @@ -22,7 +22,7 @@ class Kernel; class KernelKey; class KernelArgsDef; class KernelContext; -class KernelSignature; +struct KernelSignature; class ArgumentMappingContext; class InferMetaContext; @@ -35,4 +35,9 @@ using ArgumentMappingFn = std::function; using InferMetaFn = void (*)(InferMetaContext* ctx); +// Global SmallVector size setting +constexpr size_t kInputSmallVectorSize = 10U; +constexpr size_t kAttrSmallVectorSize = 10U; +constexpr size_t kOutputSmallVectorSize = 5U; + } // namespace phi diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 84db67978fc23..567f39a915c02 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -315,8 +315,8 @@ void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, dx->share_meta(x); } -void MeshgridGradInferMeta(const std::vector& inputs, - const std::vector& outputs_grad, +void MeshgridGradInferMeta(const std::vector& inputs, + const std::vector& outputs_grad, std::vector inputs_grad) { PADDLE_ENFORCE_GT(outputs_grad.size(), 1, @@ -329,7 +329,7 @@ void MeshgridGradInferMeta(const std::vector& inputs, } } -void MultiDotGradInferMeta(const std::vector& x, +void MultiDotGradInferMeta(const std::vector& x, const MetaTensor& out_grad, std::vector x_grad) { PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c51708bb54394..6807438ebbb75 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -151,11 +151,11 @@ void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, bool adaptive, MetaTensor* dx); -void MeshgridGradInferMeta(const std::vector& inputs, - const std::vector& outputs_grad, +void MeshgridGradInferMeta(const std::vector& inputs, + const std::vector& outputs_grad, std::vector inputs_grad); -void MultiDotGradInferMeta(const std::vector& x, +void MultiDotGradInferMeta(const std::vector& x, const MetaTensor& out_grad, std::vector x_grad); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 5fecd3740e930..6cf805bc1a127 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -21,7 +21,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { -std::vector GetMetaTensorsDim(const std::vector& tensors) { +std::vector GetMetaTensorsDim( + const std::vector& tensors) { std::vector dims; dims.reserve(tensors.size()); for (const MetaTensor* tensor : tensors) { @@ -279,7 +280,7 @@ void AdamwInferMeta(const MetaTensor& param, master_param_outs); } -void AddNInferMeta(const std::vector& x, +void AddNInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config) { auto N = x.size(); @@ -642,7 +643,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void BroadcastTensorsInferMeta(const std::vector& x, +void BroadcastTensorsInferMeta(const std::vector& x, std::vector out) { int target_rank = 0; const auto& input_dims = GetMetaTensorsDim(x); @@ -696,7 +697,7 @@ void BroadcastTensorsInferMeta(const std::vector& x, } } -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { @@ -1488,7 +1489,7 @@ void InterpolateInferMeta( } } -void MeshgridInferMeta(const std::vector& inputs, +void MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); @@ -1551,7 +1552,8 @@ void MomentumInferMeta(const MetaTensor& param, } } -void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { +void MultiDotInferMeta(const std::vector& x, + MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); const size_t inputs_num = inputs_dims.size(); @@ -1624,7 +1626,7 @@ void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { out->share_lod(*x.at(0)); } -void MultiplexInferMeta(const std::vector& ins, +void MultiplexInferMeta(const std::vector& ins, const MetaTensor& ids, MetaTensor* out) { PADDLE_ENFORCE_NE( @@ -1803,8 +1805,8 @@ void RmspropInferMeta(const MetaTensor& param, } void RnnInferMeta(const MetaTensor& x, - const std::vector& pre_state, - const std::vector& weight_list, + const std::vector& pre_state, + const std::vector& weight_list, paddle::optional sequence_length, float dropout_prob, bool is_bidirec, @@ -1910,7 +1912,7 @@ void SgdInferMeta(const MetaTensor& param, param_out->set_dtype(param.dtype()); } -void StackInferMeta(const std::vector& x, +void StackInferMeta(const std::vector& x, int axis, MetaTensor* out) { PADDLE_ENFORCE_GT(x.size(), @@ -1956,7 +1958,7 @@ void StackInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } -void UnchangedMultiInferMeta(const std::vector& x, +void UnchangedMultiInferMeta(const std::vector& x, std::vector out) { for (size_t i = 0; i < x.size(); ++i) { out[i]->share_meta(*x[i]); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 9137b574ac09d..557855219bb51 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -35,7 +35,8 @@ namespace phi { // // NOTE: The InferMeta Functions in this file are arranged in alphabetic order. 
-std::vector GetMetaTensorsDim(const std::vector& tensors); +std::vector GetMetaTensorsDim( + const std::vector& tensors); void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, @@ -117,7 +118,7 @@ void AdamwInferMeta(const MetaTensor& param, MetaTensor* beta2_pow_out, MetaTensor* master_param_outs); -void AddNInferMeta(const std::vector& x, +void AddNInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -173,10 +174,10 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void BroadcastTensorsInferMeta(const std::vector& x, +void BroadcastTensorsInferMeta(const std::vector& x, std::vector out); -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -227,7 +228,7 @@ void InterpolateInferMeta( MetaTensor* output, MetaConfig config = MetaConfig()); -void MeshgridInferMeta(const std::vector& inputs, +void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); void MomentumInferMeta(const MetaTensor& param, @@ -245,9 +246,10 @@ void MomentumInferMeta(const MetaTensor& param, MetaTensor* velocity_out, MetaTensor* master_param_out); -void MultiDotInferMeta(const std::vector& x, MetaTensor* out); +void MultiDotInferMeta(const std::vector& x, + MetaTensor* out); -void MultiplexInferMeta(const std::vector& ins, +void MultiplexInferMeta(const std::vector& ins, const MetaTensor& ids, MetaTensor* out); @@ -276,8 +278,8 @@ void RmspropInferMeta(const MetaTensor& param, MetaTensor* mean_grad_out); void RnnInferMeta(const MetaTensor& x, - const std::vector& pre_state, - const std::vector& weight_list, + const std::vector& pre_state, + const std::vector& weight_list, paddle::optional sequence_length, float dropout_prob, bool is_bidirec, @@ -300,11 +302,11 @@ void SgdInferMeta(const MetaTensor& param, MetaTensor* param_out, MetaTensor* master_param_out); -void StackInferMeta(const std::vector& x, +void StackInferMeta(const std::vector& x, int axis, MetaTensor* out); -void UnchangedMultiInferMeta(const std::vector& x, +void UnchangedMultiInferMeta(const std::vector& x, std::vector out); void WarpctcInferMeta(const MetaTensor& logits, diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index cf83ab9aaabe1..f5ac2d3cbb75e 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -32,7 +32,7 @@ DenseTensor Concat(const Context& dev_ctx, const Scalar& axis) { std::vector meta_x; meta_x.reserve(x.size()); - std::vector meta_x_ptr; + std::vector meta_x_ptr; for (const auto* t : x) { meta_x.emplace_back(*t); meta_x_ptr.push_back(&meta_x.back()); diff --git a/paddle/phi/ops/compat/abs_sig.cc b/paddle/phi/ops/compat/abs_sig.cc index b4b94457e6be9..92d29dd0189b5 100644 --- a/paddle/phi/ops/compat/abs_sig.cc +++ b/paddle/phi/ops/compat/abs_sig.cc @@ -21,8 +21,7 @@ KernelSignature AbsOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature AbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "abs_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("abs_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } KernelSignature AbsDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 8add832c366cf..5900b49946623 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ 
b/paddle/phi/ops/compat/activation_sig.cc @@ -19,26 +19,22 @@ namespace phi { #define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \ KernelSignature func_name##GradOpArgumentMapping( \ const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"X", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ + return KernelSignature( \ + op_name "_grad", {"X", "Out@GRAD"}, {attrs}, {"X@GRAD"}); \ } #define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \ KernelSignature func_name##GradOpArgumentMapping( \ const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"Out", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ + return KernelSignature( \ + op_name "_grad", {"Out", "Out@GRAD"}, {attrs}, {"X@GRAD"}); \ } -#define DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {GradVarName("Out")}, {attrs}, {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature( \ + op_name "_grad", {"Out@GRAD"}, {attrs}, {"X@GRAD"}); \ } #define comma , @@ -165,15 +161,12 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "logit_grad", {"X", GradVarName("Out")}, {"eps"}, {GradVarName("X")}); + return KernelSignature("logit_grad", {"X", "Out@GRAD"}, {"eps"}, {"X@GRAD"}); } KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("elu_grad", - {"X", "Out", GradVarName("Out")}, - {"alpha"}, - {GradVarName("X")}); + return KernelSignature( + "elu_grad", {"X", "Out", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); } KernelSignature EluDoubleGradOpArgumentMapping( @@ -198,13 +191,11 @@ KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature PowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("FactorTensor")) { - return KernelSignature("pow_grad", - {"X", GradVarName("Out")}, - {"FactorTensor"}, - {GradVarName("X")}); + return KernelSignature( + "pow_grad", {"X", "Out@GRAD"}, {"FactorTensor"}, {"X@GRAD"}); } else { return KernelSignature( - "pow_grad", {"X", GradVarName("Out")}, {"factor"}, {GradVarName("X")}); + "pow_grad", {"X", "Out@GRAD"}, {"factor"}, {"X@GRAD"}); } } diff --git a/paddle/phi/ops/compat/adam_sig.cc b/paddle/phi/ops/compat/adam_sig.cc index 0bc31cd28cb6c..958538cd7dfc2 100644 --- a/paddle/phi/ops/compat/adam_sig.cc +++ b/paddle/phi/ops/compat/adam_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector in_names = {"Param", + paddle::SmallVector in_names = {"Param", "Grad", "LearningRate", "Moment1", @@ -28,13 +28,13 @@ KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) { "Beta2Pow", "MasterParam", "SkipUpdate"}; - paddle::SmallVector out_names = {"ParamOut", + paddle::SmallVector out_names = {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; - paddle::SmallVector attr_names; + paddle::SmallVector attr_names; attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? 
"Beta1Tensor" : "beta1"); diff --git a/paddle/phi/ops/compat/adamw_sig.cc b/paddle/phi/ops/compat/adamw_sig.cc index 763304bdf3511..e417aa30ba493 100644 --- a/paddle/phi/ops/compat/adamw_sig.cc +++ b/paddle/phi/ops/compat/adamw_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector in_names = {"Param", + paddle::SmallVector in_names = {"Param", "Grad", "LearningRate", "Moment1", @@ -28,13 +28,13 @@ KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { "Beta2Pow", "MasterParam", "SkipUpdate"}; - paddle::SmallVector out_names = {"ParamOut", + paddle::SmallVector out_names = {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; - paddle::SmallVector attr_names; + paddle::SmallVector attr_names; attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" : "beta1"); diff --git a/paddle/phi/ops/compat/addmm_sig.cc b/paddle/phi/ops/compat/addmm_sig.cc index b3bc0bb23a71e..3919c875f5606 100644 --- a/paddle/phi/ops/compat/addmm_sig.cc +++ b/paddle/phi/ops/compat/addmm_sig.cc @@ -17,11 +17,10 @@ namespace phi { KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "addmm_grad", - {"Input", "X", "Y", GradVarName("Out")}, - {"Alpha", "Beta"}, - {GradVarName("Input"), GradVarName("X"), GradVarName("Y")}); + return KernelSignature("addmm_grad", + {"Input", "X", "Y", "Out@GRAD"}, + {"Alpha", "Beta"}, + {"Input@GRAD", "X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/argsort_sig.cc b/paddle/phi/ops/compat/argsort_sig.cc index 62133a441ff12..70531f16916dd 100644 --- a/paddle/phi/ops/compat/argsort_sig.cc +++ b/paddle/phi/ops/compat/argsort_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature ArgsortGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("argsort_grad", - {"Indices", "X", GradVarName("Out")}, + {"Indices", "X", "Out@GRAD"}, {"axis", "descending"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/atan2_sig.cc b/paddle/phi/ops/compat/atan2_sig.cc index 8a6049e67b668..9fef8560df979 100644 --- a/paddle/phi/ops/compat/atan2_sig.cc +++ b/paddle/phi/ops/compat/atan2_sig.cc @@ -17,10 +17,8 @@ namespace phi { KernelSignature Atan2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("atan2_grad", - {"X1", "X2", GradVarName("Out")}, - {}, - {GradVarName("X1"), GradVarName("X2")}); + return KernelSignature( + "atan2_grad", {"X1", "X2", "Out@GRAD"}, {}, {"X1@GRAD", "X2@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index cfd9f4def933a..14affe60b9d55 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -57,27 +57,26 @@ KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature BatchNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "batch_norm_grad", - { - "X", - "Scale", - "Bias", - "Mean", - "Variance", - "SavedMean", - "SavedVariance", - "ReserveSpace", - GradVarName("Y"), - }, - {"momentum", - "epsilon", - "data_layout", - "is_test", - "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, - {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); + return KernelSignature("batch_norm_grad", + { + "X", + "Scale", + "Bias", + "Mean", + "Variance", + 
"SavedMean", + "SavedVariance", + "ReserveSpace", + "Y@GRAD", + }, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } KernelSignature BatchNormGradGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/bce_loss_sig.cc b/paddle/phi/ops/compat/bce_loss_sig.cc index 17f76067d22db..5575fa277eb7f 100644 --- a/paddle/phi/ops/compat/bce_loss_sig.cc +++ b/paddle/phi/ops/compat/bce_loss_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature BCELossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("bce_loss_grad", - {"X", "Label", GradVarName("Out")}, - {}, - {GradVarName("X")}); + return KernelSignature( + "bce_loss_grad", {"X", "Label", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc index 570bf7ce943d6..95a867fd3f741 100644 --- a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc +++ b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc @@ -25,12 +25,9 @@ KernelSignature BilinearTensorProductOpArgumentMapping( KernelSignature BilinearTensorProductGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("bilinear_tensor_product_grad", - {"X", "Y", "Weight", GradVarName("Out")}, + {"X", "Y", "Weight", "Out@GRAD"}, {}, - {GradVarName("X"), - GradVarName("Y"), - GradVarName("Weight"), - GradVarName("Bias")}); + {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc index 2c979c4aedcc8..d0fcbb33be2a7 100644 --- a/paddle/phi/ops/compat/broadcast_tensors_sig.cc +++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature BroadcastTensorsGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + "broadcast_tensors_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/cholesky_sig.cc b/paddle/phi/ops/compat/cholesky_sig.cc index 8c7ca75704669..9a26ea5c0c57b 100644 --- a/paddle/phi/ops/compat/cholesky_sig.cc +++ b/paddle/phi/ops/compat/cholesky_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature CholeskyGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("cholesky_grad", - {"Out", GradVarName("Out")}, - {"upper"}, - {GradVarName("X")}); + return KernelSignature( + "cholesky_grad", {"Out", "Out@GRAD"}, {"upper"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/cholesky_solve_sig.cc b/paddle/phi/ops/compat/cholesky_solve_sig.cc index 6a9759f8352a0..2696d80a49f43 100644 --- a/paddle/phi/ops/compat/cholesky_solve_sig.cc +++ b/paddle/phi/ops/compat/cholesky_solve_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature CholeskySolveGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("cholesky_solve_grad", - {"X", "Y", "Out", GradVarName("Out")}, + {"X", "Y", "Out", "Out@GRAD"}, {"upper"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/clip_sig.cc b/paddle/phi/ops/compat/clip_sig.cc index 78fa6c36a5149..25a34f2b9c89f 100644 --- a/paddle/phi/ops/compat/clip_sig.cc +++ b/paddle/phi/ops/compat/clip_sig.cc @@ -18,7 +18,7 @@ namespace phi { 
KernelSignature ClipOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector attr_names; + paddle::SmallVector attr_names; attr_names.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); attr_names.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); if (ctx.IsDenseTensorInput("X")) { @@ -57,27 +57,19 @@ KernelSignature ClipOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ClipGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("Min")) { if (ctx.HasInput("Max")) { - return KernelSignature("clip_grad", - {"X", GradVarName("Out")}, - {"Min", "Max"}, - {GradVarName("X")}); + return KernelSignature( + "clip_grad", {"X", "Out@GRAD"}, {"Min", "Max"}, {"X@GRAD"}); } else { - return KernelSignature("clip_grad", - {"X", GradVarName("Out")}, - {"Min", "max"}, - {GradVarName("X")}); + return KernelSignature( + "clip_grad", {"X", "Out@GRAD"}, {"Min", "max"}, {"X@GRAD"}); } } else { if (ctx.HasInput("Max")) { - return KernelSignature("clip_grad", - {"X", GradVarName("Out")}, - {"min", "Max"}, - {GradVarName("X")}); + return KernelSignature( + "clip_grad", {"X", "Out@GRAD"}, {"min", "Max"}, {"X@GRAD"}); } else { - return KernelSignature("clip_grad", - {"X", GradVarName("Out")}, - {"min", "max"}, - {GradVarName("X")}); + return KernelSignature( + "clip_grad", {"X", "Out@GRAD"}, {"min", "max"}, {"X@GRAD"}); } } } diff --git a/paddle/phi/ops/compat/complex_sig.cc b/paddle/phi/ops/compat/complex_sig.cc index b9f59c97fb50f..88156677d34df 100644 --- a/paddle/phi/ops/compat/complex_sig.cc +++ b/paddle/phi/ops/compat/complex_sig.cc @@ -17,13 +17,11 @@ namespace phi { KernelSignature RealGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "real_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("real_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "imag_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("imag_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/concat_sig.cc b/paddle/phi/ops/compat/concat_sig.cc index d443f521c6146..d53bb5793bc3a 100644 --- a/paddle/phi/ops/compat/concat_sig.cc +++ b/paddle/phi/ops/compat/concat_sig.cc @@ -25,15 +25,11 @@ KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ConcatGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("AxisTensor")) { - return KernelSignature("concat_grad", - {"X", {GradVarName("Out")}}, - {"AxisTensor"}, - {{GradVarName("X")}}); + return KernelSignature( + "concat_grad", {"X", {"Out@GRAD"}}, {"AxisTensor"}, {{"X@GRAD"}}); } - return KernelSignature("concat_grad", - {"X", {GradVarName("Out")}}, - {"axis"}, - {{GradVarName("X")}}); + return KernelSignature( + "concat_grad", {"X", {"Out@GRAD"}}, {"axis"}, {{"X@GRAD"}}); } } // namespace phi diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 7cc0d6ad17535..617c6e289bf2b 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -46,7 +46,7 @@ KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("conv2d_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", @@ -56,7 +56,7 
@@ KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "use_addto", "workspace_size_MB", "exhaustive_search"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } KernelSignature Conv2dDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index b24c08b60c950..c6aae1bf5bb54 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -33,7 +33,7 @@ KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("conv2d_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", @@ -43,7 +43,7 @@ KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "use_addto", "workspace_size_MB", "exhaustive_search"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } KernelSignature Conv3dDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/conv_transpose_sig.cc b/paddle/phi/ops/compat/conv_transpose_sig.cc index 8697168b82747..a040bce6f78ee 100644 --- a/paddle/phi/ops/compat/conv_transpose_sig.cc +++ b/paddle/phi/ops/compat/conv_transpose_sig.cc @@ -34,7 +34,7 @@ KernelSignature Conv2dTransposeOpArgumentMapping( KernelSignature Conv2dTransposeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv2d_transpose_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", @@ -43,7 +43,7 @@ KernelSignature Conv2dTransposeGradOpArgumentMapping( "groups", "dilations", "data_format"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } KernelSignature Conv2dTransposeDoubleGradOpArgumentMapping( @@ -79,7 +79,7 @@ KernelSignature Conv3dTransposeOpArgumentMapping( KernelSignature Conv3dTransposeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv3d_transpose_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", @@ -88,7 +88,7 @@ KernelSignature Conv3dTransposeGradOpArgumentMapping( "groups", "dilations", "data_format"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } KernelSignature DepthwiseConv2dTransposeOpArgumentMapping( @@ -109,7 +109,7 @@ KernelSignature DepthwiseConv2dTransposeOpArgumentMapping( KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("depthwise_conv2d_transpose_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", @@ -118,7 +118,7 @@ KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping( "groups", "dilations", "data_format"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/cross_sig.cc b/paddle/phi/ops/compat/cross_sig.cc index 307c2ac5164b5..2a8a46678cd28 100644 --- a/paddle/phi/ops/compat/cross_sig.cc +++ b/paddle/phi/ops/compat/cross_sig.cc @@ -21,10 +21,8 @@ KernelSignature CrossOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature CrossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return 
KernelSignature("cross_grad", - {"X", "Y", GradVarName("Out")}, - {"dim"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "cross_grad", {"X", "Y", "Out@GRAD"}, {"dim"}, {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc index 01084e764ed9e..ffe0ba75bb9df 100644 --- a/paddle/phi/ops/compat/cumprod_sig.cc +++ b/paddle/phi/ops/compat/cumprod_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature CumprodGradGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("cumprod_grad", - {"X", "Out", GradVarName("Out")}, - {"dim"}, - {GradVarName("X")}); + return KernelSignature( + "cumprod_grad", {"X", "Out", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc index a84a084009087..aa2537aa10e13 100644 --- a/paddle/phi/ops/compat/deformable_conv_sig.cc +++ b/paddle/phi/ops/compat/deformable_conv_sig.cc @@ -33,17 +33,14 @@ KernelSignature DeformableConvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "deformable_conv_grad", - {"Input", "Offset", "Filter", "Mask", GradVarName("Output")}, + {"Input", "Offset", "Filter", "Mask", "Output@GRAD"}, {"strides", "paddings", "dilations", "deformable_groups", "groups", "im2col_step"}, - {GradVarName("Input"), - GradVarName("Offset"), - GradVarName("Filter"), - GradVarName("Mask")}); + {"Input@GRAD", "Offset@GRAD", "Filter@GRAD", "Mask@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc index d2d7451ecafce..1014d45e70a3f 100644 --- a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc +++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc @@ -36,7 +36,7 @@ KernelSignature DepthwiseConv2dOpArgumentMapping( KernelSignature DepthwiseConv2dGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("depthwise_conv2d_grad", - {"Input", "Filter", GradVarName("Output")}, + {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", @@ -47,7 +47,7 @@ KernelSignature DepthwiseConv2dGradOpArgumentMapping( "workspace_size_MB", "exhaustive_search", "fuse_relu_before_depthwise_conv"}, - {GradVarName("Input"), GradVarName("Filter")}); + {"Input@GRAD", "Filter@GRAD"}); } KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc index 7bcd30ec5d79b..ee1d53704c123 100644 --- a/paddle/phi/ops/compat/determinant_sig.cc +++ b/paddle/phi/ops/compat/determinant_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature DeterminantGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("determinant_grad", - {"Input", "Out", GradVarName("Out")}, - {}, - {GradVarName("Input")}); + return KernelSignature( + "determinant_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc index f3245b922c0d9..b232c714c9710 100644 --- a/paddle/phi/ops/compat/diag_sig.cc +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -22,7 +22,7 @@ KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( - "diag_grad", {"X", GradVarName("Out")}, {"offset"}, {GradVarName("X")}); + 
"diag_grad", {"X", "Out@GRAD"}, {"offset"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/diagonal_sig.cc b/paddle/phi/ops/compat/diagonal_sig.cc index b4a424ec06bf2..94cecc3042a54 100644 --- a/paddle/phi/ops/compat/diagonal_sig.cc +++ b/paddle/phi/ops/compat/diagonal_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature DiagonalGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("diagonal_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/ops/compat/digamma_sig.cc index 12ef3056f1e68..6c14dd9bf1744 100644 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ b/paddle/phi/ops/compat/digamma_sig.cc @@ -18,8 +18,7 @@ namespace phi { KernelSignature DigammaGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "digamma_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("digamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/dist_sig.cc b/paddle/phi/ops/compat/dist_sig.cc index 18a30b9b84048..cc702fefbc940 100644 --- a/paddle/phi/ops/compat/dist_sig.cc +++ b/paddle/phi/ops/compat/dist_sig.cc @@ -17,10 +17,8 @@ limitations under the License. */ namespace phi { KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("dist_grad", - {"X", "Y", "Out", GradVarName("Out")}, - {"p"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "dist_grad", {"X", "Y", "Out", "Out@GRAD"}, {"p"}, {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/dot_sig.cc b/paddle/phi/ops/compat/dot_sig.cc index 2437ecc1ca767..2187a7eb4fca0 100644 --- a/paddle/phi/ops/compat/dot_sig.cc +++ b/paddle/phi/ops/compat/dot_sig.cc @@ -17,10 +17,8 @@ limitations under the License. 
*/ namespace phi { KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("dot_grad", - {"X", "Y", GradVarName("Out")}, - {}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "dot_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/dropout_sig.cc b/paddle/phi/ops/compat/dropout_sig.cc index 6bf229c98bd07..712c5cbb0d634 100644 --- a/paddle/phi/ops/compat/dropout_sig.cc +++ b/paddle/phi/ops/compat/dropout_sig.cc @@ -27,9 +27,9 @@ KernelSignature DropoutOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature DropoutGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("dropout_grad", - {"Mask", GradVarName("Out")}, + {"Mask", "Out@GRAD"}, {"dropout_prob", "is_test", "dropout_implementation"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/eigh_sig.cc b/paddle/phi/ops/compat/eigh_sig.cc index e50a9a5a12a56..58718b6e32c66 100644 --- a/paddle/phi/ops/compat/eigh_sig.cc +++ b/paddle/phi/ops/compat/eigh_sig.cc @@ -17,13 +17,11 @@ namespace phi { KernelSignature EighGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("eigh_grad", - {"Eigenvalues", - "Eigenvectors", - GradVarName("Eigenvalues"), - GradVarName("Eigenvectors")}, - {}, - {GradVarName("X")}); + return KernelSignature( + "eigh_grad", + {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}, + {}, + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 0a58d86b05b06..19110eb0e0ab8 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -106,10 +106,8 @@ KernelSignature ElementwisePowOpArgumentMapping( KernelSignature ElementwiseAddGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("add_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "add_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseAddDoubleGradOpArgumentMapping( @@ -128,10 +126,8 @@ KernelSignature ElementwiseAddTripleGradOpArgumentMapping( KernelSignature ElementwiseSubGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("subtract_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "subtract_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( @@ -143,17 +139,15 @@ KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( KernelSignature ElementwiseDivGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("divide_grad", - {"X", "Y", "Out", GradVarName("Out")}, + {"X", "Y", "Out", "Out@GRAD"}, {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseFMinGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("fmin_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "fmin_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( @@ -161,15 +155,13 @@ KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( return 
KernelSignature("divide_double_grad", {"Y", "Out", "DX", "DDX", "DDY"}, {"axis"}, - {GradVarName("Y"), "DOut", "DDOut"}); + {"Y@GRAD", "DOut", "DDOut"}); } KernelSignature ElementwiseMulGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("multiply_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "multiply_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseFMaxOpArgumentMapping( @@ -184,10 +176,8 @@ KernelSignature ElementwiseFMinOpArgumentMapping( KernelSignature ElementwiseFMaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("fmax_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "fmax_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( @@ -195,7 +185,7 @@ KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( return KernelSignature("multiply_double_grad", {"X", "Y", "DOut", "DDX", "DDY"}, {"axis"}, - {GradVarName("X"), GradVarName("Y"), "DDOut"}); + {"X@GRAD", "Y@GRAD", "DDOut"}); } KernelSignature ElementwiseMulTripleGradOpArgumentMapping( @@ -209,25 +199,21 @@ KernelSignature ElementwiseMulTripleGradOpArgumentMapping( KernelSignature ElementwiseMaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("maximum_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "maximum_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwiseMinGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("minimum_grad", - {"X", "Y", GradVarName("Out")}, - {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "minimum_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } KernelSignature ElementwisePowGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("elementwise_pow_grad", - {"X", "Y", GradVarName("Out")}, + {"X", "Y", "Out@GRAD"}, {"axis"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/embedding_sig.cc b/paddle/phi/ops/compat/embedding_sig.cc index b79a381dcecc7..48debcafaf235 100644 --- a/paddle/phi/ops/compat/embedding_sig.cc +++ b/paddle/phi/ops/compat/embedding_sig.cc @@ -30,26 +30,26 @@ KernelSignature EmbeddingGradOpArgumentMapping( if (ctx.IsDenseTensorInput("W")) { if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) { return KernelSignature("embedding_sparse_grad", - {"Ids", "W", GradVarName("Out")}, + {"Ids", "W", "Out@GRAD"}, {"padding_idx"}, - {GradVarName("W")}); + {"W@GRAD"}); } else { return KernelSignature("embedding_grad", - {"Ids", "W", GradVarName("Out")}, + {"Ids", "W", "Out@GRAD"}, {"padding_idx"}, - {GradVarName("W")}); + {"W@GRAD"}); } } else { if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) { return KernelSignature("sparse_weight_embedding_sparse_grad", - {"Ids", "W", GradVarName("Out")}, + {"Ids", "W", "Out@GRAD"}, {"padding_idx"}, - {GradVarName("W")}); + {"W@GRAD"}); } else { return KernelSignature("sparse_weight_embedding_grad", - {"Ids", "W", GradVarName("Out")}, + {"Ids", "W", "Out@GRAD"}, {"padding_idx"}, - {GradVarName("W")}); + {"W@GRAD"}); } } } diff --git a/paddle/phi/ops/compat/erf_sig.cc b/paddle/phi/ops/compat/erf_sig.cc index 
784727a98042d..6cd94e46c3ec3 100644 --- a/paddle/phi/ops/compat/erf_sig.cc +++ b/paddle/phi/ops/compat/erf_sig.cc @@ -17,8 +17,7 @@ namespace phi { KernelSignature ErfGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "erf_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("erf_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/erfinv_sig.cc b/paddle/phi/ops/compat/erfinv_sig.cc index 490573191533f..37d30aaaeb685 100644 --- a/paddle/phi/ops/compat/erfinv_sig.cc +++ b/paddle/phi/ops/compat/erfinv_sig.cc @@ -17,8 +17,7 @@ namespace phi { KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "erfinv_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("erfinv_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/expand_as_sig.cc b/paddle/phi/ops/compat/expand_as_sig.cc index a616b63c10b3c..03b308f4a8b1d 100644 --- a/paddle/phi/ops/compat/expand_as_sig.cc +++ b/paddle/phi/ops/compat/expand_as_sig.cc @@ -22,10 +22,8 @@ KernelSignature ExpandAsOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ExpandAsGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("expand_as_grad", - {"X", GradVarName("Out")}, - {"target_shape"}, - {GradVarName("X")}); + return KernelSignature( + "expand_as_grad", {"X", "Out@GRAD"}, {"target_shape"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/expand_sig.cc b/paddle/phi/ops/compat/expand_sig.cc index 3b2e468267da0..c3df1595a2108 100644 --- a/paddle/phi/ops/compat/expand_sig.cc +++ b/paddle/phi/ops/compat/expand_sig.cc @@ -28,20 +28,14 @@ KernelSignature ExpandOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ExpandGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("Shape")) { - return KernelSignature("expand_grad", - {"X", GradVarName("Out")}, - {"Shape"}, - {GradVarName("X")}); + return KernelSignature( + "expand_grad", {"X", "Out@GRAD"}, {"Shape"}, {"X@GRAD"}); } else if (ctx.InputSize("expand_shapes_tensor") > 0) { - return KernelSignature("expand_grad", - {"X", GradVarName("Out")}, - {"expand_shapes_tensor"}, - {GradVarName("X")}); + return KernelSignature( + "expand_grad", {"X", "Out@GRAD"}, {"expand_shapes_tensor"}, {"X@GRAD"}); } else { - return KernelSignature("expand_grad", - {"X", GradVarName("Out")}, - {"shape"}, - {GradVarName("X")}); + return KernelSignature( + "expand_grad", {"X", "Out@GRAD"}, {"shape"}, {"X@GRAD"}); } } diff --git a/paddle/phi/ops/compat/flatten_sig.cc b/paddle/phi/ops/compat/flatten_sig.cc index 3e8119c38cf51..122e0efa22b7c 100644 --- a/paddle/phi/ops/compat/flatten_sig.cc +++ b/paddle/phi/ops/compat/flatten_sig.cc @@ -31,7 +31,7 @@ KernelSignature FlattenOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature FlattenGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "flatten_grad", {"XShape", GradVarName("Out")}, {}, {GradVarName("X")}); + "flatten_grad", {"XShape", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/frobenius_norm_sig.cc b/paddle/phi/ops/compat/frobenius_norm_sig.cc index 8fddee5edb1d8..1fb53c36cafb2 100644 --- a/paddle/phi/ops/compat/frobenius_norm_sig.cc +++ b/paddle/phi/ops/compat/frobenius_norm_sig.cc @@ -25,9 +25,9 @@ KernelSignature FrobeniusNormOpArgumentMapping( 
KernelSignature FrobeniusNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("frobenius_norm_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/gather_scatter_sig.cc b/paddle/phi/ops/compat/gather_scatter_sig.cc index f71e30f85b09d..a942ebb44086f 100644 --- a/paddle/phi/ops/compat/gather_scatter_sig.cc +++ b/paddle/phi/ops/compat/gather_scatter_sig.cc @@ -17,25 +17,23 @@ namespace phi { KernelSignature GatherNdGradArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("gather_nd_grad", - {"X", "Index", GradVarName("Out")}, - {}, - {GradVarName("X")}); + return KernelSignature( + "gather_nd_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); } KernelSignature ScatterGradArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("scatter_grad", - {"Ids", "Updates", GradVarName("Out")}, + {"Ids", "Updates", "Out@GRAD"}, {"overwrite"}, - {GradVarName("X"), GradVarName("Updates")}); + {"X@GRAD", "Updates@GRAD"}); } KernelSignature ScatterNdAddGradArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("scatter_nd_add_grad", - {"Index", "Updates", GradVarName("Out")}, + {"Index", "Updates", "Out@GRAD"}, {}, - {GradVarName("X"), GradVarName("Updates")}); + {"X@GRAD", "Updates@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc index 6c47bbe48b8ee..af9e50638ce70 100644 --- a/paddle/phi/ops/compat/gather_sig.cc +++ b/paddle/phi/ops/compat/gather_sig.cc @@ -27,14 +27,14 @@ KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("Axis")) { return KernelSignature("gather_grad", - {"X", "Index", GradVarName("Out")}, + {"X", "Index", "Out@GRAD"}, {"Axis", "overwrite"}, - {GradVarName("X")}); + {"X@GRAD"}); } else { return KernelSignature("gather_grad", - {"X", "Index", GradVarName("Out")}, + {"X", "Index", "Out@GRAD"}, {"axis", "overwrite"}, - {GradVarName("X")}); + {"X@GRAD"}); } } diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc index bf4b47bcf5fa9..45a0ecea713f9 100644 --- a/paddle/phi/ops/compat/gelu_sig.cc +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -21,10 +21,8 @@ KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("gelu_grad", - {"X", GradVarName("Out")}, - {"approximate"}, - {GradVarName("X")}); + return KernelSignature( + "gelu_grad", {"X", "Out@GRAD"}, {"approximate"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc index cf36b9baa2d03..9df2cf4d0fe91 100644 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -28,9 +28,9 @@ KernelSignature GraphSendRecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "graph_send_recv_grad", - {"X", "Src_index", "Dst_index", "Out", "Dst_count", GradVarName("Out")}, + {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"pool_type"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc index 
b76a9770d4ded..486d5230ee7a6 100644 --- a/paddle/phi/ops/compat/grid_sampler_sig.cc +++ b/paddle/phi/ops/compat/grid_sampler_sig.cc @@ -27,9 +27,9 @@ KernelSignature GridSamplerOpArgumentMapping( KernelSignature GridSamplerGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("grid_sample_grad", - {"X", "Grid", GradVarName("Output")}, + {"X", "Grid", "Output@GRAD"}, {"mode", "padding_mode", "align_corners"}, - {GradVarName("X"), GradVarName("Grid")}); + {"X@GRAD", "Grid@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/gumbel_softmax_sig.cc b/paddle/phi/ops/compat/gumbel_softmax_sig.cc index c7585a4e5f39a..65537f8c8948a 100644 --- a/paddle/phi/ops/compat/gumbel_softmax_sig.cc +++ b/paddle/phi/ops/compat/gumbel_softmax_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature GumbelSoftmaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("gumbel_softmax_grad", - {"Out", GradVarName("Out")}, - {"axis"}, - {GradVarName("X")}); + return KernelSignature( + "gumbel_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc index 58c190fb657bb..5393439901b91 100644 --- a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -32,44 +32,42 @@ KernelSignature HierarchicalSigmoidOpArgumentMapping( KernelSignature HierarchicalSigmoidGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorOutput(GradVarName("W"))) { - return KernelSignature( - "hierarchical_sigmoid_grad", - {"X", - "W", - "Label", - "PathTable", - "PathCode", - "Bias", - "PreOut", - GradVarName("Out")}, - {"num_classes", - "remote_prefetch", - "trainer_id", - "height_sections", - "epmap", - "table_names", - "is_sparse"}, - {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); - } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) { - return KernelSignature( - "hierarchical_sigmoid_grad_sr", - {"X", - "W", - "Label", - "PathTable", - "PathCode", - "Bias", - "PreOut", - GradVarName("Out")}, - {"num_classes", - "remote_prefetch", - "trainer_id", - "height_sections", - "epmap", - "table_names", - "is_sparse"}, - {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + if (ctx.IsDenseTensorOutput("W@GRAD")) { + return KernelSignature("hierarchical_sigmoid_grad", + {"X", + "W", + "Label", + "PathTable", + "PathCode", + "Bias", + "PreOut", + "Out@GRAD"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"X@GRAD", "W@GRAD", "Bias@GRAD"}); + } else if (ctx.IsSelectedRowsOutput("W@GRAD")) { + return KernelSignature("hierarchical_sigmoid_grad_sr", + {"X", + "W", + "Label", + "PathTable", + "PathCode", + "Bias", + "PreOut", + "Out@GRAD"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else { return KernelSignature("unregistered", {}, {}, {}); } diff --git a/paddle/phi/ops/compat/huber_loss_sig.cc b/paddle/phi/ops/compat/huber_loss_sig.cc index 6f669a4a8b697..b7bf143fd4041 100644 --- a/paddle/phi/ops/compat/huber_loss_sig.cc +++ b/paddle/phi/ops/compat/huber_loss_sig.cc @@ -24,9 +24,9 @@ KernelSignature HuberLossOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature HuberLossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return 
KernelSignature("huber_loss_grad", - {"Residual", GradVarName("Out")}, + {"Residual", "Out@GRAD"}, {"delta"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc index 3b7e3f063d6c1..9c1b7e27f04ec 100644 --- a/paddle/phi/ops/compat/index_sample_sig.cc +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature IndexSampleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("index_sample_grad", - {"X", "Index", GradVarName("Out")}, - {}, - {GradVarName("X")}); + return KernelSignature( + "index_sample_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/index_select_sig.cc b/paddle/phi/ops/compat/index_select_sig.cc index 53eff1bbcd7ed..096ad2332c9ab 100644 --- a/paddle/phi/ops/compat/index_select_sig.cc +++ b/paddle/phi/ops/compat/index_select_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature IndexSelectGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("index_select_grad", - {"X", "Index", GradVarName("Out")}, - {"dim"}, - {GradVarName("X")}); + return KernelSignature( + "index_select_grad", {"X", "Index", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/interpolate_sig.cc b/paddle/phi/ops/compat/interpolate_sig.cc index ba0e971e4ab00..61b0224073060 100644 --- a/paddle/phi/ops/compat/interpolate_sig.cc +++ b/paddle/phi/ops/compat/interpolate_sig.cc @@ -92,81 +92,76 @@ KernelSignature BicubicInterpOpArgumentMapping( KernelSignature BilinearInterpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "bilinear_interp_v2_grad", - {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, - {"data_layout", - "out_d", - "out_h", - "out_w", - "scale", - "interp_method", - "align_corners", - "align_mode"}, - {GradVarName("X")}); + return KernelSignature("bilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"X@GRAD"}); } KernelSignature NearestInterpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "nearest_interp_v2_grad", - {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, - {"data_layout", - "out_d", - "out_h", - "out_w", - "scale", - "interp_method", - "align_corners", - "align_mode"}, - {GradVarName("X")}); + return KernelSignature("nearest_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"X@GRAD"}); } KernelSignature TrilinearInterpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "trilinear_interp_v2_grad", - {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, - {"data_layout", - "out_d", - "out_h", - "out_w", - "scale", - "interp_method", - "align_corners", - "align_mode"}, - {GradVarName("X")}); + return KernelSignature("trilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"X@GRAD"}); } KernelSignature LinearInterpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return 
KernelSignature( - "linear_interp_v2_grad", - {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, - {"data_layout", - "out_d", - "out_h", - "out_w", - "scale", - "interp_method", - "align_corners", - "align_mode"}, - {GradVarName("X")}); + return KernelSignature("linear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"X@GRAD"}); } KernelSignature BicubicInterpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "bicubic_interp_v2_grad", - {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, - {"data_layout", - "out_d", - "out_h", - "out_w", - "scale", - "interp_method", - "align_corners", - "align_mode"}, - {GradVarName("X")}); + return KernelSignature("bicubic_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc index 22d2f074e9f13..8af0edd316487 100644 --- a/paddle/phi/ops/compat/kldiv_loss_sig.cc +++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc @@ -20,9 +20,9 @@ namespace phi { KernelSignature KLDivLossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("kldiv_loss_grad", - {"X", "Target", GradVarName("Loss")}, + {"X", "Target", "Loss@GRAD"}, {"reduction"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/kron_sig.cc b/paddle/phi/ops/compat/kron_sig.cc index 06b6545f58e7c..e2ba41dcadd9d 100644 --- a/paddle/phi/ops/compat/kron_sig.cc +++ b/paddle/phi/ops/compat/kron_sig.cc @@ -17,10 +17,8 @@ namespace phi { KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("kron_grad", - {"X", "Y", GradVarName("Out")}, - {}, - {GradVarName("X"), GradVarName("Y")}); + return KernelSignature( + "kron_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc index 3b1a6a45f9a0a..b04726ec3b3a1 100644 --- a/paddle/phi/ops/compat/kthvalue_sig.cc +++ b/paddle/phi/ops/compat/kthvalue_sig.cc @@ -20,9 +20,9 @@ namespace phi { KernelSignature KthvalueGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("kthvalue_grad", - {"X", "Indices", GradVarName("Out")}, + {"X", "Indices", "Out@GRAD"}, {"k", "axis", "keepdim"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/label_smooth_sig.cc b/paddle/phi/ops/compat/label_smooth_sig.cc index 4fb62a8ca2675..7607af2b61b7c 100644 --- a/paddle/phi/ops/compat/label_smooth_sig.cc +++ b/paddle/phi/ops/compat/label_smooth_sig.cc @@ -24,10 +24,8 @@ KernelSignature LabelSmoothOpArgumentMapping( KernelSignature LabelSmoothGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("label_smooth_grad", - {GradVarName("Out")}, - {"epsilon"}, - {GradVarName("X")}); + return KernelSignature( + "label_smooth_grad", {"Out@GRAD"}, {"epsilon"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc index 4151b9e94fbdc..ab4f9ab817157 100644 --- a/paddle/phi/ops/compat/layer_norm_sig.cc +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ 
-25,11 +25,10 @@ KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature LayerNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "layer_norm_grad", - {"X", "Scale", "Bias", "Mean", "Variance", GradVarName("Y")}, - {"epsilon", "begin_norm_axis", "is_test"}, - {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); + return KernelSignature("layer_norm_grad", + {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}, + {"epsilon", "begin_norm_axis", "is_test"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/lerp_sig.cc b/paddle/phi/ops/compat/lerp_sig.cc index 3a8b23ca4c4a4..154424468d660 100644 --- a/paddle/phi/ops/compat/lerp_sig.cc +++ b/paddle/phi/ops/compat/lerp_sig.cc @@ -22,9 +22,9 @@ KernelSignature LerpOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("lerp_grad", - {"X", "Y", "Weight", "Out", GradVarName("Out")}, + {"X", "Y", "Weight", "Out", "Out@GRAD"}, {}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc index 452ba5e2b45a1..192754cc846dc 100644 --- a/paddle/phi/ops/compat/lgamma_sig.cc +++ b/paddle/phi/ops/compat/lgamma_sig.cc @@ -17,8 +17,7 @@ namespace phi { KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "lgamma_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("lgamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc index c4ae746e975a7..adf40bac000e3 100644 --- a/paddle/phi/ops/compat/log_loss_sig.cc +++ b/paddle/phi/ops/compat/log_loss_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature LogLossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("log_loss_grad", - {"Predicted", "Labels", GradVarName("Loss")}, + {"Predicted", "Labels", "Loss@GRAD"}, {"epsilon"}, - {GradVarName("Predicted")}); + {"Predicted@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc index b1ecc6d56768f..20635c89875f8 100644 --- a/paddle/phi/ops/compat/log_softmax_sig.cc +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature LogSoftmaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("log_softmax_grad", - {"Out", GradVarName("Out")}, - {"axis"}, - {GradVarName("X")}); + return KernelSignature( + "log_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/logsumexp_sig.cc b/paddle/phi/ops/compat/logsumexp_sig.cc index ca7345dbe7049..6d988c71880cb 100644 --- a/paddle/phi/ops/compat/logsumexp_sig.cc +++ b/paddle/phi/ops/compat/logsumexp_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature LogsumexpGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("logsumexp_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"axis", "keepdim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/masked_select_sig.cc b/paddle/phi/ops/compat/masked_select_sig.cc index ec0eb90315bc1..47b4f2fac3155 100644 --- 
a/paddle/phi/ops/compat/masked_select_sig.cc +++ b/paddle/phi/ops/compat/masked_select_sig.cc @@ -23,10 +23,8 @@ KernelSignature MaskedSelectOpArgumentMapping( KernelSignature MaskedSelectGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("masked_select_grad", - {"X", "Mask", GradVarName("Y")}, - {}, - {GradVarName("X")}); + return KernelSignature( + "masked_select_grad", {"X", "Mask", "Y@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/matmul_sig.cc b/paddle/phi/ops/compat/matmul_sig.cc index 771a7c3acc39d..4e125f0dbea27 100644 --- a/paddle/phi/ops/compat/matmul_sig.cc +++ b/paddle/phi/ops/compat/matmul_sig.cc @@ -19,14 +19,14 @@ namespace phi { KernelSignature MatmulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasAttr("use_addto")) { return KernelSignature("addto_matmul_grad", - {"X", "Y", GradVarName("Out")}, + {"X", "Y", "Out@GRAD"}, {"trans_x", "trans_y", "use_addto"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } else { return KernelSignature("matmul_grad", - {"X", "Y", GradVarName("Out")}, + {"X", "Y", "Out@GRAD"}, {"trans_x", "trans_y"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } diff --git a/paddle/phi/ops/compat/matrix_power_sig.cc b/paddle/phi/ops/compat/matrix_power_sig.cc index 4c9ad4e74ab46..00cb1f82b8047 100644 --- a/paddle/phi/ops/compat/matrix_power_sig.cc +++ b/paddle/phi/ops/compat/matrix_power_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature MatrixPowerGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("matrix_power_grad", - {"X", "Out", GradVarName("Out")}, - {"n"}, - {GradVarName("X")}); + return KernelSignature( + "matrix_power_grad", {"X", "Out", "Out@GRAD"}, {"n"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/maxout_sig.cc b/paddle/phi/ops/compat/maxout_sig.cc index d16dd1c8617fe..9e028bc81fbc3 100644 --- a/paddle/phi/ops/compat/maxout_sig.cc +++ b/paddle/phi/ops/compat/maxout_sig.cc @@ -21,10 +21,8 @@ KernelSignature MaxoutArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature MaxoutGradArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("maxout_grad", - {"X", "Out", GradVarName("Out")}, - {"groups", "axis"}, - {GradVarName("X")}); + return KernelSignature( + "maxout_grad", {"X", "Out", "Out@GRAD"}, {"groups", "axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/mean_sig.cc b/paddle/phi/ops/compat/mean_sig.cc index 6decd0da0b086..461d6ab32cec4 100644 --- a/paddle/phi/ops/compat/mean_sig.cc +++ b/paddle/phi/ops/compat/mean_sig.cc @@ -22,8 +22,7 @@ KernelSignature MeanOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature MeanGradOpGradArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "mean_all_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("mean_all_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/meshgrid_sig.cc b/paddle/phi/ops/compat/meshgrid_sig.cc index 44671c84e7afb..f0c8cc7ea6234 100644 --- a/paddle/phi/ops/compat/meshgrid_sig.cc +++ b/paddle/phi/ops/compat/meshgrid_sig.cc @@ -22,8 +22,7 @@ KernelSignature MeshgridOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature MeshgridGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "meshgrid_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return 
KernelSignature("meshgrid_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc index 20994c08aa73c..e21cd69bf60a1 100644 --- a/paddle/phi/ops/compat/mode_sig.cc +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -23,9 +23,9 @@ KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("mode_grad", - {"X", "Indices", GradVarName("Out")}, + {"X", "Indices", "Out@GRAD"}, {"axis", "keepdim"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/mul_sig.cc b/paddle/phi/ops/compat/mul_sig.cc index 8770db1039eb6..4afff4aa1d7a5 100644 --- a/paddle/phi/ops/compat/mul_sig.cc +++ b/paddle/phi/ops/compat/mul_sig.cc @@ -18,9 +18,9 @@ namespace phi { KernelSignature MulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("matmul_with_flatten_grad", - {"X", "Y", GradVarName("Out")}, + {"X", "Y", "Out@GRAD"}, {"x_num_col_dims", "y_num_col_dims"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } KernelSignature MulDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/multi_dot_sig.cc b/paddle/phi/ops/compat/multi_dot_sig.cc index 2e05bd6d1557a..29af82c9d1d16 100644 --- a/paddle/phi/ops/compat/multi_dot_sig.cc +++ b/paddle/phi/ops/compat/multi_dot_sig.cc @@ -18,8 +18,7 @@ namespace phi { KernelSignature MultiDotGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "multi_dot_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("multi_dot_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc index 9dab4655d1723..538b1c13dda58 100644 --- a/paddle/phi/ops/compat/multiplex_sig.cc +++ b/paddle/phi/ops/compat/multiplex_sig.cc @@ -22,8 +22,7 @@ KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature MultiplexGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("multiplex_grad", {"Ids", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/mv_sig.cc b/paddle/phi/ops/compat/mv_sig.cc index 0012f8e1ccb41..e965ddbb72657 100644 --- a/paddle/phi/ops/compat/mv_sig.cc +++ b/paddle/phi/ops/compat/mv_sig.cc @@ -17,10 +17,8 @@ namespace phi { KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("mv_grad", - {"X", "Vec", GradVarName("Out")}, - {}, - {GradVarName("X"), GradVarName("Vec")}); + return KernelSignature( + "mv_grad", {"X", "Vec", "Out@GRAD"}, {}, {"X@GRAD", "Vec@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/nll_loss_sig.cc b/paddle/phi/ops/compat/nll_loss_sig.cc index 87a060ce7a672..f3f9c53178192 100644 --- a/paddle/phi/ops/compat/nll_loss_sig.cc +++ b/paddle/phi/ops/compat/nll_loss_sig.cc @@ -27,11 +27,10 @@ KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature NllLossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "nll_loss_grad", - {"X", "Label", "Weight", "Total_weight", GradVarName("Out")}, - {"ignore_index", "reduction"}, - {GradVarName("X")}); + return KernelSignature("nll_loss_grad", + {"X", "Label", 
"Weight", "Total_weight", "Out@GRAD"}, + {"ignore_index", "reduction"}, + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/norm_sig.cc b/paddle/phi/ops/compat/norm_sig.cc index a74db9b5686c8..b9e56f3d166d4 100644 --- a/paddle/phi/ops/compat/norm_sig.cc +++ b/paddle/phi/ops/compat/norm_sig.cc @@ -23,9 +23,9 @@ KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("norm_grad", - {"X", "Norm", GradVarName("Out")}, + {"X", "Norm", "Out@GRAD"}, {"axis", "epsilon", "is_test"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/p_norm_sig.cc b/paddle/phi/ops/compat/p_norm_sig.cc index d3bff55346c45..82b88aa09ff2f 100644 --- a/paddle/phi/ops/compat/p_norm_sig.cc +++ b/paddle/phi/ops/compat/p_norm_sig.cc @@ -17,9 +17,9 @@ namespace phi { KernelSignature PNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("p_norm_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"porder", "axis", "epsilon", "keepdim", "asvector"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/pad3d_sig.cc b/paddle/phi/ops/compat/pad3d_sig.cc index c43b98fa27e6b..dd8a37d24b75f 100644 --- a/paddle/phi/ops/compat/pad3d_sig.cc +++ b/paddle/phi/ops/compat/pad3d_sig.cc @@ -29,14 +29,14 @@ KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("Paddings")) { return KernelSignature("pad3d_grad", - {"X", GradVarName("Out")}, + {"X", "Out@GRAD"}, {"Paddings", "mode", "value", "data_format"}, - {GradVarName("X")}); + {"X@GRAD"}); } return KernelSignature("pad3d_grad", - {"X", GradVarName("Out")}, + {"X", "Out@GRAD"}, {"paddings", "mode", "value", "data_format"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/pad_sig.cc b/paddle/phi/ops/compat/pad_sig.cc index 4eadbfa98bede..bb870eb256c9e 100644 --- a/paddle/phi/ops/compat/pad_sig.cc +++ b/paddle/phi/ops/compat/pad_sig.cc @@ -18,10 +18,8 @@ namespace phi { KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("pad_grad", - {GradVarName("Out")}, - {"paddings", "pad_value"}, - {GradVarName("X")}); + return KernelSignature( + "pad_grad", {"Out@GRAD"}, {"paddings", "pad_value"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/pixel_shuffle_sig.cc b/paddle/phi/ops/compat/pixel_shuffle_sig.cc index 641288cf12ae2..96cb01a38fc50 100644 --- a/paddle/phi/ops/compat/pixel_shuffle_sig.cc +++ b/paddle/phi/ops/compat/pixel_shuffle_sig.cc @@ -25,9 +25,9 @@ KernelSignature PixelShuffleOpArgumentMapping( KernelSignature PixelShuffleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("pixel_shuffle_grad", - {GradVarName("Out")}, + {"Out@GRAD"}, {"upscale_factor", "data_format"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/poisson_sig.cc b/paddle/phi/ops/compat/poisson_sig.cc index e45640c11b6ee..6022c3b608dfb 100644 --- a/paddle/phi/ops/compat/poisson_sig.cc +++ b/paddle/phi/ops/compat/poisson_sig.cc @@ -18,8 +18,7 @@ namespace phi { KernelSignature PoissonGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "poisson_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + return 
KernelSignature("poisson_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc index 390d3db5e785b..b807b21a1c0b1 100644 --- a/paddle/phi/ops/compat/pool_sig.cc +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -34,7 +34,7 @@ KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("pool2d_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", @@ -45,7 +45,7 @@ KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "global_pooling", "adaptive", "padding_algorithm"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature Pool2dDoubleGradOpArgumentMapping( @@ -78,9 +78,9 @@ KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "max_pool2d_with_index_grad", - {"X", "Mask", GradVarName("Out")}, + {"X", "Mask", "Out@GRAD"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -101,7 +101,7 @@ KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("pool3d_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", @@ -112,7 +112,7 @@ KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "global_pooling", "adaptive", "padding_algorithm"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature MaxPool3dWithIndexOpArgumentMapping( @@ -128,9 +128,9 @@ KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "max_pool3d_with_index_grad", - {"X", "Mask", GradVarName("Out")}, + {"X", "Mask", "Out@GRAD"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc index 43e5f20a92676..6e25e1d9f754b 100644 --- a/paddle/phi/ops/compat/prelu_sig.cc +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -23,9 +23,9 @@ KernelSignature PReluOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("prelu_grad", - {"X", "Alpha", GradVarName("Out")}, + {"X", "Alpha", "Out@GRAD"}, {"data_format", "mode"}, - {GradVarName("X"), GradVarName("Alpha")}); + {"X@GRAD", "Alpha@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/psroi_pool_sig.cc b/paddle/phi/ops/compat/psroi_pool_sig.cc index 4d694d9a7759d..df1dc1113cc18 100644 --- a/paddle/phi/ops/compat/psroi_pool_sig.cc +++ b/paddle/phi/ops/compat/psroi_pool_sig.cc @@ -28,9 +28,9 @@ KernelSignature PsroiPoolGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "psroi_pool_grad", - {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc index 5f8dc1cf4cd71..83f0e5f65a0c5 100644 --- 
a/paddle/phi/ops/compat/put_along_axis_sig.cc +++ b/paddle/phi/ops/compat/put_along_axis_sig.cc @@ -26,9 +26,9 @@ KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature PutAlongAxisGradArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("put_along_axis_grad", - {"Input", "Index", GradVarName("Result")}, + {"Input", "Index", "Result@GRAD"}, {"Axis", "Reduce"}, - {GradVarName("Input"), GradVarName("Value")}); + {"Input@GRAD", "Value@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index cf2edf9f20fc2..a0ba07f5e8e2c 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -130,41 +130,41 @@ KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceSumGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("sum_grad", - {"X", GradVarName("Out")}, + {"X", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature ReduceMeanGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("mean_grad", - {"X", GradVarName("Out")}, + {"X", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature ReduceMaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("max_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature ReduceMinGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("min_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } KernelSignature ReduceProdGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("prod_grad", - {"X", "Out", GradVarName("Out")}, + {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/reshape_sig.cc b/paddle/phi/ops/compat/reshape_sig.cc index 04f64e4035273..a01f2a98c9bdf 100644 --- a/paddle/phi/ops/compat/reshape_sig.cc +++ b/paddle/phi/ops/compat/reshape_sig.cc @@ -41,8 +41,7 @@ KernelSignature ReshapeOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReshapeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "reshape_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("reshape_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } KernelSignature ReshapeDoubleGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/rnn_sig.cc b/paddle/phi/ops/compat/rnn_sig.cc index 352510d5b2e6e..87c99ac13aa07 100644 --- a/paddle/phi/ops/compat/rnn_sig.cc +++ b/paddle/phi/ops/compat/rnn_sig.cc @@ -39,8 +39,8 @@ KernelSignature RnnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "Out", "DropoutState", "Reserve", - GradVarName("Out"), - GradVarName("State")}, + "Out@GRAD", + "State@GRAD"}, {"dropout_prob", "is_bidirec", "input_size", @@ -49,9 +49,7 @@ KernelSignature RnnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "mode", "seed", "is_test"}, - {GradVarName("Input"), - GradVarName("PreState"), - GradVarName("WeightList")}); + {"Input@GRAD", "PreState@GRAD", "WeightList@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/roi_align_sig.cc 
b/paddle/phi/ops/compat/roi_align_sig.cc index 1717ec8f78809..7279e82139bbc 100644 --- a/paddle/phi/ops/compat/roi_align_sig.cc +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -30,13 +30,13 @@ KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature RoiAlignGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("roi_align_grad", - {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale", "sampling_ratio", "aligned"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc index d04c645f183c6..971b4b9d5bf32 100644 --- a/paddle/phi/ops/compat/roi_pool_sig.cc +++ b/paddle/phi/ops/compat/roi_pool_sig.cc @@ -26,9 +26,9 @@ KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature RoiPoolOpGradArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("roi_pool_grad", - {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")}, + {"X", "ROIs", "RoisNum", "Argmax", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc index a144f0e8e8a90..e6817555bc4b9 100644 --- a/paddle/phi/ops/compat/roll_sig.cc +++ b/paddle/phi/ops/compat/roll_sig.cc @@ -24,10 +24,8 @@ KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("roll_grad", - {"X", GradVarName("Out")}, - {"shifts", "axis"}, - {GradVarName("X")}); + return KernelSignature( + "roll_grad", {"X", "Out@GRAD"}, {"shifts", "axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/segment_pool_sig.cc b/paddle/phi/ops/compat/segment_pool_sig.cc index 97646a2ac31d3..db07343f9ad84 100644 --- a/paddle/phi/ops/compat/segment_pool_sig.cc +++ b/paddle/phi/ops/compat/segment_pool_sig.cc @@ -18,13 +18,12 @@ namespace phi { KernelSignature SegmentPoolGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "segment_pool_grad", - { - "X", "SegmentIds", "Out", "SummedIds", GradVarName("Out"), - }, - {"pooltype"}, - {GradVarName("X")}); + return KernelSignature("segment_pool_grad", + { + "X", "SegmentIds", "Out", "SummedIds", "Out@GRAD", + }, + {"pooltype"}, + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/selu_sig.cc b/paddle/phi/ops/compat/selu_sig.cc index 23f5cc34515b4..08087584a1094 100644 --- a/paddle/phi/ops/compat/selu_sig.cc +++ b/paddle/phi/ops/compat/selu_sig.cc @@ -19,10 +19,8 @@ namespace phi { KernelSignature SeluGradGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("selu_grad", - {"Out", GradVarName("Out")}, - {"scale", "alpha"}, - {GradVarName("X")}); + return KernelSignature( + "selu_grad", {"Out", "Out@GRAD"}, {"scale", "alpha"}, {"X@GRAD"}); } } // namespace phi PD_REGISTER_ARG_MAPPING_FN(selu_grad, phi::SeluGradGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index 5feff54b028ba..6ff94a6e263f4 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -737,96 +737,89 @@ KernelSignature SetValueGradOpArgumentMapping( if (ctx.InputSize("StartsTensorList") > 0) { if 
(ctx.InputSize("EndsTensorList") > 0) { if (ctx.InputSize("StepsTensorList") > 0) { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"StartsTensorList", - "EndsTensorList", - "StepsTensorList", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } else { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"StartsTensorList", - "EndsTensorList", - "steps", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } } else { if (ctx.InputSize("StepsTensorList") > 0) { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"StartsTensorList", - "ends", - "StepsTensorList", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } else { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"StartsTensorList", - "ends", - "steps", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } } } else { if (ctx.InputSize("EndsTensorList") > 0) { if (ctx.InputSize("StepsTensorList") > 0) { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"starts", - "EndsTensorList", - "StepsTensorList", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } else { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"starts", - "EndsTensorList", - "steps", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } } else { if (ctx.InputSize("StepsTensorList") > 0) { - return KernelSignature( - "set_value_grad", - {GradVarName("Out")}, - {"starts", - "ends", - "StepsTensorList", - "axes", - "decrease_axes", - "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + return KernelSignature("set_value_grad", + {"Out@GRAD"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Input@GRAD", "ValueTensor@GRAD"}); } else { return KernelSignature( "set_value_grad", - {GradVarName("Out")}, + {"Out@GRAD"}, {"starts", "ends", "steps", "axes", "decrease_axes", "none_axes"}, - {GradVarName("Input"), GradVarName("ValueTensor")}); + {"Input@GRAD", "ValueTensor@GRAD"}); } } } diff --git 
a/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc index 61ad9627a9612..795e287d53deb 100644 --- a/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc +++ b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("sigmoid_cross_entropy_with_logits_grad", - {"X", "Label", GradVarName("Out")}, + {"X", "Label", "Out@GRAD"}, {"normalize", "ignore_index"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/slice_sig.cc b/paddle/phi/ops/compat/slice_sig.cc index ba3bafdaa51c7..607d0b31310b6 100644 --- a/paddle/phi/ops/compat/slice_sig.cc +++ b/paddle/phi/ops/compat/slice_sig.cc @@ -105,74 +105,74 @@ KernelSignature SliceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("StartsTensor")) { if (ctx.HasInput("EndsTensor")) { return KernelSignature("slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensor", "EndsTensor", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else if (ctx.InputSize("EndsTensorList") > 0) { return KernelSignature("slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensor", "EndsTensorList", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else { return KernelSignature( "slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensor", "ends", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } else if (ctx.InputSize("StartsTensorList") > 0) { if (ctx.HasInput("EndsTensor")) { return KernelSignature("slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensorList", "EndsTensor", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else if (ctx.InputSize("EndsTensorList") > 0) { return KernelSignature("slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensorList", "EndsTensorList", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else { return KernelSignature( "slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "StartsTensorList", "ends", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } else { if (ctx.HasInput("EndsTensor")) { return KernelSignature( "slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "starts", "EndsTensor", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else if (ctx.InputSize("EndsTensorList") > 0) { return KernelSignature( "slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "starts", "EndsTensorList", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } else { return KernelSignature( "slice_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"axes", "starts", "ends", "infer_flags", "decrease_axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } } diff --git a/paddle/phi/ops/compat/softmax_sig.cc b/paddle/phi/ops/compat/softmax_sig.cc index 65a915b51d08a..a30a2a2b06fd5 100644 --- a/paddle/phi/ops/compat/softmax_sig.cc +++ b/paddle/phi/ops/compat/softmax_sig.cc @@ -22,10 +22,8 @@ KernelSignature 
SoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature SoftmaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("softmax_grad", - {"Out", GradVarName("Out")}, - {"axis"}, - {GradVarName("X")}); + return KernelSignature( + "softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc b/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc index 9cfc5ded90a49..c75d4f711dc0f 100644 --- a/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc +++ b/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc @@ -31,13 +31,13 @@ KernelSignature SoftmaxWithCrossEntropyOpArgumentMapping( KernelSignature SoftmaxWithCrossEntropyGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("cross_entropy_with_softmax_grad", - {"Label", "Softmax", GradVarName("Loss")}, + {"Label", "Softmax", "Loss@GRAD"}, {"soft_label", "use_softmax", "numeric_stable_mode", "ignore_index", "axis"}, - {GradVarName("Logits")}); + {"Logits@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc index 276246533e89e..c65d77df9808e 100644 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ b/paddle/phi/ops/compat/squeeze_sig.cc @@ -23,10 +23,8 @@ KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature SqueezeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("squeeze_grad", - {"XShape", GradVarName("Out")}, - {"axes"}, - {GradVarName("X")}); + return KernelSignature( + "squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/stack_sig.cc b/paddle/phi/ops/compat/stack_sig.cc index 97768eb89026e..334fdb29e5f25 100644 --- a/paddle/phi/ops/compat/stack_sig.cc +++ b/paddle/phi/ops/compat/stack_sig.cc @@ -14,8 +14,7 @@ limitations under the License. */ namespace phi { KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); + return KernelSignature("stack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/strided_slice_sig.cc b/paddle/phi/ops/compat/strided_slice_sig.cc index 9fb70af0dea51..5421fcd616ce7 100644 --- a/paddle/phi/ops/compat/strided_slice_sig.cc +++ b/paddle/phi/ops/compat/strided_slice_sig.cc @@ -29,35 +29,35 @@ KernelSignature StridedSliceOpArgumentMapping( bool use_attr_ends = !ctx.IsRuntime() && !ends.empty(); bool use_attr_strides = !ctx.IsRuntime() && !strides.empty(); - std::string starts_key = + const char* starts_key = ctx.HasInput("StartsTensor") ? "StartsTensor" : (ctx.InputSize("StartsTensorList") > 0 ? (use_attr_starts ? "starts" : "StartsTensorList") : "starts"); - std::string ends_key = + const char* ends_key = ctx.HasInput("EndsTensor") ? "EndsTensor" : (ctx.InputSize("EndsTensorList") > 0 ? (use_attr_ends ? "ends" : "EndsTensorList") : "ends"); - std::string strides_key = + const char* strides_key = ctx.HasInput("StridesTensor") ? "StridesTensor" : (ctx.InputSize("StridesTensorList") > 0 ? (use_attr_strides ? 
"strides" : "StridesTensorList") : "strides"); - paddle::SmallVector inputs = {"Input"}; - paddle::SmallVector attrs = {"axes", + paddle::SmallVector inputs = {"Input"}; + paddle::SmallVector attrs = {"axes", starts_key, ends_key, strides_key, "infer_flags", "decrease_axis"}; - paddle::SmallVector outputs = {"Out"}; + paddle::SmallVector outputs = {"Out"}; - std::string kernel_name; + const char* kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { kernel_name = "strided_slice_array"; } else { @@ -78,35 +78,35 @@ KernelSignature StridedSliceGradOpArgumentMapping( bool use_attr_ends = !ctx.IsRuntime() && !ends.empty(); bool use_attr_strides = !ctx.IsRuntime() && !strides.empty(); - std::string starts_key = + const char* starts_key = ctx.HasInput("StartsTensor") ? "StartsTensor" : (ctx.InputSize("StartsTensorList") > 0 ? (use_attr_starts ? "starts" : "StartsTensorList") : "starts"); - std::string ends_key = + const char* ends_key = ctx.HasInput("EndsTensor") ? "EndsTensor" : (ctx.InputSize("EndsTensorList") > 0 ? (use_attr_ends ? "ends" : "EndsTensorList") : "ends"); - std::string strides_key = + const char* strides_key = ctx.HasInput("StridesTensor") ? "StridesTensor" : (ctx.InputSize("StridesTensorList") > 0 ? (use_attr_strides ? "strides" : "StridesTensorList") : "strides"); - paddle::SmallVector inputs = {"Input", GradVarName("Out")}; - paddle::SmallVector attrs = {"axes", + paddle::SmallVector inputs = {"Input", "Out@GRAD"}; + paddle::SmallVector attrs = {"axes", starts_key, ends_key, strides_key, "infer_flags", "decrease_axis"}; - paddle::SmallVector outputs = {GradVarName("Input")}; + paddle::SmallVector outputs = {"Input@GRAD"}; - std::string kernel_name; + const char* kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { kernel_name = "strided_slice_array_grad"; } else { diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc index 27a996a270ddf..a35c1c2db4480 100644 --- a/paddle/phi/ops/compat/take_along_axis_sig.cc +++ b/paddle/phi/ops/compat/take_along_axis_sig.cc @@ -25,9 +25,9 @@ KernelSignature TakeAlongAxisArgumentMapping( KernelSignature TakeAlongAxisGradArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("take_along_axis_grad", - {"Input", "Index", GradVarName("Result")}, + {"Input", "Index", "Result@GRAD"}, {"Axis"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/temporal_shift_sig.cc b/paddle/phi/ops/compat/temporal_shift_sig.cc index a686c37ff7e65..a6eed22716ca7 100644 --- a/paddle/phi/ops/compat/temporal_shift_sig.cc +++ b/paddle/phi/ops/compat/temporal_shift_sig.cc @@ -27,9 +27,9 @@ KernelSignature TemporalShiftOpArgumentMapping( KernelSignature TemporalShiftGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("temporal_shift_grad", - {GradVarName("Out")}, + {"Out@GRAD"}, {"seg_num", "shift_ratio", "data_format"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc index ca3fa5fe1f86a..be401e40c4974 100644 --- a/paddle/phi/ops/compat/tile_sig.cc +++ b/paddle/phi/ops/compat/tile_sig.cc @@ -33,20 +33,14 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TileGradOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("RepeatTimes")) { - return KernelSignature("tile_grad", - {"X", GradVarName("Out")}, - {"RepeatTimes"}, - {GradVarName("X")}); + return 
KernelSignature( + "tile_grad", {"X", "Out@GRAD"}, {"RepeatTimes"}, {"X@GRAD"}); } else if (ctx.InputSize("repeat_times_tensor") > 0) { - return KernelSignature("tile_grad", - {"X", GradVarName("Out")}, - {"repeat_times_tensor"}, - {GradVarName("X")}); + return KernelSignature( + "tile_grad", {"X", "Out@GRAD"}, {"repeat_times_tensor"}, {"X@GRAD"}); } else { - return KernelSignature("tile_grad", - {"X", GradVarName("Out")}, - {"repeat_times"}, - {GradVarName("X")}); + return KernelSignature( + "tile_grad", {"X", "Out@GRAD"}, {"repeat_times"}, {"X@GRAD"}); } } diff --git a/paddle/phi/ops/compat/top_k_sig.cc b/paddle/phi/ops/compat/top_k_sig.cc index 8488a18e34ce1..c1073f9efdc6b 100644 --- a/paddle/phi/ops/compat/top_k_sig.cc +++ b/paddle/phi/ops/compat/top_k_sig.cc @@ -29,9 +29,9 @@ KernelSignature TopkOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TopkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("top_k_grad", - {"X", "Indices", GradVarName("Out")}, + {"X", "Indices", "Out@GRAD"}, {"k", "axis", "largest", "sorted"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/trace_sig.cc b/paddle/phi/ops/compat/trace_sig.cc index c3f5d6d287551..2cb7d9a80bce5 100644 --- a/paddle/phi/ops/compat/trace_sig.cc +++ b/paddle/phi/ops/compat/trace_sig.cc @@ -23,9 +23,9 @@ KernelSignature TraceOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("trace_grad", - {"Input", GradVarName("Out")}, + {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, - {GradVarName("Input")}); + {"Input@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/phi/ops/compat/transpose_sig.cc index 90961760cfc66..0f2a3108ec9e6 100644 --- a/paddle/phi/ops/compat/transpose_sig.cc +++ b/paddle/phi/ops/compat/transpose_sig.cc @@ -22,8 +22,7 @@ KernelSignature TransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TransposeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "transpose_grad", {GradVarName("Out")}, {"axis"}, {GradVarName("X")}); + return KernelSignature("transpose_grad", {"Out@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/triangular_solve_sig.cc b/paddle/phi/ops/compat/triangular_solve_sig.cc index c56af3e21e53e..851db32a032d6 100644 --- a/paddle/phi/ops/compat/triangular_solve_sig.cc +++ b/paddle/phi/ops/compat/triangular_solve_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature TriangularSolveGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("triangular_solve_grad", - {"X", "Y", "Out", GradVarName("Out")}, + {"X", "Y", "Out", "Out@GRAD"}, {"upper", "transpose", "unitriangular"}, - {GradVarName("X"), GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc index 4f79f8650decf..3c5fa15b41cae 100644 --- a/paddle/phi/ops/compat/tril_triu_sig.cc +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -22,10 +22,8 @@ KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TrilTriuGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("tril_triu_grad", - {GradVarName("Out")}, - {"diagonal", "lower"}, - {GradVarName("X")}); + return KernelSignature( + "tril_triu_grad", {"Out@GRAD"}, 
{"diagonal", "lower"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/trunc_sig.cc b/paddle/phi/ops/compat/trunc_sig.cc index 2d35439216da5..7b6a7771fbe89 100644 --- a/paddle/phi/ops/compat/trunc_sig.cc +++ b/paddle/phi/ops/compat/trunc_sig.cc @@ -21,8 +21,7 @@ KernelSignature TruncOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature TruncGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "trunc_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); + return KernelSignature("trunc_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/unfold_sig.cc b/paddle/phi/ops/compat/unfold_sig.cc index ddc3b1813cbef..45415616f2969 100644 --- a/paddle/phi/ops/compat/unfold_sig.cc +++ b/paddle/phi/ops/compat/unfold_sig.cc @@ -18,9 +18,9 @@ namespace phi { KernelSignature UnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unfold_grad", - {"X", GradVarName("Y")}, + {"X", "Y@GRAD"}, {"kernel_sizes", "strides", "paddings", "dilations"}, - {GradVarName("X")}); + {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc index 20cd9701e83e5..c802c2684b282 100644 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ b/paddle/phi/ops/compat/unsqueeze_sig.cc @@ -35,7 +35,7 @@ KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature UnsqueezeGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "unsqueeze_grad", {"XShape", GradVarName("Out")}, {}, {GradVarName("X")}); + "unsqueeze_grad", {"XShape", "Out@GRAD"}, {}, {"X@GRAD"}); } } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze); diff --git a/paddle/phi/ops/compat/unstack_sig.cc b/paddle/phi/ops/compat/unstack_sig.cc index 41d7fc120a9ef..d03499f94b6be 100644 --- a/paddle/phi/ops/compat/unstack_sig.cc +++ b/paddle/phi/ops/compat/unstack_sig.cc @@ -15,8 +15,7 @@ namespace phi { KernelSignature UnStackGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); + return KernelSignature("unstack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/warpctc_sig.cc b/paddle/phi/ops/compat/warpctc_sig.cc index 75f440de7f2db..ac3dc366ad8c6 100644 --- a/paddle/phi/ops/compat/warpctc_sig.cc +++ b/paddle/phi/ops/compat/warpctc_sig.cc @@ -25,11 +25,10 @@ KernelSignature WarpctcOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature WarpctcGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature( - "warpctc_grad", - {"WarpCTCGrad", "Logits", GradVarName("Loss"), "LogitsLength"}, - {"blank", "norm_by_times"}, - {GradVarName("Logits")}); + return KernelSignature("warpctc_grad", + {"WarpCTCGrad", "Logits", "Loss@GRAD", "LogitsLength"}, + {"blank", "norm_by_times"}, + {"Logits@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/where_grad_sig.cc b/paddle/phi/ops/compat/where_grad_sig.cc index 71984a26d35af..e0c380672c895 100644 --- a/paddle/phi/ops/compat/where_grad_sig.cc +++ b/paddle/phi/ops/compat/where_grad_sig.cc @@ -18,9 +18,9 @@ namespace phi { KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("where_grad", - {"Condition", "X", "Y", GradVarName("Out")}, + {"Condition", "X", "Y", "Out@GRAD"}, {}, - {GradVarName("X"), 
GradVarName("Y")}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/yolov3_loss_sig.cc b/paddle/phi/ops/compat/yolov3_loss_sig.cc index bbdadfa93ba96..8d5d82a9e72e3 100644 --- a/paddle/phi/ops/compat/yolov3_loss_sig.cc +++ b/paddle/phi/ops/compat/yolov3_loss_sig.cc @@ -31,25 +31,23 @@ KernelSignature Yolov3LossOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Yolov3LossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("yolov3_loss_grad", - {"X", - "GTBox", - "GTLabel", - "GTScore", - GradVarName("Loss"), - "ObjectnessMask", - "GTMatchMask"}, - {"anchors", - "anchor_mask", - "class_num", - "ignore_thresh", - "downsample_ratio", - "use_label_smooth", - "scale_x_y"}, - {GradVarName("X"), - GradVarName("GTBox"), - GradVarName("GTLabel"), - GradVarName("GTScore")}); + return KernelSignature( + "yolov3_loss_grad", + {"X", + "GTBox", + "GTLabel", + "GTScore", + "Loss@GRAD", + "ObjectnessMask", + "GTMatchMask"}, + {"anchors", + "anchor_mask", + "class_num", + "ignore_thresh", + "downsample_ratio", + "use_label_smooth", + "scale_x_y"}, + {"X@GRAD", "GTBox@GRAD", "GTLabel@GRAD", "GTScore@GRAD"}); } } // namespace phi diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index c90e2f3dbcded..028b9d23352c7 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -46,9 +46,9 @@ TEST(MetaFnFactory, InferMetaFnExists) { phi::MetaTensor meta_out(&dense_out1); phi::UnchangedInferMeta(meta_x, &meta_out); - auto shared_meat_x = std::make_shared(&dense_x); + auto shared_meat_x = phi::MetaTensor(&dense_x); phi::DenseTensor dense_out2; - auto shared_meta_out = std::make_shared(&dense_out2); + auto shared_meta_out = phi::MetaTensor(&dense_out2); phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); ctx.EmplaceBackOutput(shared_meta_out); @@ -69,9 +69,9 @@ TEST(MetaFnFactory, CopyInferMetaFn) { phi::MetaTensor meta_out(&dense_out1); phi::UnchangedInferMeta(meta_x, &meta_out); - auto shared_meat_x = std::make_shared(&dense_x); + auto shared_meat_x = phi::MetaTensor(&dense_x); phi::DenseTensor dense_out2; - auto shared_meta_out = std::make_shared(&dense_out2); + auto shared_meta_out = phi::MetaTensor(&dense_out2); phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); @@ -90,13 +90,13 @@ TEST(MetaFnFactory, SplitInferMetaFn) { phi::DenseTensor dense_x; dense_x.Resize({4, 10}); phi::MetaTensor meta_x(&dense_x); - auto shared_meat_x = std::make_shared(&dense_x); + auto shared_meat_x = phi::MetaTensor(&dense_x); phi::DenseTensor dense_out1; phi::DenseTensor dense_out2; - paddle::SmallVector> out; - out.push_back(std::make_shared(&dense_out1)); - out.push_back(std::make_shared(&dense_out2)); + paddle::SmallVector out; + out.emplace_back(phi::MetaTensor(&dense_out1)); + out.emplace_back(phi::MetaTensor(&dense_out2)); phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 0cc68bf31617c..2c977e923b5b1 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,5 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils) endif() diff --git a/paddle/testing/paddle_gtest_main.cc 
b/paddle/testing/paddle_gtest_main.cc index 0fb5412ff051f..bb919f0e9110c 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/flags.h" @@ -85,6 +86,7 @@ int main(int argc, char** argv) { ::GFLAGS_NAMESPACE::ParseCommandLineFlags( &new_argc, &new_argv_address, false); paddle::framework::InitDevices(); + paddle::framework::InitDefaultKernelSignatureMap(); int ret = RUN_ALL_TESTS(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index fb9e8d8ece100..13b964274fde2 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -212,6 +212,7 @@ def remove_flag_if_exists(name): core.init_glog(sys.argv[0]) # don't init_p2p when in unittest to save time. core.init_devices() + core.init_default_kernel_signatures() # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py. diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 9aa3fc9eafe33..a6bd0a10cb1fa 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -474,7 +474,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: param] == "const std::vector&": meta_tensor_code = meta_tensor_code + f""" {code_indent} auto {param}_meta_vec = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); -{code_indent} std::vector {param}_metas({param}_meta_vec.size()); +{code_indent} std::vector {param}_metas({param}_meta_vec.size()); {code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ {code_indent} {param}_metas[i] = &{param}_meta_vec[i]; {code_indent} }} From 0ef3ef28e260278da2ebe4faf5382a43a8ef7389 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Sun, 17 Apr 2022 22:00:12 +0800 Subject: [PATCH 198/211] XPUPS Adaptation (#40991) * Adapt XPUPS - 1st version - 3.24 * Adapt XPUPS - update XPU PushSparse - 2nd version - 3.24 * Adapt XPUPS - add XPU PullSparseOp - 3nd version - 3.25 * refactor heter comm kernel * update. test=develop * Adapt XPUPS - modify by compilation - 4th version - 3.27 * update calc_shard_offset. test=develop * update xpu kernel. test=develop * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * heter_comm update * heter_comm update * update calc_shard_offset. test=develop * heter_comm update * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * fix. test=develop * update. test=develop * update. test=develop * update optimizer kernel * Adapt XPUPS - use WITH_XPU_KP and modify wrapper kernel function - 5th version - 3.30 * update. test=develop * update pslib.cmake * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * Adapt XPUPS - modify by kp compilation - 6th version - 3.30 * update. test=develop * update. test=develop * update. test=develop * update optimizer kernel * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * fix. test=develop * fix. test=develop * used by minxu * update heter_comm_inl * fix. 
test=develop * Adapt XPUPS - modify by kp compilation - 7th version - 3.30 * fix. test=develop * add optimizer kernel. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 3.31 update * Adapt XPUPS - update kp compilation path - 8th version - 3.31 * add optimizer kernel. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm.h 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update hashtable. test=develop * update. test=develop * Adapt XPUPS - update by kp compilation - 9th version - 4.1 * update hashtable. test=develop * fix. test=develop * update hashtable 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 10th version - 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * update. test=develop * modify by compilation 4.1 * update. test=develop * update. test=develop * fix. test=develop * modify by compilation 4.1 * update. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 19:30 * fix. test=develop * update ps_gpu_wrapper.kps 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 11th version - 4.1 * fix. test=develop * Adapt XPUPS - update by kp compilation - 12nd version - 4.2 * fix. test=develop * fix. test=develop * modify by compilation 4.2 * 4.2 update * fix. test=develop * template init. test=develop * update 4.6 * fix. test=develop * template init. test=develop * 4.6 modify by compilation * hashtable template init. test=develop * hashtable template init. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 13nd version - 4.7 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.11 update * fix. test=develop * fix. test=develop * 4.11 update * update by pre-commit * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.12 update * fix. 
test=develop * Adapt XPUPS - update by kp compilation - 14th version - 4.13 * 4.13 update * 4.14 update * 4.14 update * 4.14 update * 4.14 modify by merged latest compilation * retry CI 4.14 * 4.15 pass static check * 4.15 modify by gpups CI * 3.16 update by gpups CI - modify ps_gpu_wrapper.h * 4.16 update * 4.16 pass xpu compile * 4.16 retry CI * 4.16 update Co-authored-by: zmxdream --- paddle/fluid/framework/fleet/CMakeLists.txt | 8 +- .../framework/fleet/heter_ps/CMakeLists.txt | 8 + .../fleet/heter_ps/hashtable_kernel.kps | 23 +- .../framework/fleet/heter_ps/heter_comm.h | 4 + .../framework/fleet/heter_ps/heter_comm_inl.h | 3 - .../fleet/heter_ps/heter_comm_kernel.h | 3 + .../fleet/heter_ps/heter_comm_kernel.kps | 29 +- .../framework/fleet/heter_ps/heter_ps.cc | 61 ++++ .../fleet/heter_ps/heter_resource.cc | 2 + .../framework/fleet/heter_ps/heter_resource.h | 3 + .../fluid/framework/fleet/ps_gpu_wrapper.cc | 97 ++++- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 2 + paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 29 +- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 339 ++++++++++++++++++ paddle/fluid/framework/ps_gpu_trainer.cc | 7 + paddle/fluid/framework/ps_gpu_worker.cc | 2 + paddle/fluid/operators/pull_box_sparse_op.cc | 6 +- paddle/fluid/operators/pull_box_sparse_op.cu | 7 +- paddle/fluid/operators/pull_box_sparse_op.h | 16 + paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 3 + 20 files changed, 613 insertions(+), 39 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_ps.cc mode change 100755 => 100644 paddle/fluid/framework/fleet/ps_gpu_wrapper.h create mode 100644 paddle/fluid/framework/fleet/ps_gpu_wrapper.kps diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c3304e3f9021d..2e9104f40cc60 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -12,15 +12,19 @@ else() endif(WITH_PSLIB) if(WITH_HETERPS) - if(WITH_NCCL) + if(WITH_NCCL AND WITH_GPU) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) + elseif(WITH_XPU_KP) + xpu_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) - endif(WITH_NCCL) + endif() else() cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) endif(WITH_HETERPS) diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index e90d864fa1ab7..8c313694b7e60 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -24,6 +24,14 @@ IF(WITH_GPU) endif() ENDIF() +IF(WITH_XPU_KP) + SET(HETERPS_DEPS device_context) + xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps feature_value.h) + xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps) + cc_library(heter_comm SRCS heter_comm.h heter_resource.cc DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + cc_library(heter_ps SRCS heter_ps.cc DEPS heter_comm) + # xpu_library(heter_comm SRCS heter_comm.h heter_comm_kernel.kps feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) +ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h 
feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index 55edf883271b9..e879d817b14dd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -48,7 +48,7 @@ __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT GM2LM(optimizer_config::learning_rate, &local_learning_rate, sizeof(float)); GM2LM(optimizer_config::initial_g2sum, &local_initial_g2sum, sizeof(float)); GM2LM(optimizer_config::min_bound, &local_min_bound, sizeof(float)); - GM2LM(optimizr_config::max_bound, &local_max_bound, sizeof(float)); + GM2LM(optimizer_config::max_bound, &local_max_bound, sizeof(float)); double add_g2sum = 0; double ratio = local_learning_rate * @@ -136,7 +136,7 @@ __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT template __global__ void insert_kernel(Table* table, const KeyType* const keys, - const ValType* const vals, size_t len) { + const ValType* const vals, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -164,7 +164,7 @@ __global__ void insert_kernel(Table* table, const KeyType* const keys, template __global__ void search_kernel(Table* table, const KeyType* const keys, - ValType* const vals, size_t len) { + ValType* const vals, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -194,7 +194,7 @@ __global__ void search_kernel(Table* table, const KeyType* const keys, template __global__ void update_kernel(Table* table, const KeyType* const keys, - const GradType* const grads, size_t len) { + const GradType* const grads, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -251,7 +251,10 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, if (len == 0) { return; } - search_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); + long long c_len = (long long)len; + search_kernel><<<4, 64, stream>>>( + container_, d_keys, d_vals, c_len); } template @@ -272,7 +275,10 @@ void HashTable::insert(const KeyType* d_keys, if (len == 0) { return; } - insert_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); + long long c_len = (long long)len; + insert_kernel><<<4, 64, stream>>>( + container_, d_keys, d_vals, c_len); } template @@ -289,7 +295,10 @@ void HashTable::update(const KeyType* d_keys, if (len == 0) { return; } - update_kernel<<<4, 64, stream>>>(container_, d_keys, d_grads, len); + long long c_len = (long long)len; + update_kernel, + GradType><<<4, 64, stream>>>(container_, d_keys, d_grads, + c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 5e4be02962ea9..b5b1c22f30454 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -153,11 +153,13 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) platform::CUDAPlace place_; + #elif defined(PADDLE_WITH_XPU_KP) platform::XPUPlace place_; #endif std::shared_ptr all_keys_mem; std::shared_ptr all_grads_mem; + KeyType* all_keys; GradType* all_grads; @@ -228,5 +230,7 @@ class HeterComm { } // end namespace framework } // end namespace paddle + #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h" + #endif diff --git 
a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 1e66b3cb25031..551b5c38895a9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -411,7 +411,6 @@ void HeterComm::merge_grad( auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); - auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); @@ -1035,7 +1034,6 @@ int HeterComm::gather_multi_node_grad( merge_grad(gpu_num, storage.local_keys, storage.local_grads, merge_num, ret); return ret; } - #endif template @@ -1065,7 +1063,6 @@ void HeterComm::end_pass() { // platform::CUDADeviceGuard guard(dev_id); // tables_[index]->dump_to_cpu(dev_id, stream); //} - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 1be3687a7dbee..9d2ee5d272c72 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,6 +41,7 @@ class HeterCommKernel { template void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream); template @@ -62,6 +63,7 @@ class HeterCommKernel { const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, StreamType stream = NULL, bool debug_synchronous = false); @@ -75,6 +77,7 @@ class HeterCommKernel { ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, + StreamType stream = NULL, bool debug_synchronous = false); private: diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index a1923a7f6019b..f73757902fef6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -233,8 +233,6 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -// xpu implementation of heter_comm_kernel.h - template void HeterCommKernel::fill_idx(T* idx, long long len, const StreamType& stream) { @@ -291,17 +289,21 @@ void HeterCommKernel::sort_pairs(void* d_temp_storage, bool debug_synchronous) {} template +void HeterCommKernel::reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + int num_items, StreamType stream, + bool debug_synchronous) {} template void HeterCommKernel::fill_idx( int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, const XPUStream& stream); @@ -312,12 +314,14 @@ template void HeterCommKernel::calc_shard_index( template void HeterCommKernel::fill_shard_key( unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::fill_shard_grads< unsigned long, paddle::framework::FeaturePushValue, int, XPUStream>( 
unsigned long* d_shard_keys, unsigned long* d_keys, paddle::framework::FeaturePushValue* d_shard_grads, paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::fill_dvals( paddle::framework::FeatureValue* d_shard_vals, @@ -348,9 +352,8 @@ template void HeterCommKernel::reduce_by_key< size_t& temp_storage_bytes, // NOLINT unsigned long* d_keys_in, unsigned long* d_unique_out, paddle::framework::FeaturePushValue* d_values_in, - paddle::framework::FeaturePushValue* d_aggregates_out, - int* d_num_runs_out int num_items, XPUStream stream, - bool debug_synchronous); + paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, + int num_items, XPUStream stream, bool debug_synchronous); #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc new file mode 100644 index 0000000000000..3d375209ed14e --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" +#include + +#ifdef PADDLE_WITH_HETERPS + +namespace paddle { +namespace framework { + +HeterPsBase* HeterPsBase::get_instance( + size_t capacity, std::shared_ptr resource) { + return new HeterPs(capacity, resource); +} + +HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { + comm_ = + std::make_shared>( + capacity, resource); +} + +HeterPs::~HeterPs() {} + +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, + size_t len) { + comm_->pull_sparse(num, d_keys, d_vals, len); +} + +void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) { + comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); +} + +int HeterPs::get_index_by_devid(int devid) { + return comm_->get_index_by_devid(devid); +} + +void HeterPs::end_pass() { comm_->end_pass(); } + +void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } + +void HeterPs::push_sparse(int num, FeatureKey* d_keys, + FeaturePushValue* d_grads, size_t len) { + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index 7074cfb521bdf..b330c9bb9f5ef 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -69,6 +69,7 @@ XPUResource::XPUResource(std::vector& dev_ids, int index) { platform::XPUDeviceGuard guard(dev_id_); local_streams_.resize(dev_ids_.size()); + comm_streams_.resize(dev_ids_.size(), NULL); remote_streams_.resize(dev_ids_.size()); @@ -84,6 +85,7 @@ XPUResource::~XPUResource() { for (size_t i = 0; i < local_streams_.size(); ++i) { 
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(local_streams_[i])); } + // for (size_t i = 0; i < comm_streams_.size(); ++i) { // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(comm_streams_[i])); // } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 164fca2276800..17bc12a5af1a7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -36,6 +36,7 @@ namespace framework { #if defined(PADDLE_WITH_CUDA) using ppStream = cudaStream_t; + #elif defined(PADDLE_WITH_XPU_KP) using ppStream = XPUStream; #endif @@ -61,6 +62,7 @@ class GPUResource { std::vector local_streams_; std::vector comm_streams_; }; + #elif defined(PADDLE_WITH_XPU_KP) class XPUResource { public: @@ -105,6 +107,7 @@ class HeterPsResource { int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); + ppStream local_stream(int dev_num, int stream_num); ppStream remote_stream(int dev_num, int stream_num); ppStream comm_stream(int dev_num, int stream_num); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 5e1a08f33e3ef..52bfe42cc5028 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -31,6 +31,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" @@ -690,7 +691,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } #endif VLOG(3) << "GpuPs build hbmps done"; - }; if (multi_mf_dim_) { @@ -753,7 +753,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); +#ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); +#endif auto build_func = [this, &gpu_task, &feature_keys_count](int i) { VLOG(3) << "building table: " << i; this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), @@ -891,18 +893,27 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - VLOG(3) << "Begine Gpu Ps PullSparse"; platform::Timer all_timer; platform::Timer pull_gpups_timer; all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PullSparse"; + FeatureValue* total_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_values_gpu), + total_length * sizeof(FeatureValue)); +#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GpuPs now.")); } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); @@ -942,9 +953,63 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, 
static_cast(slot_lengths.size()), hidden_size, total_length); +#endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + + uint64_t* buf_key = nullptr; + int64_t* buf_length = nullptr; + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_key), + keys.size() * sizeof(uint64_t*)), + XPU_SUCCESS, platform::errors::ResourceExhausted( + "XPU has no enough memory")); + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_length), + slot_lengths.size() * sizeof(int64_t)), + XPU_SUCCESS, platform::errors::ResourceExhausted( + "XPU has no enough memory")); + + uint64_t** xpu_keys = reinterpret_cast(&buf_key); + int64_t* xpu_len = reinterpret_cast(buf_length); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), + keys.size() * sizeof(uint64_t*), + XPU_HOST_TO_DEVICE)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + XPU_HOST_TO_DEVICE)); + + this->CopyKeys(place, xpu_keys, total_keys, xpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + static_cast(total_length)); + // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + // "PullSparseGPU failed in GPUPS.")); + pull_gpups_timer.Pause(); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); + "GpuPs/XpuPs: PullSparse Only Support CUDAPlace or XPUPlace Now.")); } all_timer.Pause(); VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() @@ -959,15 +1024,23 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& grad_values, const std::vector& slot_lengths, const int hidden_size, const int batch_size) { - VLOG(3) << "Begin GPUPS PushSparseGrad"; platform::Timer all_timer; platform::Timer push_gpups_timer; all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PushSparseGrad"; + FeaturePushValue* total_grad_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_grad_values_gpu), + total_length * sizeof(FeaturePushValue)); +#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -987,6 +1060,22 @@ void PSGPUWrapper::PushSparseGrad(const 
paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); + } else if (platform::is_xpu_place(place)) { + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(cached_total_keys_tensor.data()); + VLOG(3) << "Begin copy grad tensor to xpups struct"; + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + + VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index + << " len: " << total_length; + push_gpups_timer.Start(); + HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + static_cast(total_length)); + push_gpups_timer.Pause(); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 6a78a617b1fef..cf7d98db27e84 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -105,6 +105,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } + void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, const std::vector& values, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h old mode 100755 new mode 100644 index c5f674d8b47eb..c38b819822c28 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -30,16 +30,22 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif #include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" -#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/heter_util.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE @@ -55,6 +61,8 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +class Dataset; + #ifdef PADDLE_WITH_PSLIB class AfsWrapper { public: @@ -82,7 +90,7 @@ class AfsWrapper { class PSGPUWrapper { public: - virtual ~PSGPUWrapper() { delete HeterPs_; } + virtual ~PSGPUWrapper(); PSGPUWrapper() { HeterPs_ = NULL; @@ -160,6 +168,7 @@ class PSGPUWrapper { PADDLE_THROW( platform::errors::Unavailable("heter ps need compile with GLOO")); #endif +#ifdef PADDLE_WITH_CUDA if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -195,6 +204,7 @@ class PSGPUWrapper { platform::errors::Unavailable("heter ps need compile with GLOO")); #endif } +#endif heter_devices_ = dev_ids; data_ready_channel_->Open(); data_ready_channel_->SetCapacity(3); @@ -262,7 +272,11 @@ class PSGPUWrapper { ? 
1.0 : config["mf_max_bound"]; for (size_t i = 0; i < heter_devices_.size(); i++) { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(heter_devices_[i])); +#endif this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, initial_range); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, @@ -270,6 +284,7 @@ class PSGPUWrapper { mf_max_bound); } } + void SetDate(int year, int month, int day) { year_ = year; month_ = month; @@ -297,6 +312,7 @@ class PSGPUWrapper { slot_offset_vector_ = slot_offset_vector; } +#ifdef PADDLE_WITH_CUDA void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { slot_mf_dim_vector_ = slot_mf_dim_vector; assert(slot_mf_dim_vector_.size() == slot_vector_.size()); @@ -330,6 +346,7 @@ class PSGPUWrapper { grad_type_size_ = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); } +#endif void ShowOneTable(int index) { HeterPs_->show_one_table(index); } @@ -371,9 +388,11 @@ class PSGPUWrapper { int multi_node_{0}; int node_size_; uint64_t table_id_; +#ifdef PADDLE_WITH_CUDA std::vector inner_comms_; std::vector inter_comms_; std::vector inter_ncclids_; +#endif std::vector heter_devices_; std::unordered_set gpu_ps_config_keys_; HeterObjectPool gpu_task_pool_; @@ -388,9 +407,11 @@ class PSGPUWrapper { int day_; int use_afs_api_ = 0; +#ifdef PADDLE_WITH_CUDA std::vector mem_pools_; std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number +#endif std::shared_ptr< paddle::framework::ChannelObject>> diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps new file mode 100644 index 0000000000000..6d69ae0136d68 --- /dev/null +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -0,0 +1,339 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_HETERPS +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "xpu/kernel/cluster_header.h" // NOLINT +#include "xpu/kernel/debug.h" // NOLINT +#include "xpu/kernel/math.h" // NOLINT +#include "xpu/kernel/simd.h" + +namespace paddle { +namespace framework { + +__global__ void PullCopy(float** dest, const FeatureValue* src, + const long long* len, int hidden, int slot_num, + int total_len, unsigned long long** keys) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + // max core local memory = 8KB + // slot's max memory size = slot_len * sizeof(FeatureValue) + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), slot_len); + int dest_len = i ? local_len[i - 1] : 0; + __local__ FeatureValue local_slot_vals[read_len]; + __local__ float local_dest_vals[read_len * hidden]; + __local__ uint64_t local_slot_keys[read_len]; + + // copy read_len (length) of slots' val to LM + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src + dest_len + k, local_slot_vals, + real_read_len * sizeof(FeatureValue)); + GM2LM(keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + for (int j = 0; j < real_read_len; j++) { + if (local_slot_keys[j] == 0) { + local_dest_vals[j * hidden] = 0; + local_dest_vals[j * hidden + 1] = 0; + local_dest_vals[j * hidden + 2] = 0; + } else { + local_dest_vals[j * hidden] = local_slot_vals[j].show; + local_dest_vals[j * hidden + 1] = local_slot_vals[j].clk; + local_dest_vals[j * hidden + 2] = local_slot_vals[j].lr; + } + + if (local_slot_vals[j].mf_size == 0 || local_slot_keys[j] == 0) { + for (int m = 0; m < hidden - 3; m++) { + local_dest_vals[j * hidden + 3 + m] = 0; + } + } else { + for (int m = 0; m < hidden - 3; m++) { + local_dest_vals[j * hidden + 3 + m] = local_slot_vals[j].mf[1 + m]; + } + } + } + LM2GM(local_dest_vals, dest[i] + k * hidden, + real_read_len * hidden * sizeof(float)); + } + } +} + +__global__ void CopyKeysKernel(unsigned long long** src_keys, + unsigned long long* dest_total_keys, + const long long* len, int slot_num, + int total_len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + // max core local memory = 8KB + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + int read_len = min(slot_len, 1024); + int dest_len = i ? 
local_len[i - 1] : 0; + __local__ uint64_t local_slot_keys[read_len]; + + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + LM2GM(local_slot_keys, dest_total_keys + dest_len + k, + real_read_len * sizeof(uint64_t)); + } + } +} + +__global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, + int hidden, int slot_num, int total_len, int bs, + int* slot_vector) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + __local__ int local_slot[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + GM2LM(slot_vector, local_slot, slot_num * sizeof(int)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + + // max core local memory = 8KB + // slot's max memory size = slot_len * hidden * 8 + int read_len = min(roundup_div(1024, hidden), slot_len); + int dest_len = i ? local_len[i - 1] : 0; + __local__ float local_slot_grads[read_len * hidden]; + __local__ FeaturePushValue local_dest_grads[read_len]; + + // copy read_len(length) of slots' grad to LM + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src[i] + k * hidden, local_slot_grads, + real_read_len * hidden * sizeof(float)); + // copy from slots' grad to total grad + for (int j = 0; j < real_read_len; j++) { + local_dest_grads[j].slot = local_slot[i]; + local_dest_grads[j].show = local_slot_grads[j * hidden]; + local_dest_grads[j].clk = local_slot_grads[j * hidden + 1]; + local_dest_grads[j].lr_g = local_slot_grads[j * hidden + 2] * -1. * bs; + for (int m = 0; m < hidden - 3; m++) { + local_dest_grads[j].mf_g[m] = + local_slot_grads[j * hidden + 3 + m] * -1. 
* bs; + } + } + LM2GM(local_dest_grads, dest + dest_len + k, + real_read_len * sizeof(FeaturePushValue)); + } + } +} + +PSGPUWrapper::~PSGPUWrapper() { + delete HeterPs_; + xpu_free((void*)optimizer_config::nonclk_coeff); + xpu_free((void*)optimizer_config::clk_coeff); + xpu_free((void*)optimizer_config::min_bound); + xpu_free((void*)optimizer_config::max_bound); + xpu_free((void*)optimizer_config::learning_rate); + xpu_free((void*)optimizer_config::initial_g2sum); + xpu_free((void*)optimizer_config::initial_range); + + xpu_free((void*)optimizer_config::mf_create_thresholds); + xpu_free((void*)optimizer_config::mf_learning_rate); + xpu_free((void*)optimizer_config::mf_initial_g2sum); + xpu_free((void*)optimizer_config::mf_initial_range); + xpu_free((void*)optimizer_config::mf_min_bound); + xpu_free((void*)optimizer_config::mf_max_bound); +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + float* buf_value = nullptr; + xpu_malloc(reinterpret_cast(&buf_value), + values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(&buf_value); + xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), + XPU_HOST_TO_DEVICE); + + unsigned long long** c_keys = (unsigned long long**)gpu_keys; + const long long* c_len = (const long long*)gpu_len; + PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, + slot_num, total_length, c_keys); + + xpu_wait(stream); +} + +void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, + uint64_t** origin_keys, uint64_t* total_keys, + const int64_t* gpu_len, int slot_num, + int total_len) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + unsigned long long** o_keys = (unsigned long long**)origin_keys; + unsigned long long* t_keys = (unsigned long long*)total_keys; + const long long* c_len = (const long long*)gpu_len; + CopyKeysKernel<<<2, 64, stream>>>(o_keys, t_keys, c_len, slot_num, total_len); + xpu_wait(stream); +} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, + const int64_t total_length, + const int batch_size) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + + float* buf_grad_value = nullptr; + int64_t* buf_length = nullptr; + int* buf_slot_vector = nullptr; + + xpu_malloc(reinterpret_cast(&buf_grad_value), + grad_values.size() * sizeof(float*)); + xpu_malloc(reinterpret_cast(&buf_length), + slot_lengths.size() * sizeof(int64_t)); + xpu_malloc(reinterpret_cast(&buf_slot_vector), + slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(&buf_grad_value); + int64_t* gpu_len = reinterpret_cast(buf_length); + int* d_slot_vector = reinterpret_cast(buf_slot_vector); + 
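// The three xpu_memcpy calls that follow stage the host-side metadata on the
// XPU device before PushCopy is launched: the per-slot gradient pointers
// (grad_values), the prefix-summed slot lengths (slot_lengths_lod), and the
// slot id vector (slot_vector_). PushCopy then walks each slot in chunks small
// enough for the ~8KB core-local memory, packing every row of `hidden` floats
// into one FeaturePushValue (slot, show, clk, lr_g, mf_g[*]) and negating and
// scaling the gradients by the batch size. The <<<2, 64, stream>>> launch
// appears to request 2 clusters of 64 cores each, the same configuration used
// by PullCopy and CopyKeysKernel above; this is the XPU_KP counterpart of the
// CUDA path selected by PADDLE_WITH_CUDA in ps_gpu_wrapper.h.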
xpu_memcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); + xpu_memcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); + xpu_memcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); + + long long* c_len = (long long*)gpu_len; + PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, + hidden_size, slot_lengths.size(), total_length, + batch_size, d_slot_vector); + xpu_wait(stream); +} + +void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, + float min_bound, float max_bound, + float learning_rate, float initial_g2sum, + float initial_range) { + xpu_malloc(reinterpret_cast(&optimizer_config::nonclk_coeff), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::clk_coeff), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::min_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::max_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::learning_rate), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::initial_g2sum), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::initial_range), + sizeof(float)); + + xpu_memcpy((void*)optimizer_config::nonclk_coeff, &nonclk_coeff, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::clk_coeff, &clk_coeff, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::min_bound, &min_bound, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::max_bound, &max_bound, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::learning_rate, &learning_rate, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::initial_g2sum, &initial_g2sum, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::initial_range, &initial_range, + sizeof(float), XPU_HOST_TO_DEVICE); +} + +void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, + float mf_learning_rate, float mf_initial_g2sum, + float mf_initial_range, float mf_min_bound, + float mf_max_bound) { + xpu_malloc(reinterpret_cast(&optimizer_config::mf_create_thresholds), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_learning_rate), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_initial_g2sum), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_initial_range), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_min_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_max_bound), + sizeof(float)); + + xpu_memcpy((void*)optimizer_config::mf_create_thresholds, + &mf_create_thresholds, sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_initial_range, &mf_initial_range, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_min_bound, &mf_min_bound, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_max_bound, &mf_max_bound, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_learning_rate, &mf_learning_rate, + sizeof(float), XPU_HOST_TO_DEVICE); +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 
e0cf860e5bc7b..e4004c2fbf3b5 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -25,7 +25,9 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { @@ -56,7 +58,12 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, std::vector dev_ids; for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place = platform::CUDAPlace(num); +#endif +#ifdef PADDLE_WITH_XPU_KP + platform::XPUPlace place = platform::XPUPlace(num); +#endif places_.push_back(place); dev_ids.push_back(num); } diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index d98deb0f188dc..452c960166cb2 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -20,7 +20,9 @@ limitations under the License. */ #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif #if defined _WIN32 || defined __APPLE__ #else diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index 90e4fc9da0d61..22b43910e6967 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -132,5 +132,7 @@ REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp, ops::PushBoxSparseOpMaker, ops::PushBoxSparseOpMaker); REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel) +REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel); +REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel); +REGISTER_OP_XPU_KERNEL(pull_box_sparse, ops::PullBoxSparseXPUKernel); +REGISTER_OP_XPU_KERNEL(push_box_sparse, ops::PushBoxSparseXPUKernel); diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu index 96a1b1c08b79c..e3407dd3b2e8b 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_sparse_op.cu @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/fluid/operators/pull_box_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -38,7 +37,7 @@ class PushBoxSparseCUDAKernel : public framework::OpKernel { }; } // namespace operators } // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel) -REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel) + +REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel); +REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel); diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 77021b8961db5..2bde9725abdca 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -114,5 +114,21 @@ class PushBoxSparseCPUKernel : public framework::OpKernel { PushBoxSparseFunctor(ctx); } }; + +template +class PullBoxSparseXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxSparseFunctor(ctx); + } +}; + +template +class PushBoxSparseXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxSparseFunctor(ctx); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 42703fc17bde5..2f07a4a40a922 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" @@ -39,8 +40,10 @@ void BindPSGPUWrapper(py::module* m) { .def(py::init([]() { return framework::PSGPUWrapper::GetInstance(); })) .def("set_slot_vector", &framework::PSGPUWrapper::SetSlotVector, py::call_guard()) +#ifdef PADDLE_WITH_CUDA .def("set_slot_dim_vector", &framework::PSGPUWrapper::SetSlotDimVector, py::call_guard()) +#endif .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) From 9f06069d4663ea7b7cb989d1d93d02d0550e9282 Mon Sep 17 00:00:00 2001 From: qipengh Date: Mon, 18 Apr 2022 10:10:29 +0800 Subject: [PATCH 199/211] [MLU]add op: reduce_sum, elementwise_sub (#41697) * [MLU]add op: reduce_sum, elementwise_sub * [MLU]del unrelated code --- .../elementwise/elementwise_add_op_mlu.cc | 69 +----- .../operators/elementwise/elementwise_mlu.h | 207 +++++++++++++++++ .../elementwise/elementwise_mul_op_mlu.cc | 47 +--- .../elementwise/elementwise_sub_op_mlu.cc | 112 ++++++++++ paddle/fluid/operators/mlu/mlu_baseop.h | 16 ++ .../reduce_ops/reduce_mean_op_mlu.cc | 41 +--- .../operators/reduce_ops/reduce_op_mlu.h | 73 ++++++ .../operators/reduce_ops/reduce_sum_op_mlu.cc | 78 +++++++ .../mlu/test_elementwise_sub_op_mlu.py | 208 ++++++++++++++++++ .../unittests/mlu/test_reduce_sum_op_mlu.py | 149 +++++++++++++ 10 files changed, 853 insertions(+), 147 deletions(-) create mode 100644 paddle/fluid/operators/elementwise/elementwise_mlu.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_op_mlu.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc create mode 100644 
python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc index 47a549dfcde28..98d559df233b3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" namespace paddle { namespace operators { @@ -23,35 +22,7 @@ template class ElementwiseAddMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) - : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - - MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), - ToCnnlDataType(x->type())); - MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), - ToCnnlDataType(y->type())); - MLUCnnlTensorDesc out_desc(*out); - MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN); - - MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), - y_desc.get(), GetBasePtr(y), out_desc.get(), - GetBasePtr(out), ToCnnlDataType()); + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_ADD); } }; @@ -75,22 +46,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel { if (dx->dims() != dout->dims()) { std::vector dst_dims_vec; std::vector reduce_axes; - auto src_dims = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (dst_dims_vec.size() == 0) { - // x is scalar - dst_dims_vec.push_back(1); - } + GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes, + &dst_dims_vec); MLUCnnlReduceDesc reduction_desc( reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), @@ -109,22 +66,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel { if (dy->dims() != dout->dims()) { std::vector dst_dims_vec; std::vector reduce_axes; - auto src_dims = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (dst_dims_vec.size() == 0) { - // y is scalar - dst_dims_vec.push_back(1); - } + GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes, + &dst_dims_vec); MLUCnnlReduceDesc reduction_desc( reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h new file mode 100644 index 0000000000000..156cea81c0f63 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -0,0 +1,207 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_MLU +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +inline void GetReduceAxes(const int axis, const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* axes) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + for (int64_t i = 0; i < src_dim_size; ++i) { + if (i < axis || i >= target_dim_size + axis) { + axes->push_back(i); + continue; + } + if (src_ddims[i] > target_ddims[i - axis]) { + axes->push_back(i); + } + } +} + +inline void GetReduceAxesAndDstDims(const int axis, + const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* reduce_axes, + std::vector* dst_dims_vec) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + + int src_axis = (target_dim_size < src_dim_size ? axis : 0); + for (int ax = 0; ax < src_dim_size; ++ax) { + if ((ax < src_axis || ax >= src_axis + target_dim_size) || + (src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) { + reduce_axes->push_back(ax); + } else { + dst_dims_vec->push_back(src_ddims[ax]); + } + } + if (dst_dims_vec->size() == 0) { + // target_var is scalar + dst_dims_vec->push_back(1); + } +} + +template +void MLUOpTensorKernel(const framework::ExecutionContext& ctx, + const cnnlOpTensorDesc_t op_tensor_op) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(ctx.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on MLU.")); + PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) || + (op_tensor_op == CNNL_OP_TENSOR_SUB) || + (op_tensor_op == CNNL_OP_TENSOR_MUL), + true, + platform::errors::Unavailable( + "This kernel of MLU only support ADD, SUB, MUL.")); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = + (axis < 0 ? 
(std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlOpTensorDesc op_tensor_desc(op_tensor_op, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out), ToCnnlDataType()); +} + +// ------------------ BinaryOp ----------------- +enum BINARY_FUNCTOR { + DIV, + DIVNONAN, +}; + +template +void MLUBinary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out); + +template <> +inline void MLUBinary
(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out) { + MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); +} + +template +void MLUBinaryOp(const framework::ExecutionContext& ctx) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = + (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION; + MLUBinary(ctx, prefer_type, x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out)); +} + +// ------------------ UnaryOp ----------------- +enum UNARY_FUNCTOR { + NEG, + RECIPROCAL, +}; + +template +void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t ouput_desc, void* output); + +template <> +inline void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + MLUCnnl::Neg(ctx, input_desc, input, output_desc, output); +} + +template <> +inline void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output); +} + +template +void MLUUnaryOp(const framework::ExecutionContext& ctx) { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION; + MLUUnary(ctx, prefer_type, x_desc.get(), GetBasePtr(x), + out_desc.get(), GetBasePtr(out)); +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc index a7505890f41d4..33603fd73f49c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" namespace paddle { namespace operators { @@ -21,53 +20,11 @@ namespace operators { using Tensor = framework::Tensor; using MLUDeviceContext = platform::MLUDeviceContext; -static void GetReduceAxes(const int axis, const framework::DDim& src_ddims, - const framework::DDim& target_ddims, - std::vector* axes) { - int64_t src_dim_size = src_ddims.size(); - int64_t target_dim_size = target_ddims.size(); - for (int64_t i = 0; i < src_dim_size; ++i) { - if (i < axis || i >= target_dim_size + axis) { - axes->push_back(i); - continue; - } - if (src_ddims[i] > target_ddims[i - axis]) { - axes->push_back(i); - } - } -} - template class ElementwiseMulMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) - : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - - MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); - MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); - MLUCnnlTensorDesc out_desc(*out); - MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN); - - MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), - y_desc.get(), GetBasePtr(y), out_desc.get(), - GetBasePtr(out), ToCnnlDataType()); + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_MUL); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc new file mode 100644 index 0000000000000..7c3d09effa4b1 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseSubMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_SUB); + } +}; + +template +class ElementwiseSubGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + + MLUCnnlTensorDesc dout_desc(*dout); + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes, + &dst_dims_vec); + + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(dst_dims_vec.size(), dst_dims_vec.data(), + ToCnnlDataType()); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr, + nullptr, dx_desc.get(), GetBasePtr(dx)); + } else { + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); + } + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor* tmp_dout = const_cast(dout); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes, + &dst_dims_vec); + + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(dst_dims_vec.size(), dst_dims_vec.data(), + ToCnnlDataType()); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr, + nullptr, dy_desc.get(), GetBasePtr(dy)); + tmp_dout = dy; + } + + // call neg op, dy = -dout + MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout); + MLUCnnlTensorDesc dy_desc(*dy); + + MLUUnary(ctx, CNNL_COMPUTATION_HIGH_PRECISION, tmp_dout_desc.get(), + GetBasePtr(tmp_dout), dy_desc.get(), GetBasePtr(dy)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(elementwise_sub, ops::ElementwiseSubMLUKernel, + ops::ElementwiseSubMLUKernel, + ops::ElementwiseSubMLUKernel); + +REGISTER_OP_MLU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradMLUKernel, + ops::ElementwiseSubGradMLUKernel, + ops::ElementwiseSubGradMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 00ad618329c99..9948c45e24692 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -45,6 +45,22 @@ enum MLULogicMethod { CNNL_LOGIC_OP_OR = 7, }; +const std::map MLUReduceOpMap = { + {"reduce_all", CNNL_REDUCE_AND}, {"reduce_any", CNNL_REDUCE_OR}, + {"reduce_max", CNNL_REDUCE_MAX}, {"reduce_mean", CNNL_REDUCE_AVG}, + 
{"reduce_min", CNNL_REDUCE_MIN}, {"reduce_sum", CNNL_REDUCE_ADD}, + {"reduce_prod", CNNL_REDUCE_MUL}, +}; + +inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { + auto iter = MLUReduceOpMap.find(reduce_name); + if (iter != MLUReduceOpMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support reduce op type of MLU Device: %s", reduce_name)); +} + inline const void* GetBasePtr(const Tensor* t) { return t->data(); } inline void* GetBasePtr(Tensor* t) { return t->data(); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc index 89e578dbdb6b7..6e5fd59c45645 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" namespace paddle { namespace operators { @@ -23,42 +21,7 @@ template class ReduceMeanMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - bool reduce_all = context.Attr("reduce_all"); - auto dims = context.Attr>("dim"); - auto input_dims = phi::vectorize(input->dims()); - const auto& input_dim_size = input->dims().size(); - std::vector reduce_dims; - if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { - reduce_dims.push_back(static_cast(i)); - } - } else { - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) { - reduce_dims.push_back(dims[i] + input_dim_size); - } else { - reduce_dims.push_back(dims[i]); - } - } - } - - MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(input->dtype())); - MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(output->dtype())); - - MLUCnnlReduceDesc reduction_desc( - reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); - - MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), - nullptr, input_desc.get(), GetBasePtr(input), - 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), - GetBasePtr(output)); + MLUReduceOp(context, "reduce_mean"); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h new file mode 100644 index 0000000000000..95dda354cae7d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
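// MLUReduceOp below is the shared implementation behind the MLU reduce
// kernels: it looks up the cnnlReduceOp_t for an op by name through
// GetMLUCnnlReduceOp (added to mlu_baseop.h above), normalizes the
// "dim"/"reduce_all" attributes into non-negative axes, and dispatches
// MLUCnnl::Reduce. For example, with a 3-D input and dim = {1, -1},
// reduce_dims becomes {1, 2}; with reduce_all = true it becomes {0, 1, 2}.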
+ +#pragma once + +#ifdef PADDLE_WITH_MLU +#include +#include +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +void MLUReduceOp(const framework::ExecutionContext& context, + std::string reduce_name) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on MLU.")); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = phi::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name); + MLUCnnlReduceDesc reduction_desc(reduce_dims, reduce_op, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc new file mode 100644 index 0000000000000..fab8bb23b16ac --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
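// ReduceSumMLUKernel forwards to MLUReduceOp<T>(ctx, "reduce_sum") from
// reduce_op_mlu.h. For the backward pass, since d(sum)/dx is 1 for every
// element that was reduced, ReduceSumGradMLUKernel reshapes the incoming
// output gradient so that each reduced axis has extent 1 (ShareDataWith +
// Resize, no data copy) and then tiles it back to the input shape with
// MLUCnnl::BroadcastTo.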
+ +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +template +class ReduceSumMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + MLUReduceOp(context, "reduce_sum"); + } +}; + +template +class ReduceSumGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + in_grad->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto reduce_dims = context.Attr>("dim"); + auto in_dims = phi::vectorize(in->dims()); + + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < in_dims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + in_dims.size(); + } + } + + Tensor tmp_out(out_grad->dtype()); + auto tmp_output_dims = in_dims; + for (auto d : reduce_dims) { + tmp_output_dims[d] = 1; + } + tmp_out.ShareDataWith(*out_grad); + tmp_out.Resize(phi::make_ddim(tmp_output_dims)); + + MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc in_grad_desc(*in_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + MLUCnnl::BroadcastTo(context, out_desc.get(), GetBasePtr(&tmp_out), + in_grad_desc.get(), GetBasePtr(in_grad)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_sum, ops::ReduceSumMLUKernel, + ops::ReduceSumMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_sum_grad, ops::ReduceSumGradMLUKernel, + ops::ReduceSumGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py new file mode 100644 index 0000000000000..9ca5359e05ff7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py @@ -0,0 +1,208 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2022 + + +class TestElementwiseSubOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis} + self.outputs = {'Out': self.out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=0.005, + no_grad_set=set('Y')) + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseSubOp_scalar(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_Vector(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_broadcast_0(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(100, 3, 2).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.attrs = {'axis': 0} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1) + } + + +class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1) + } + + +class TestElementwiseSubOp_broadcast_2(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100) + } + + +class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 
12).astype(np.float32) + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1) + } + + +class TestElementwiseSubOp_broadcast_4(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 5, 3, 12).astype(np.float32), + 'Y': np.random.rand(2, 5, 1, 12).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_commonuse_1(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_commonuse_2(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 1, 4).astype(np.float32), + 'Y': np.random.rand(10, 1, 12, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 12).astype(np.float32), + 'Y': np.random.rand(2, 3, 10, 12).astype(np.float32) + } + self.attrs = {'axis': 2} + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 12) - self.inputs['Y'] + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py new file mode 100644 index 0000000000000..d2729d77abaa7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() + + +class TestMLUReduceSumOp(OpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.set_mlu() + self.attrs = { + 'dim': self.axis, + 'keep_dim': self.keep_dim, + 'reduce_all': self.reduce_all + } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].sum()} + else: + self.outputs = { + 'Out': self.inputs['X'].sum(axis=self.axis, + keepdims=self.attrs['keep_dim']) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_op_type(self): + self.op_type = "reduce_sum" + self.use_mkldnn = False + self.keep_dim = False + self.reduce_all = False + + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + + +class TestSumOp5D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestSumOp6D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestSumOp8D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 3, 1, 2, 1, 4, 3, 10) + self.axis = (0, 3) + + +class Test1DReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = 120 + self.axis = (0, ) + + +class Test2DReduce0(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (0, ) + + +class Test2DReduce1(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (1, ) + + +class Test3DReduce0(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, ) + + +class Test3DReduce1(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (2, ) + + +class Test3DReduce2(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (-2, ) + + +class Test3DReduce3(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, 2) + + +class TestKeepDimReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (1, ) + self.keep_dim = True + + +class TestKeepDim8DReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (2, 5, 3, 2, 2, 3, 4, 2) + self.axis = (3, 4, 5) + self.keep_dim = True + + +class TestReduceAll(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 2, 10) + self.axis = (0, ) + self.reduce_all = True + + +if __name__ == '__main__': + unittest.main() From c31dd04ce4893f7f41e3032312f8acc5204755be Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 18 Apr 2022 10:23:02 +0800 Subject: [PATCH 200/211] Optimization for graph_sample_neighbors API (#41447) * add eids result for graph_sample_neighbors * fix bug * move fisher_yates sample to warp * add cpu eid output * delete comment * delete comment * change nullptr placeholder * optimize sample kernel * fix mutable_data --- .../cpu/graph_sample_neighbors_kernel.cc | 100 ++++++++- .../gpu/graph_sample_neighbors_kernel.cu | 200 ++++++++++++------ .../unittests/test_graph_sample_neighbors.py | 18 +- 3 files changed, 240 insertions(+), 78 deletions(-) diff --git 
a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index e18848af0dc08..b4321a85ab2ee 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -39,17 +39,42 @@ void SampleUniqueNeighbors( } } +template +void SampleUniqueNeighborsWithEids( + bidiiter src_begin, + bidiiter src_end, + bidiiter eid_begin, + bidiiter eid_end, + int num_samples, + std::mt19937& rng, + std::uniform_int_distribution& dice_distribution) { + int left_num = std::distance(src_begin, src_end); + for (int i = 0; i < num_samples; i++) { + bidiiter r1 = src_begin, r2 = eid_begin; + int random_step = dice_distribution(rng) % left_num; + std::advance(r1, random_step); + std::advance(r2, random_step); + std::swap(*src_begin, *r1); + std::swap(*eid_begin, *r2); + ++src_begin; + ++eid_begin; + --left_num; + } +} + template void SampleNeighbors(const T* row, const T* col_ptr, + const T* eids, const T* input, std::vector* output, std::vector* output_count, + std::vector* output_eids, int sample_size, - int bs) { - // Allocate the memory of output - // Collect the neighbors size + int bs, + bool return_eids) { std::vector> out_src_vec; + std::vector> out_eids_vec; // `sample_cumsum_sizes` record the start position and end position // after sampling. std::vector sample_cumsum_sizes(bs + 1); @@ -65,10 +90,18 @@ void SampleNeighbors(const T* row, std::vector out_src; out_src.resize(cap); out_src_vec.emplace_back(out_src); + if (return_eids) { + std::vector out_eids; + out_eids.resize(cap); + out_eids_vec.emplace_back(out_eids); + } } output_count->resize(bs); output->resize(total_neighbors); + if (return_eids) { + output_eids->resize(total_neighbors); + } std::random_device rd; std::mt19937 rng{rd()}; @@ -85,15 +118,28 @@ void SampleNeighbors(const T* row, int cap = end - begin; if (sample_size < cap) { std::copy(row + begin, row + end, out_src_vec[i].begin()); - // TODO(daisiming): Check whether is correct. 
- SampleUniqueNeighbors(out_src_vec[i].begin(), - out_src_vec[i].end(), - sample_size, - rng, - dice_distribution); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + SampleUniqueNeighborsWithEids(out_src_vec[i].begin(), + out_src_vec[i].end(), + out_eids_vec[i].begin(), + out_eids_vec[i].end(), + sample_size, + rng, + dice_distribution); + } else { + SampleUniqueNeighbors(out_src_vec[i].begin(), + out_src_vec[i].end(), + sample_size, + rng, + dice_distribution); + } *(output_count->data() + i) = sample_size; } else { std::copy(row + begin, row + end, out_src_vec[i].begin()); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + } *(output_count->data() + i) = cap; } } @@ -107,6 +153,11 @@ void SampleNeighbors(const T* row, std::copy(out_src_vec[i].begin(), out_src_vec[i].begin() + k, output->data() + sample_cumsum_sizes[i]); + if (return_eids) { + std::copy(out_eids_vec[i].begin(), + out_eids_vec[i].begin() + k, + output_eids->data() + sample_cumsum_sizes[i]); + } } } @@ -131,8 +182,35 @@ void GraphSampleNeighborsKernel( std::vector output; std::vector output_count; - SampleNeighbors( - row_data, col_ptr_data, x_data, &output, &output_count, sample_size, bs); + + if (return_eids) { + const T* eids_data = eids.get_ptr()->data(); + std::vector output_eids; + SampleNeighbors(row_data, + col_ptr_data, + eids_data, + x_data, + &output, + &output_count, + &output_eids, + sample_size, + bs, + return_eids); + out_eids->Resize({static_cast(output_eids.size())}); + T* out_eids_data = dev_ctx.template Alloc(out_eids); + std::copy(output_eids.begin(), output_eids.end(), out_eids_data); + } else { + SampleNeighbors(row_data, + col_ptr_data, + nullptr, + x_data, + &output, + &output_count, + nullptr, + sample_size, + bs, + return_eids); + } out->Resize({static_cast(output.size())}); T* out_data = dev_ctx.template Alloc(out); std::copy(output.begin(), output.end(), out_data); diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index 1757b6b98dbf9..af616963b499a 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -62,9 +62,11 @@ __global__ void SampleKernel(const uint64_t rand_seed, const T* nodes, const T* row, const T* col_ptr, + const T* eids, T* output, + T* output_eids, int* output_ptr, - int* output_idxs) { + bool return_eids) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -94,10 +96,13 @@ __global__ void SampleKernel(const uint64_t rand_seed, if (deg <= k) { for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { output[out_row_start + idx] = row[in_row_start + idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[in_row_start + idx]; + } } } else { for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { - output_idxs[out_row_start + idx] = idx; + output[out_row_start + idx] = idx; } #ifdef PADDLE_WITH_CUDA __syncwarp(); @@ -111,7 +116,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, #endif if (num < k) { atomicMax(reinterpret_cast( // NOLINT - output_idxs + out_row_start + num), + output + out_row_start + num), static_cast(idx)); // NOLINT } } @@ -120,8 +125,11 @@ __global__ void SampleKernel(const uint64_t rand_seed, #endif for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { - T perm_idx = output_idxs[out_row_start + idx] + in_row_start; + T perm_idx = output[out_row_start + idx] + in_row_start; 
output[out_row_start + idx] = row[perm_idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[perm_idx]; + } } } @@ -148,16 +156,17 @@ template void SampleNeighbors(const Context& dev_ctx, const T* row, const T* col_ptr, + const T* eids, const thrust::device_ptr input, thrust::device_ptr output, thrust::device_ptr output_count, + thrust::device_ptr output_eids, int sample_size, int bs, - int total_sample_num) { + int total_sample_num, + bool return_eids) { thrust::device_vector output_ptr; - thrust::device_vector output_idxs; output_ptr.resize(bs); - output_idxs.resize(total_sample_num); thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); @@ -176,18 +185,26 @@ void SampleNeighbors(const Context& dev_ctx, thrust::raw_pointer_cast(input), row, col_ptr, + eids, thrust::raw_pointer_cast(output), + thrust::raw_pointer_cast(output_eids), thrust::raw_pointer_cast(output_ptr.data()), - thrust::raw_pointer_cast(output_idxs.data())); + return_eids); } -template +template __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, int k, const int64_t num_rows, const T* in_rows, T* src, const T* dst_count) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); #ifdef PADDLE_WITH_HIP hiprandState rng; hiprand_init( @@ -197,20 +214,19 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, curand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #endif - CUDA_KERNEL_LOOP(out_row, num_rows) { + + while (out_row < last_row) { const T row = in_rows[out_row]; const T in_row_start = dst_count[row]; const int deg = dst_count[row + 1] - in_row_start; int split; - T tmp; - if (k < deg) { if (deg < 2 * k) { split = k; } else { split = deg - k; } - for (int idx = deg - 1; idx >= split; idx--) { + for (int idx = split + threadIdx.x; idx <= deg - 1; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -222,7 +238,11 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, static_cast( // NOLINT src[in_row_start + idx]))); } +#ifdef PADDLE_WITH_CUDA + __syncwarp(); +#endif } + out_row += BLOCK_WARPS; } } @@ -232,9 +252,12 @@ __global__ void GatherEdge(int k, const T* in_rows, const T* src, const T* dst_count, + const T* eids, T* outputs, + T* output_eids, int* output_ptr, - T* perm_data) { + T* perm_data, + bool return_eids) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -250,8 +273,10 @@ __global__ void GatherEdge(int k, if (deg <= k) { for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { - const T in_idx = in_row_start + idx; - outputs[out_row_start + idx] = src[in_idx]; + outputs[out_row_start + idx] = src[in_row_start + idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[in_row_start + idx]; + } } } else { int split = k; @@ -267,6 +292,10 @@ __global__ void GatherEdge(int k, for (int idx = begin + threadIdx.x; idx < end; idx += WARP_SIZE) { outputs[out_row_start + idx - begin] = src[perm_data[in_row_start + idx]]; + if (return_eids) { + output_eids[out_row_start + idx - begin] = + eids[perm_data[in_row_start + idx]]; + } } } out_row += BLOCK_WARPS; @@ -277,49 +306,48 @@ template void FisherYatesSampleNeighbors(const Context& dev_ctx, const T* row, const T* col_ptr, + const T* eids, T* perm_data, const thrust::device_ptr input, thrust::device_ptr output, 
thrust::device_ptr output_count, + thrust::device_ptr output_eids, int sample_size, int bs, - int total_sample_num) { + int total_sample_num, + bool return_eids) { thrust::device_vector output_ptr; output_ptr.resize(bs); thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int grid_tmp = (bs + block - 1) / block; - int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + constexpr int WARP_SIZE = 32; + constexpr int BLOCK_WARPS = 128 / WARP_SIZE; + constexpr int TILE_SIZE = BLOCK_WARPS * 16; + const dim3 block(WARP_SIZE, BLOCK_WARPS); + const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - FisherYatesSampleKernel<<>>( + FisherYatesSampleKernel<<>>( 0, sample_size, bs, thrust::raw_pointer_cast(input), perm_data, col_ptr); - constexpr int GATHER_WARP_SIZE = 32; - constexpr int GATHER_BLOCK_WARPS = 128 / GATHER_WARP_SIZE; - constexpr int GATHER_TILE_SIZE = GATHER_BLOCK_WARPS * 16; - const dim3 gather_block(GATHER_WARP_SIZE, GATHER_BLOCK_WARPS); - const dim3 gather_grid((bs + GATHER_TILE_SIZE - 1) / GATHER_TILE_SIZE); - - GatherEdge< - T, - GATHER_WARP_SIZE, - GATHER_BLOCK_WARPS, - GATHER_TILE_SIZE><<>>( + GatherEdge<<>>( sample_size, bs, thrust::raw_pointer_cast(input), row, col_ptr, + eids, thrust::raw_pointer_cast(output), + thrust::raw_pointer_cast(output_eids), thrust::raw_pointer_cast(output_ptr.data()), - perm_data); + perm_data, + return_eids); } template @@ -354,32 +382,78 @@ void GraphSampleNeighborsKernel( T* out_data = dev_ctx.template Alloc(out); thrust::device_ptr output(out_data); - if (!flag_perm_buffer) { - SampleNeighbors(dev_ctx, - row_data, - col_ptr_data, - input, - output, - output_count, - sample_size, - bs, - total_sample_size); + if (return_eids) { + auto* eids_data = eids.get_ptr()->data(); + out_eids->Resize({static_cast(total_sample_size)}); + T* out_eids_data = dev_ctx.template Alloc(out_eids); + thrust::device_ptr output_eids(out_eids_data); + if (!flag_perm_buffer) { + SampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + eids_data, + input, + output, + output_count, + output_eids, + sample_size, + bs, + total_sample_size, + return_eids); + } else { + DenseTensor perm_buffer_out(perm_buffer->type()); + const auto* p_perm_buffer = perm_buffer.get_ptr(); + perm_buffer_out.ShareDataWith(*p_perm_buffer); + T* perm_buffer_out_data = perm_buffer_out.template data(); + FisherYatesSampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + eids_data, + perm_buffer_out_data, + input, + output, + output_count, + output_eids, + sample_size, + bs, + total_sample_size, + return_eids); + } } else { - DenseTensor perm_buffer_out(perm_buffer->type()); - const auto* p_perm_buffer = perm_buffer.get_ptr(); - perm_buffer_out.ShareDataWith(*p_perm_buffer); - T* perm_buffer_out_data = - perm_buffer_out.mutable_data(dev_ctx.GetPlace()); - FisherYatesSampleNeighbors(dev_ctx, - row_data, - col_ptr_data, - perm_buffer_out_data, - input, - output, - output_count, - sample_size, - bs, - total_sample_size); + // How to set null value for output_eids(thrust::device_ptr)? + // We use `output` to fill the position of unused output_eids. 
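    // Passing `output` as the placeholder is safe: SampleKernel and GatherEdge
    // only write through output_eids inside their `if (return_eids)` branches,
    // so the pointer is never dereferenced when return_eids is false.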
+ if (!flag_perm_buffer) { + SampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + nullptr, + input, + output, + output_count, + output, + sample_size, + bs, + total_sample_size, + return_eids); + } else { + DenseTensor perm_buffer_out(perm_buffer->type()); + const auto* p_perm_buffer = perm_buffer.get_ptr(); + perm_buffer_out.ShareDataWith(*p_perm_buffer); + T* perm_buffer_out_data = perm_buffer_out.template data(); + FisherYatesSampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + nullptr, + perm_buffer_out_data, + input, + output, + output_count, + output, + sample_size, + bs, + total_sample_size, + return_eids); + } } } diff --git a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py index d2fbeab3fd42c..675a3429ab55f 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py +++ b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py @@ -162,14 +162,14 @@ def check_perm_buffer_error(): self.assertRaises(ValueError, check_perm_buffer_error) def test_sample_result_with_eids(self): - # Note: Currently return eid results is not initialized. paddle.disable_static() row = paddle.to_tensor(self.row) colptr = paddle.to_tensor(self.colptr) nodes = paddle.to_tensor(self.nodes) eids = paddle.to_tensor(self.edges_id) + perm_buffer = paddle.to_tensor(self.edges_id) - out_neighbors, out_count, _ = paddle.incubate.graph_sample_neighbors( + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( row, colptr, nodes, @@ -177,6 +177,16 @@ def test_sample_result_with_eids(self): sample_size=self.sample_size, return_eids=True) + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( + row, + colptr, + nodes, + eids=eids, + perm_buffer=perm_buffer, + sample_size=self.sample_size, + return_eids=True, + flag_perm_buffer=True) + paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): row = paddle.static.data( @@ -188,7 +198,7 @@ def test_sample_result_with_eids(self): eids = paddle.static.data( name="eids", shape=self.edges_id.shape, dtype=self.nodes.dtype) - out_neighbors, out_count, _ = paddle.incubate.graph_sample_neighbors( + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( row, colptr, nodes, @@ -202,7 +212,7 @@ def test_sample_result_with_eids(self): 'nodes': self.nodes, 'eids': self.edges_id }, - fetch_list=[out_neighbors, out_count]) + fetch_list=[out_neighbors, out_count, out_eids]) if __name__ == "__main__": From 8f469ddd569c34b9c73a35a4df5720fffff293db Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Mon, 18 Apr 2022 10:23:33 +0800 Subject: [PATCH 201/211] Add sparse kernel coalesced (#41784) --- .../kernels/funcs/sparse/flatten_indices.cu.h | 57 ++++++ .../kernels/funcs/sparse/flatten_indices.h | 93 +++++++++ paddle/phi/kernels/funcs/sparse/scatter.cu.h | 63 ++++++ paddle/phi/kernels/funcs/sparse/utils.cu.h | 31 +++ paddle/phi/kernels/sparse/coalesced_kernel.h | 30 +++ .../kernels/sparse/cpu/coalesced_kernel.cc | 121 +++++++++++ .../kernels/sparse/cpu/sparse_mask_kernel.cc | 10 +- .../kernels/sparse/gpu/coalesced_kernel.cu | 189 ++++++++++++++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 55 +---- .../sparse/gpu/convolution_grad_kernel.cu | 24 ++- .../kernels/sparse/gpu/convolution_kernel.cu | 22 +- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 56 ++---- .../phi/kernels/sparse/sparse_utils_kernel.h | 7 +- .../tests/unittests/test_sparse_utils_op.py | 184 ++++++++++++++--- 
python/paddle/sparse/creation.py | 67 ++++++- python/paddle/utils/code_gen/sparse_api.yaml | 1 + 16 files changed, 861 insertions(+), 149 deletions(-) create mode 100644 paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h create mode 100644 paddle/phi/kernels/funcs/sparse/flatten_indices.h create mode 100644 paddle/phi/kernels/funcs/sparse/scatter.cu.h create mode 100644 paddle/phi/kernels/funcs/sparse/utils.cu.h create mode 100644 paddle/phi/kernels/sparse/coalesced_kernel.h create mode 100644 paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h new file mode 100644 index 0000000000000..26b8549aaafdc --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); +} + +template +__global__ void IndexToCoordinateKernel(const IntT* indexs, + const Dim dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* indices) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + IndexToCoordinate(indexs, + dims, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + indices); +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h new file mode 100644 index 0000000000000..ca212e4366ec4 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +inline const IntT HOSTDEVICE CoordinateToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* out) { + for (int64_t i = start; i < non_zero_num; i += stride) { + out[i] = + CoordinateToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +// 1. indices.dims().size() == 2 +template +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, + IntT* offsets) { + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + offsets[i] = offset; + offset *= dims[i]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT index, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int indices_offset, + IntT* indices) { + IntT tmp_index = index; + for (int j = sparse_dim - 1; j >= 0; j--) { + indices[j * non_zero_num + indices_offset] = tmp_index % dims[j]; + tmp_index /= dims[j]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT* indexs, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* indices) { + for (int64_t i = start; i < non_zero_num; i += stride) { + IntT tmp_index = indexs[i]; + IndexToCoordinate(tmp_index, dims, non_zero_num, sparse_dim, i, indices); + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h new file mode 100644 index 0000000000000..9ed7cef12a148 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out, + const bool subm = false) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + if (subm) { + sum = out[indices_i * channels + channels_i]; + } + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/utils.cu.h b/paddle/phi/kernels/funcs/sparse/utils.cu.h new file mode 100644 index 0000000000000..074fe1ca42049 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/utils.cu.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, T* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesced_kernel.h new file mode 100644 index 0000000000000..0755579a57ade --- /dev/null +++ b/paddle/phi/kernels/sparse/coalesced_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc new file mode 100644 index 0000000000000..0ebddf9b683f0 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace sparse { + +template +void CoalescedCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; + + std::map> indices_to_index; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + IntT index = x_indexs[i]; + if (indices_to_index.find(index) == indices_to_index.end()) { + std::vector indexs; + indexs.push_back(i); + indices_to_index[index] = indexs; + } else { + indices_to_index[index].push_back(i); + } + } + + const int64_t out_nnz = indices_to_index.size(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + } + + IntT* out_indices_ptr = out_indices.data(); + T* out_values_ptr = out_values.data(); + auto iter = indices_to_index.begin(); + + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + for (int i = 0; iter != indices_to_index.end(); iter++, i++) { + phi::funcs::sparse::IndexToCoordinate( + iter->first, const_dims, out_nnz, sparse_dim, i, out_indices_ptr); + memcpy(out_values_ptr + i * stride, + x_values_ptr + iter->second[0] * stride, + stride * sizeof(T)); + for (uint64_t j = 1; j < iter->second.size(); j++) { + for (int k = 0; k < stride; k++) { + out_values_ptr[i * stride + k] += + x_values_ptr[iter->second[j] * stride + k]; + } + } + } + + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { + CoalescedCPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + CPU, + ALL_LAYOUT, + phi::sparse::CoalescedKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index c10a240c68430..1508de407caa7 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -20,7 +20,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +#include "paddle/phi/api/ext/dispatch.h" namespace phi { namespace sparse { @@ -56,10 +58,10 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, &sparse_offsets); + dims, sparse_dim, sparse_offsets.data()); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = phi::funcs::sparse::IndicesToIndex( + int64_t index = phi::funcs::sparse::CoordinateToIndex( indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } @@ -98,7 +100,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, &sparse_offsets); + x.dims(), sparse_dim, sparse_offsets.data()); phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), sparse_offsets.data(), diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu new file mode 100644 index 0000000000000..3ffcd28955a53 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -0,0 +1,189 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" + +namespace phi { +namespace sparse { + +template +void CoalescedGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t nnz = x.nnz(); + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + DenseTensorMeta sparse_offset_meta( + paddle::experimental::CppTypeToDataType::Type(), + {sparse_dim}, + DataLayout::NCHW); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + DenseTensor indexs = phi::Empty( + dev_ctx, DenseTensorMeta(x_indices.dtype(), {nnz}, x_indices.layout())); + IntT* indexs_ptr = indexs.data(); + + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 1. flatten indices + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + indexs.numel(), + sparse_dim, + indexs_ptr); + + // 2. get the address of each non-zero values + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + DenseTensor values_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {nnz}, DataLayout::NCHW)); + int* values_indexs_ptr = values_indexs.data(); + DenseTensor public_indexs = phi::EmptyLike(dev_ctx, values_indexs); + + // values_indexs = [0,1,2,,,nnz-1] + phi::IndexKernel>( + dev_ctx, &values_indexs, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &public_indexs, kps::IdentityFunctor()); + +// 3. sort (indices, values index) +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + values_indexs_ptr); + + // 4. 
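  //    (thrust::unique_by_key keeps one entry per flattened coordinate, and
  //     DistanceKernel then records how many unique coordinates remain, i.e.
  //     the coalesced nnz, so it can be copied back to the host)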
unique index + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + public_indexs.data()); + + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + indexs_ptr, new_end.first, out_indices.data()); + + IntT out_nnz = 0; + phi::backends::gpu::GpuMemcpyAsync(&out_nnz, + out_indices.data(), + sizeof(IntT), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + } + + // 5. scatter the values + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel<<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + + // 6. convert index to coordinate + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + phi::funcs::sparse::IndexToCoordinateKernel<<>>( + indexs_ptr, const_dims, out_nnz, sparse_dim, out_indices.data()); + + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { + CoalescedGPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + GPU, + ALL_LAYOUT, + phi::sparse::CoalescedKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 2396a5975de4e..fcbb3c60183eb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -60,46 +61,6 @@ __global__ void GatherKernel(const T* params, } } -/** - * brief: scatter add - * input: the inputs - * unique_value: refer to UpdateIndexKernel notes - * out_index: the output feature index - * non_zero_num: the number of output features - * rulebook_len: the length of rulebook - * channels: the output channel size - * out: the outputs -**/ -template -__global__ void ScatterKernel(const T* input, - const int* unique_value, - const int* out_index, - const int non_zero_num, - const int rulebook_len, - const int channels, - T* out, - const bool subm = false) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; - - int start = unique_value[indices_i]; - int end = indices_i == non_zero_num - 1 ? 
rulebook_len - : unique_value[indices_i + 1]; - // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } - for (int j = start; j < end; j++) { - const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; - } - out[indices_i * channels + channels_i] = sum; - } -} - template inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, const IntT* rulebook_ptr, @@ -186,14 +147,6 @@ __global__ void UpdateIndexKernel(const T* unique_keys, } } -// brief: calculation the distance between start and end -template -__global__ void DistanceKernel(const T* start, const T* end, T* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - template __global__ void UpdateOutIndexAndCounterAfterLowerBound( const IntT* x_indexs, @@ -402,7 +355,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); IntT rulebook_len = 0; phi::backends::gpu::GpuMemcpyAsync( @@ -468,7 +421,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr, rulebook_ptr + 3 * rulebook_len, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, bound_ptr); phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, bound_ptr, @@ -536,7 +489,7 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::distance doesn't support stream parameters // const int out_non_zero_num = thrust::distance(unique_key_ptr, // new_end.first); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( unique_key_ptr, new_end, rulebook_ptr + rulebook_rows * rulebook_cols - 1); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index ed9579fcd5b67..e54e39f5541d5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -222,17 +223,18 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - ScatterKernel<<>>(d_x_features_ptr, - unique_value.data(), - out_index.data(), - x.nnz(), - rulebook_len, - in_channels, - x_grad_values_ptr, - subm); + phi::funcs::sparse::ScatterKernel<<>>( + d_x_features_ptr, + unique_value.data(), + out_index.data(), + x.nnz(), + rulebook_len, + in_channels, + x_grad_values_ptr, + subm); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 93da65dc0f7d8..30f0482a0cc36 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -169,16 +170,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } else { config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels, 1); - ScatterKernel<<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + phi::funcs::sparse::ScatterKernel<<>>( + out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); } } /** diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index dff1cc2318f13..4e2d12f33955e 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { @@ -123,23 +123,6 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices -template -__global__ void FlattenIndicesKernel(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - IntT* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - phi::funcs::sparse::FlattenIndices(indices, - sparse_offsets, - non_zero_num, - sparse_dim, - tid, - gridDim.x * blockDim.x, - out); -} - template __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const IntT* mask_indexs, @@ -192,7 +175,8 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, IntT* bound_out_ptr = bound_out.data(); // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); // 2. copy sparse_offsets to device phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), @@ -207,25 +191,27 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, // 3. flatten x indices and mask indices auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - FlattenIndicesKernel<<>>(x.non_zero_indices().data(), - d_sparse_offsets.data(), - x_indexs.numel(), - sparse_dim, - x_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - FlattenIndicesKernel<<>>(mask_indices.data(), - d_sparse_offsets.data(), - mask_indexs.numel(), - sparse_dim, - mask_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); // 4. 
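  //    (with both index sets flattened to 1-D keys above, a single batched
  //     lower_bound can locate every mask index inside the x indices)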
call thrust::lower_bound #ifdef PADDLE_WITH_HIP thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 8cf9c0a28648a..072e6f141f8f1 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,9 +155,9 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - *out = - SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); - // TODO(zhangkaihuo): sort and merge the dumplicate indices + SparseCooTensor before_coalesced( + indices, values, phi::make_ddim(dense_shape.GetData())); + CoalescedKernel(dev_ctx, before_coalesced, out); } } // namespace sparse diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 89cfc711910ce..c87626a10c631 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -19,6 +19,8 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard +devices = ['cpu', 'gpu'] + class TestSparseCreate(unittest.TestCase): def test_create_coo_by_tensor(self): @@ -30,6 +32,8 @@ def test_create_coo_by_tensor(self): dense_elements = paddle.to_tensor(values, dtype='float32') coo = paddle.sparse.sparse_coo_tensor( dense_indices, dense_elements, dense_shape, stop_gradient=False) + # test the to_string.py + print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -37,7 +41,7 @@ def test_create_coo_by_np(self): with _test_eager_guard(): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] - dense_shape = [2, 3] + dense_shape = [3, 3] coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -67,6 +71,8 @@ def test_create_csr_by_np(self): dense_shape = [3, 4] csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # test the to_string.py + print(csr) assert np.array_equal(crows, csr.crows().numpy()) assert np.array_equal(cols, csr.cols().numpy()) assert np.array_equal(values, csr.values().numpy()) @@ -205,38 +211,154 @@ def test_coo_values_grad(self): def test_sparse_coo_tensor_grad(self): with _test_eager_guard(): - indices = [[0, 1], [0, 1]] - values = [1, 2] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor( - values, dtype='float32', stop_gradient=False) - sparse_x = paddle.sparse.sparse_coo_tensor( - indices, values, shape=[2, 2], stop_gradient=False) - grad_indices = [[0, 1], [1, 1]] - grad_values = [2, 3] - grad_indices = paddle.to_tensor(grad_indices, dtype='int32') - grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2]) - sparse_x.backward(sparse_out_grad) - correct_values_grad = [0, 3] - assert np.array_equal(correct_values_grad, values.grad.numpy()) + for device in devices: + if device == 'cpu' or 
(device == 'gpu' and + paddle.is_compiled_with_cuda()): + paddle.device.set_device(device) + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + grad_indices = [[0, 1], [1, 1]] + grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + sparse_x.backward(sparse_out_grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, + values.grad.numpy()) - place = core.CPUPlace() - indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) - values_cpu = paddle.to_tensor( - values, dtype='float32', place=place, stop_gradient=False) - sparse_x_cpu = paddle.sparse.sparse_coo_tensor( - indices_cpu, - values_cpu, - shape=[2, 2], - place=place, - stop_gradient=False) + def test_sparse_coo_tensor_sorted(self): + with _test_eager_guard(): + for device in devices: + if device == 'cpu' or (device == 'gpu' and + paddle.is_compiled_with_cuda()): + paddle.device.set_device(device) + #test unsorted and duplicate indices + indices = [[1, 0, 0], [0, 1, 1]] + values = [1.0, 2.0, 3.0] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + indices_sorted = [[0, 1], [1, 0]] + values_sorted = [5.0, 1.0] + assert np.array_equal(indices_sorted, + sparse_x.indices().numpy()) + assert np.array_equal(values_sorted, + sparse_x.values().numpy()) + + +class TestCooError(unittest.TestCase): + def test_small_shape(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[2, 3], [0, 2]] + values = [1, 2] + # 1. the shape too small + dense_shape = [2, 2] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=dense_shape) + + def test_same_nnz(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + # 2. 
test the nnz of indices must same as nnz of values + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + def test_same_dimensions(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + shape = [2, 3, 4] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=shape) + + def test_indices_dtype(self): + with _test_eager_guard(): + with self.assertRaises(TypeError): + indices = [[1.0, 2.0], [0, 1]] + values = [1, 2] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + +class TestCsrError(unittest.TestCase): + def test_dimension1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_dimension2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3, 3, 3, 3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) - sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2], place=place) - sparse_x_cpu.backward(sparse_out_grad_cpu) - assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) + def test_same_shape2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3, 4] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape3(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3, 0, 1, 2] + cols = [0, 1, 2, 3, 0, 1, 2] + values = [1, 2, 3, 4, 0, 1, 2] + shape = [2, 3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_crows_first_value(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [1, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_dtype(self): + with _test_eager_guard(): + with self.assertRaises(TypeError): + crows = [0, 1, 2, 3.0] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) if __name__ == "__main__": diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index ac9276f3142c0..d494336e1ff50 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
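# A small illustration of the behavior exercised by test_sparse_coo_tensor_sorted
# above: with the coalescing kernel wired into tensor creation, duplicate
# coordinates are merged and their values summed, e.g.
#   paddle.sparse.sparse_coo_tensor([[1, 0, 0], [0, 1, 1]], [1.0, 2.0, 3.0])
#   # indices -> [[0, 1], [1, 0]], values -> [5.0, 1.0]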
+import paddle from paddle import _C_ops from ..framework import core, dygraph_only from ..framework import _current_expected_place, _get_paddle_place @@ -51,6 +52,13 @@ def _get_place(place): return place +def _check_indices_dtype(dtype): + if dtype not in [paddle.int8, paddle.int16, paddle.int32, paddle.int64]: + raise TypeError( + "the dtype of indices must be 'int8' or 'int16' or 'int32' or 'int64'" + ) + + @dygraph_only def sparse_coo_tensor(indices, values, @@ -117,6 +125,18 @@ def sparse_coo_tensor(indices, if len(indices.shape) != 2: raise ValueError("'indices' must be 2-D.") + nnz = indices.shape[1] + sparse_dim = indices.shape[0] + + _check_indices_dtype(indices.dtype) + + if nnz != values.shape[0]: + raise ValueError( + "the indices and values must have same number of non-zero, but get {} and {}". + format(nnz, values.shape[0])) + + dense_dim = len(values.shape) - 1 + if not indices.place._equals(place): indices = indices._copy_to(place, False) @@ -125,8 +145,17 @@ def sparse_coo_tensor(indices, values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient + min_shape = _infer_dense_shape(indices) if shape is None: - shape = _infer_dense_shape(indices) + shape = min_shape + else: + if shape < min_shape: + raise ValueError("the minimun shape required is {}, but get {}". + format(min_shape, shape)) + if len(shape) != sparse_dim + dense_dim: + raise ValueError( + "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}". + format(sparse_dim, dense_dim, len(shape))) return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, shape) @@ -144,6 +173,7 @@ def sparse_csr_tensor(crows, r""" Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the ``crows``, ``cols`` and ``values``. + Currently, the crows and cols of each batch must be incrementd. Args: crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the @@ -202,10 +232,14 @@ def sparse_csr_tensor(crows, cols = to_tensor(cols, dtype=None, place=place, stop_gradient=True) if not isinstance(values, core.eager.Tensor): values = to_tensor(values, dtype, place, stop_gradient) - if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1: + + _check_indices_dtype(crows.dtype) + _check_indices_dtype(cols.dtype) + + if len(shape) != 2 and len(shape) != 3: raise ValueError( - "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D." - ) + "SparseCsrTensor only support 2-D or 3-D matrix. but get shape {}". + format(shape)) if not crows.place._equals(place): crows = crows._copy_to(place, False) @@ -217,5 +251,30 @@ def sparse_csr_tensor(crows, values = values._copy_to(place, False) values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient + + if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1: + raise ValueError("The 'crows', 'cols' and 'values' must be 1-D.") + + if (len(cols) != len(values)): + raise ValueError("the length of cols must be same as length of values") + + if len(shape) == 2: + if crows.shape[0] != shape[0] + 1: + raise ValueError( + "The length({}) of crows must be equal to the rows({})+1 of matrix.". 
+ format(crows.shape[0], shape[0])) + if crows[0] != 0: + raise ValueError("the 0th value of crows must be 0") + + if crows[-1] != values.shape[0]: + raise ValueError( + "the last value of crows must be equal the number of non-zero") + else: + if crows.shape[0] % (shape[0] + 1) != 0: + raise ValueError( + "The length({}) of crows must be divisible the rows({})+1 of matrix.". + format(crows.shape[0], shape[0])) + # TODO(zkh2016): check whether the value in crows and cols is legal + return core.eager.sparse_csr_tensor(crows, cols, values, shape, stop_gradient) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 2187d4abb2d63..100d7ad78319b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -27,6 +27,7 @@ kernel : func : sparse_coo_tensor layout : values + data_type : values backward : create_sparse_coo_tensor_grad - api : csr_values From 2d4fe1637bdab18b203e2274c9fafdc2e689cd48 Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Mon, 18 Apr 2022 10:24:58 +0800 Subject: [PATCH 202/211] cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) * optimize preparation overhead before executing cinn compiled program * update code notes * fix flag annotation * add a flag of auto-tune feature beforehand --- .../framework/paddle2cinn/cinn_compiler.cc | 34 ++++++++ .../framework/paddle2cinn/cinn_compiler.h | 7 ++ paddle/fluid/operators/cinn/CMakeLists.txt | 2 +- .../operators/cinn/cinn_launch_context.cc | 25 +++--- paddle/fluid/operators/cinn/cinn_launch_op.h | 40 ++++------ .../operators/cinn/cinn_launch_op_test.cc | 77 ++++++++++++------- paddle/fluid/platform/flags.cc | 26 +++++++ 7 files changed, 151 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 6cde65f6ab580..83a5b6f82136d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -31,11 +31,13 @@ #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/pass.h" #include "cinn/hlir/pass/use_pass.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" @@ -45,6 +47,7 @@ #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +DECLARE_bool(enable_pe_launch_cinn); namespace paddle { namespace framework { namespace paddle2cinn { @@ -217,6 +220,33 @@ void CinnCompiler::Clear() { real_compiled_num_.store(0); } +void CinnCompiler::CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const { + const auto& input_var_names = graph.Get>(kInputVars); + const auto& output_var_names = + graph.Get>(kOutputVars); + auto* launch_context = compiled_obj.launch_context.get(); + // 1. 
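  // (this input/output consistency check previously ran inside
  //  CinnLaunchOpKernel::Compute; performing it once here, right after
  //  compilation, gives both the PE launch path and the sequential
  //  FLAGS_enable_pe_launch_cinn=false path the same validation)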
check all of the output variables will be assigned by compiled program + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + // 2. check all of the used input variables were correctly deduced by CINN. + for (const auto& var_name : input_var_names) { + // some input variables were not used by CINN because they were eliminated + // by its optimized passes or some operators of it need less inputs + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; + } + launch_context->CheckTensorEquivalent(var_name, + *input_tensors.at(var_name)); + } +} + std::unique_ptr CinnCompiler::CompileGraph( const ir::Graph& graph, const std::map& input_tensors, @@ -244,6 +274,9 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + if (!FLAGS_enable_pe_launch_cinn) { + options.with_buffer_handle_instruction_inserted = true; + } auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); @@ -254,6 +287,7 @@ std::unique_ptr CinnCompiler::CompileGraph( compiled_obj->launch_context = std::make_unique(graph, *compiled_obj); + CheckCompiledValid(graph, input_tensors, *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5fa54b302a36d..cf17e68156b3a 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -103,6 +103,13 @@ class CinnCompiler { const ::cinn::common::Target& target, std::int64_t compiled_num, void* stream = nullptr) const; + // check whether a compiled result is valid by comparing + // the consistency of external variables of the subgraph + void CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const; + std::unordered_map> graphs_; std::unordered_map cache_by_address_; diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 2406445e6cfa4..862a0d04fbdfe 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -3,7 +3,7 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn) -SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) +SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index b445527322fd6..a660d59fb4c0f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/paddle2cinn/transform_type.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include 
"paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, graph.Get>(framework::paddle2cinn::kOutputVars); internal_var_names_ = ExtractInternalVarNames(input_var_names, output_var_names); - // check completeness of output variables in compiled result - for (auto&& var_name : output_var_names) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::PreconditionNotMet( - "Variable(%s) not applied in CINN", var_name)); - } - // initialize all execution arguments InitializeArguments(); // DEPRECATED(CtfGo): following callback assignment will be deprecated soon @@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() { cinn_tensor->shape().data().size()); cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type()); VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),type(%s)", + "Append an argument:name(%s),dims(%s),type(%s)", arg, framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), cinn_tensor->type()); name2argument_.emplace(arg, cinn_buffer.get()); @@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); - parallel_executor_->PrepareVariables(scope); + // instead of using the PrepareVariables function of ParallelExecutor to + // initialize all variables, here we only initialize internal variables + // because external variables are already included in parent scope. + for (auto&& var_name : internal_var_names_) { + auto* var = scope->FindVar(var_name); + if (var != nullptr) { + VLOG(5) << "internal variable:" << var_name + << " has been initialized beforehand in global scope, skipped."; + continue; + } + framework::InitializeVariable(scope->Var(var_name), + framework::proto::VarType::LOD_TENSOR); + } + for (auto&& var_name : initialized_beforehand_vars_) { auto* var = scope->GetVar(var_name); auto* buffer = GetCinnBufferOfVar(var_name); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 5263aae03ed3f..024bf2bceb3d0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -18,7 +18,9 @@ #include #include #include + #include "cinn/common/target.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -26,6 +28,7 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +DECLARE_bool(enable_pe_launch_cinn); namespace paddle { namespace operators { @@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel { const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. 
check the computational consistency of the subgraph - // before and after the compilation - // 3.1 Input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - launch_context->CheckTensorEquivalent(var_name, - *inputs_name2tensor.at(var_name)); - } - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. + // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. use PE to execute the compiled CINN instructions - // in nodes of the runtime graph - VLOG(4) << "Execute the runtime graph by PE"; - framework::Scope& exec_scope = scope.NewScope(); - auto* pe = launch_context->InitializePE(place, &exec_scope); - pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + // Step 4. Execute the compiled CINN instructions by a PE or + // by the CINN compiled program in sequential order + if (FLAGS_enable_pe_launch_cinn) { + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + } else { + VLOG(4) << "Execute the compiled executable program"; + launch_context->UpdateCapturedEnv(scope, place); + LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + } VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 585f1caabed05..3e363c56eb93e 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -32,6 +32,7 @@ USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); DECLARE_double(eager_delete_tensor_gb); +DECLARE_bool(enable_pe_launch_cinn); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA @@ -42,43 +43,67 @@ namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; -TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { - paddle::framework::InitDevices(); - platform::SetNumThreads(1); - // cache test graph into CinnCompiler - const std::string& test_op_out_name = "cinn_launch_op_out"; - const std::string& add_op_out_name = "add_op_out"; - auto compilation_key = CinnCompiler::GetInstance()->AddGraph( - CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); - - // create cinn_launch_op and elementwise_add op - auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp( - "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"compilation_key", compilation_key}}); - auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( - "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, - {{"Out", {add_op_out_name}}}, {{}}); - - // Run ops and check the computation results - auto run_and_check_fn = [&](const platform::Place& place) { +class TestCinnLaunchOp : public ::testing::Test { + public: + const char* test_op_out_name = "add_op_out"; + const char* add_op_out_name = "add_op_out"; + std::unique_ptr cinn_launch_op; + std::unique_ptr elementwise_add_op; + + void 
SetUp() override { + paddle::framework::InitDevices(); + platform::SetNumThreads(1); + // cache test graph into CinnCompiler + auto compilation_key = CinnCompiler::GetInstance()->AddGraph( + CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); + + // create cinn_launch_op and elementwise_add op + cinn_launch_op = paddle::framework::OpRegistry::CreateOp( + "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, + {{"compilation_key", compilation_key}}); + elementwise_add_op = paddle::framework::OpRegistry::CreateOp( + "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, + {{"Out", {add_op_out_name}}}, {{}}); + } + + void RunAndCheck(const platform::Place& place) { + // Run ops and check the computation results framework::Scope scope; InitVariablesWithRandomValue({"x", "y"}, {10, 20}, place, &scope); scope.Var(test_op_out_name)->GetMutable(); scope.Var(add_op_out_name)->GetMutable(); - cinn_launch_op->Run(scope, place); elementwise_add_op->Run(scope, place); + cinn_launch_op->Run(scope, place); CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); - }; - FLAGS_eager_delete_tensor_gb = -1; + } + + void TearDown() override { CinnCompiler::GetInstance()->Clear(); } +}; +TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { // CPU - run_and_check_fn(platform::CPUPlace()); - run_and_check_fn(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); + // the second run on the same place is to check the cache logic + RunAndCheck(platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + // GPU + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); +#endif +} + +TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; + + RunAndCheck(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA // GPU - run_and_check_fn(platform::CUDAPlace()); - run_and_check_fn(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); #endif } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index a43eaa41cfe83..f89452853b49b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "", */ PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "", "It controls the cinn op subset to be not used."); + +/* + * CINN related FLAG + * Name: FLAGS_enable_pe_launch_cinn + * Since Version: 2.3 + * Value Range: bool, default=true + * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled + * instructions of a paddle graph with ParallelExecutor, otherwise with the + * CINN compiled runtime program in sequential order. 
+ */ +PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true, + "It controls whether to execute cinn compiled " + "program with ParallelExecutor"); + +/* + * CINN related FLAG + * Name: FLAGS_enable_cinn_auto_tune + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its + * auto-tune feature enabled + */ +PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false, + "It controls whether to use cinn with " + "its auto-tune feature enabled"); + #endif DEFINE_int32(record_pool_max_size, 2000000, From 5a103150e419a7acdcdb154ae4c76cce2cc9123d Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 18 Apr 2022 10:27:48 +0800 Subject: [PATCH 203/211] [Eager] Add _fallback_legacy_dygraph for npu/xpu/rocm (#41774) * [Eager] add _fallback_legacy_dygraph for npu/xpu/rocm * fix import --- python/paddle/fluid/framework.py | 43 ++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a329610eeae83..5dab39a35d478 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -166,6 +166,40 @@ def _in_eager_without_dygraph_check(): return _in_eager_mode_ +# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but +# only GPU/CPU. Remove this after we improve this feature. +_is_first_import_ = True + + +def _fallback_legacy_dygraph(): + global _in_eager_mode_ + global _is_first_import_ + need_fallback = False + # Only enable eager on CPU/GPU + is_not_support = core.is_compiled_with_xpu() or core.is_compiled_with_npu( + ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu( + ) or core.is_compiled_with_rocm() + + if _in_eager_mode_ and is_not_support: + # switch into legacy dygraph mode + warnings.warn( + "We will fallback into legacy dygraph on NPU/XPU/MLU/IPU/ROCM devices. Because we only support new eager dygraph mode on CPU/GPU currently. " + ) + _in_eager_mode_ = False + if not _is_first_import_: + _enable_legacy_dygraph() + need_fallback = True + + need_fallback = False + _is_first_import_ = False + + return need_fallback + + +# switch into legacy mode if need while import paddle +_fallback_legacy_dygraph() + + def in_dygraph_mode(): """ @@ -206,11 +240,16 @@ def _non_static_mode(): @signature_safe_contextmanager def _test_eager_guard(place=None): - _disable_legacy_dygraph() + # FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but + # only GPU/CPU. Remove this after we improve this feature. + already_fallback = _fallback_legacy_dygraph() + if not already_fallback: + _disable_legacy_dygraph() try: yield finally: - _enable_legacy_dygraph() + if not already_fallback: + _enable_legacy_dygraph() global_ipu_index = None From 9f9e591dfd7809e22824c4c6c99c4209d0f2728d Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 18 Apr 2022 10:51:18 +0800 Subject: [PATCH 204/211] remove fluid memory pool (#41862) --- .../infrt/kernel/phi/dense_tensor_kernels.cc | 34 +++---- paddle/infrt/paddle/model_parser.cc | 95 +++++++++++++++++++ paddle/infrt/paddle/model_parser.h | 11 +++ 3 files changed, 123 insertions(+), 17 deletions(-) diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 7ffc8de151075..a9b18c769dca8 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,9 @@ // limitations under the License. 
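
Note on the two flags defined in flags.cc above: they are exported gflags, so besides command-line use they can normally be toggled from Python. A minimal sketch, assuming a Paddle build with CINN enabled; only the flag names come from this patch, the rest is illustrative:

import os

# 1) via environment variables, picked up when the paddle module initializes
os.environ["FLAGS_enable_pe_launch_cinn"] = "false"  # run the CINN compiled program sequentially
os.environ["FLAGS_enable_cinn_auto_tune"] = "true"   # opt in to the auto-tune feature

import paddle

# 2) at runtime, through the public flag API (available for exported flags)
paddle.set_flags({"FLAGS_enable_pe_launch_cinn": False})
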
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include #include "llvm/Support/ErrorHandling.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" @@ -22,24 +24,13 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" #include "paddle/phi/core/dense_tensor.h" #ifdef INFRT_WITH_GPU #include #endif -namespace paddle { -namespace platform { -using DeviceContext = ::phi::DeviceContext; -} // namespace platform -namespace framework { -using LoDTensor = ::phi::DenseTensor; -void DeserializeFromStream(std::istream& is, - LoDTensor* tensor, - const platform::DeviceContext& dev_ctx); -} -} // namespace paddle - namespace infrt { namespace kernel { namespace phi { @@ -198,6 +189,12 @@ ::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) { auto pb_proto_prog = paddle::LoadProgram(model_path); auto main_block = pb_proto_prog->blocks(0); + ::phi::CPUContext ctx; + auto allocator = std::make_unique(); + const auto* allocator_ptr = allocator.get(); + ctx.SetAllocator(allocator_ptr); + ctx.SetHostAllocator(allocator_ptr); + ctx.SetZeroAllocator(allocator_ptr); for (auto& var : main_block.vars()) { if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) continue; @@ -207,9 +204,7 @@ ::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) { case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: { std::unique_ptr<::phi::DenseTensor> tensor{ std::make_unique<::phi::DenseTensor>()}; - ::phi::CPUContext ctx; - ::paddle::framework::DeserializeFromStream( - param_file, tensor.get(), ctx); + ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var.name(), std::move(tensor)); } break; default: { @@ -249,11 +244,16 @@ ::infrt::phi::DenseTensorMap LoadCombinedParameters( } } + ::phi::CPUContext ctx; + auto allocator = std::make_unique(); + const auto* allocator_ptr = allocator.get(); + ctx.SetAllocator(allocator_ptr); + ctx.SetHostAllocator(allocator_ptr); + ctx.SetZeroAllocator(allocator_ptr); for (auto& var : tmp) { std::unique_ptr<::phi::DenseTensor> tensor{ std::make_unique<::phi::DenseTensor>()}; - ::phi::CPUContext ctx; - ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx); + ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var, std::move(tensor)); } diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index f3de1a630451c..da4f8b6420b22 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,6 +22,10 @@ #include "paddle/infrt/common/target.h" #include "paddle/infrt/common/type.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/common/data_type.h" +#endif + namespace infrt { namespace paddle { @@ -170,5 +174,96 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } +#ifdef INFRT_WITH_PHI +namespace framework_proto = ::paddle::framework::proto; + +inline ::phi::DataType PhiDataType(framework_proto::VarType::Type type) { + using Type = framework_proto::VarType::Type; + switch (static_cast(type)) { + case Type::VarType_Type_BOOL: + return ::phi::DataType::BOOL; + case Type::VarType_Type_INT8: + return ::phi::DataType::INT8; + case Type::VarType_Type_UINT8: + 
return ::phi::DataType::UINT8; + case Type::VarType_Type_INT16: + return ::phi::DataType::INT16; + case Type::VarType_Type_INT32: + return ::phi::DataType::INT32; + case Type::VarType_Type_INT64: + return ::phi::DataType::INT64; + case Type::VarType_Type_SIZE_T: + return ::phi::DataType::UINT64; + case Type::VarType_Type_FP16: + return ::phi::DataType::FLOAT16; + case Type::VarType_Type_FP32: + return ::phi::DataType::FLOAT32; + case Type::VarType_Type_FP64: + return ::phi::DataType::FLOAT64; + default: + LOG(FATAL) << "unknown data type " << type; + } + return ::phi::DataType::UNDEFINED; +} + +inline void TensorFromStream(std::istream &is, + ::phi::DenseTensor *tensor, + const ::phi::CPUContext &ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U); + framework_proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size = -1; + is.read(reinterpret_cast(&size), sizeof(size)); + CHECK_EQ(is.good(), true); + CHECK_GE(size, 0); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + CHECK_EQ(desc.ParseFromArray(buf.get(), size), true); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(::phi::make_ddim(dims)); + void *buf; + size_t size = tensor->numel() * SizeOfType(desc.data_type()); + ctx.HostAlloc(tensor, PhiDataType(desc.data_type()), size); + buf = tensor->data(); + is.read(static_cast(buf), size); + } +} + +void DeserializeFromStream(std::istream &is, + ::phi::DenseTensor *tensor, + const ::phi::CPUContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + TensorFromStream(is, tensor, dev_ctx); +} +#endif + } // namespace paddle } // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 373f77033dcef..5f039ad5d3ad8 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,6 +25,11 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace paddle { namespace framework_proto = ::paddle::framework::proto; @@ -53,5 +58,11 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); +#ifdef INFRT_WITH_PHI +void DeserializeFromStream(std::istream& is, + ::phi::DenseTensor* tensor, + const ::phi::CPUContext& dev_ctx); +#endif + } // namespace paddle } // namespace infrt From 7ee9ba2f29f78ab669ee35f9d32e2c967e67c849 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Mon, 18 Apr 2022 10:59:24 +0800 Subject: [PATCH 205/211] fix_poo2d_trt_convert (#41860) --- 
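
For reference, the stream layout consumed by the DeserializeFromStream/TensorFromStream routines added to model_parser.cc above can be summarized with a small reader. This is an illustrative sketch only (the helper name is made up); it assumes a little-endian platform with 8-byte size_t and stops before the TensorDesc protobuf and the raw data buffer:

import struct

def read_lod_tensor_header(f):
    # LoDTensor-level version, expected to be 0
    (version,) = struct.unpack("<I", f.read(4))
    assert version == 0
    # LoD: number of levels, then per level a byte size followed by size_t offsets
    (lod_level,) = struct.unpack("<Q", f.read(8))
    lod = []
    for _ in range(lod_level):
        (nbytes,) = struct.unpack("<Q", f.read(8))
        lod.append(struct.unpack("<%dQ" % (nbytes // 8), f.read(nbytes)))
    # Tensor-level version, expected to be 0, then the serialized TensorDesc proto
    (tensor_version,) = struct.unpack("<I", f.read(4))
    assert tensor_version == 0
    (desc_size,) = struct.unpack("<i", f.read(4))
    desc_bytes = f.read(desc_size)  # VarType.TensorDesc bytes, not parsed here
    return lod, desc_bytes
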
.../inference/tensorrt/convert/pool2d_op.cc | 58 ++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 7824a0f1e29f4..29acf549cbbc3 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -256,19 +256,51 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { - std::vector input_shape_v; - for (int i = 0; i < input_dims; i++) { - input_shape_v.push_back(input_shape.d[i]); + if (nv_ksize.d[0] % nv_strides.d[0] == 0 && + nv_ksize.d[1] % nv_strides.d[1] == 0) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the + // input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, + &post_pad, input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + pre_pad, post_pad); + + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; + } else { + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); + auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool plugin layer in converter could not be created.")); + layer = pool_layer; } - plugin::PoolPlugin *plugin = new plugin::PoolPlugin( - ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, - paddings, input_shape_v, real_paddings); - auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); - PADDLE_ENFORCE_NOT_NULL( - pool_layer, - platform::errors::Fatal( - "trt pool plugin layer in converter could not be created.")); - layer = pool_layer; } else { #if IS_TRT_VERSION_GE(8000) // Exclude padding pixels from the average mean is not supported well by @@ -299,7 +331,6 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setAverageCountExcludesPadding(exclusive); layer = pool_layer; } - } else { // Average pooling needs to exclude the padding pixels from the average // mean. 
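
The ceil_mode branch of the pool2d converter above pads the input so that a plain floor-mode pooling layer reproduces the ceil-mode output size. The relationship it relies on can be checked with a few lines of arithmetic; this sketch is illustrative and not taken from DealCeilMode itself:

import math

def ceil_mode_extra_pad(in_size, ksize, stride, pad):
    floor_out = (in_size + 2 * pad - ksize) // stride + 1
    ceil_out = math.ceil((in_size + 2 * pad - ksize) / stride) + 1
    # pixels to append so that floor-mode pooling still yields ceil_out outputs
    extra = max(0, (ceil_out - 1) * stride + ksize - (in_size + 2 * pad))
    return floor_out, ceil_out, extra

print(ceil_mode_extra_pad(in_size=7, ksize=2, stride=2, pad=0))  # (3, 4, 1)
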
@@ -327,5 +358,4 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); From e118edd3c0f1570d61156ea7a17cbfd9c7545bef Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Mon, 18 Apr 2022 11:16:22 +0800 Subject: [PATCH 206/211] [NPU] fix conv2d and top_k_v2 fp16 (#41409) [NPU] fix conv2d and top_k_v2 fp16 --- paddle/fluid/operators/conv_op_npu.cc | 52 ++++++++++++++++++++--- paddle/fluid/operators/top_k_v2_op_npu.cc | 2 + 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86a6ec2c3a160..3ace825e7b80d 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -20,6 +20,29 @@ namespace operators { using Tensor = framework::Tensor; using NPUDeviceContext = platform::NPUDeviceContext; +static void CastToFP16(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_FLOAT16) + .Run(stream); +} + +static void CastToFP32(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_FLOAT) + .Run(stream); +} template class DepthwiseConvNPUKernel : public framework::OpKernel { @@ -356,18 +379,33 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); + Tensor filter_grad_fp32(experimental::DataType::FLOAT32); + filter_grad_fp32.Resize(filter_grad->dims()); + + if (framework::TransToProtoVarType(input->dtype()) == + framework::proto::VarType::FP16) { + CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); + } else { + filter_grad_fp32.ShareDataWith(*filter_grad); + } + const auto& runner = NpuOpRunner( "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, - {*filter_grad}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + {filter_grad_fp32}, {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); + + if (framework::TransToProtoVarType(input->dtype()) == + framework::proto::VarType::FP16) { + CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); + } } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index dff5c2d3f3937..04e4d88b008e0 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -89,7 +89,9 @@ class TopkV2NPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(top_k_v2, ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, ops::TopkV2NPUKernel, ops::TopkV2NPUKernel, 
ops::TopkV2NPUKernel); From ceef73c9b5876311e947505c9d04b5cb51cd73f0 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Mon, 18 Apr 2022 11:32:08 +0800 Subject: [PATCH 207/211] [Auto parallel] Transformer MHA & FFN Fused Dist op (#41163) * adapot dist op * [Auto Parallel] Support the auto completion of while_op * add dist_fill_constant_batch_size_like * align infer accuracy --- .../distributed/auto_parallel/dist_loader.py | 18 +- .../distributed/auto_parallel/engine.py | 18 +- .../auto_parallel/operators/__init__.py | 2 + .../operators/dist_fused_attention.py | 211 ++++++++++++++++++ .../operators/dist_fused_feedforward.py | 203 +++++++++++++++++ .../distributed/passes/auto_parallel_amp.py | 1 + .../distributed/passes/auto_parallel_fp16.py | 21 +- 7 files changed, 460 insertions(+), 14 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 187c7cc02855f..9449b52952cd8 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -97,15 +97,19 @@ def data_generator(): if not isinstance(data, list): data = to_list(data) - if batch_data is None: - batch_data = [[] for i in range(len(data))] + if self.batch_size == 1: + yield data + batch_data = None + else: + if batch_data is None: + batch_data = [[] for i in range(len(data))] - for idx in range(len(data)): - batch_data[idx].append(data[idx]) + for idx in range(len(data)): + batch_data[idx].append(data[idx]) - if (step + 1) % self.batch_size == 0: - yield batch_data - batch_data = None + if (step + 1) % self.batch_size == 0: + yield batch_data + batch_data = None dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=self.feed_list, capacity=70, iterable=False) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index c71ca9b7c6af9..a5fec789dfb37 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -194,6 +194,9 @@ def _parallel_program(self, mode, rank): self._apply_post_optimization(dist_main_prog, dist_startup_prog, rank, dist_params_grads) else: + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, None, None) # Do logical partition partitioner = Partitioner(dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( @@ -231,15 +234,24 @@ def _generate_optimizer(self, main_program, startup_program, params_grads): def _apply_pre_optimization(self, main_program, startup_program, loss, params_grads): + # apply amp pass if self.strategy.amp: config = copy.deepcopy(self.strategy.amp_configs) config["dist_context"] = self._dist_contexts[self.mode] config["params_grads"] = params_grads config["loss"] = loss - auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_contexts[self.mode]) + config["input_data"] = self._feed_vars[self.mode][ + "inputs"] + self._feed_vars[self.mode]["labels"] + if config["use_pure_fp16"]: + config["base_opt"] = self._optimizer + auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], 
self._pass_context) + else: + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_context) # apply recompute pass if self.strategy.recompute: diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index db6f909f8ca7d..3c229746573da 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -28,3 +28,5 @@ from . import dist_update_loss_scaling from . import dist_split from . import dist_fill_constant_batch_size_like +from . import dist_fused_feedforward +from . import dist_fused_attention diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py new file mode 100644 index 0000000000000..bc3992ec03d4b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard, is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 +from ..utils import _get_comm_group, _get_corresponding_rank +from ..process_group import new_process_group + + +class DistributedFusedAttention(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFusedAttention, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFusedAttention("fused_attention")) + + +class DistributedFusedAttentionImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFusedAttentionImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + qkv_w = op_desc.input('QKVW')[0] + qkv_bias = op_desc.input('QKVBias')[0] + out_w = op_desc.input('OutLinearW')[0] + out_bias = op_desc.input('OutLinearBias')[0] + + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + qkv_w_dims_mapping = op_dist_attr.get_input_dims_mapping(qkv_w) + qkv_bias_dims_mapping = op_dist_attr.get_input_dims_mapping(qkv_bias) + out_w_dims_mapping = op_dist_attr.get_input_dims_mapping(out_w) + out_bias_dims_mapping = op_dist_attr.get_input_dims_mapping(out_bias) + + head_axis = 1 + for mapping in x_dims_mapping[1:-1]: + if 
is_dim_shard(mapping): + return False + if len(qkv_w_dims_mapping) != 4 or is_dim_replicate(qkv_w_dims_mapping[ + head_axis]): + return False + if len(qkv_bias_dims_mapping) != 3 or is_dim_replicate( + qkv_bias_dims_mapping[head_axis]): + return False + if is_dim_replicate(out_w_dims_mapping[0]): + return False + if is_dim_shard(out_bias_dims_mapping[-1]): + return False + + replicated_dims = [ + qkv_w_dims_mapping[0], qkv_w_dims_mapping[-2], + qkv_w_dims_mapping[-1], qkv_bias_dims_mapping[0], + qkv_bias_dims_mapping[-1], out_w_dims_mapping[-1], + out_bias_dims_mapping[-1] + ] + for mapping in replicated_dims: + if is_dim_shard(mapping): + return False + if qkv_bias_dims_mapping[head_axis] != qkv_w_dims_mapping[head_axis]: + return False + if qkv_bias_dims_mapping[head_axis] != out_w_dims_mapping[0]: + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + # none of output should be sharded + for out_name in op_desc.output_names(): + out = op_desc.output(out_name)[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out) + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Y') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Y') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + head_axis = 1 + qkv_w = src_op.input('QKVW')[0] + qkv_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(qkv_w)[ + head_axis] + assert qkv_w_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + qkv_w_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = qkv_w_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == 
"fused_attention" + new_op._set_attr("ring_id", int(group.id)) + + @staticmethod + def backward(ctx, *args, **kwargs): + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + out_w = src_op.input('OutLinearW')[0] + out_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(out_w)[-1] + assert out_w_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + out_w_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = out_w_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_attention_grad" + new_op._set_attr("ring_id", int(group.id)) + + +register_distributed_operator_impl( + "fused_attention", DistributedFusedAttentionImpl("tensor_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py new file mode 100644 index 0000000000000..76f526adbbfaa --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py @@ -0,0 +1,203 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard, is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 +from ..utils import _get_comm_group, _get_corresponding_rank +from ..process_group import new_process_group + + +class DistributedFusedFeedForward(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFusedFeedForward, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFusedFeedForward("fused_feedforward")) + + +class DistributedFusedFeedForwardImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFusedFeedForwardImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + linear1_weight = op_desc.input('Linear1Weight')[0] + linear1_bias = op_desc.input('Linear1Bias')[0] + linear2_weight = op_desc.input('Linear2Weight')[0] + linear2_bias = op_desc.input('Linear2Bias')[0] + + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + linear1_weight_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear1_weight) + linear1_bias_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear1_bias) + linear2_weight_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear2_weight) + linear2_bias_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear2_bias) + + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + if is_dim_shard(linear1_weight_dims_mapping[-2]) or is_dim_replicate( + linear1_weight_dims_mapping[-1]): + return False + if is_dim_replicate(linear1_bias_dims_mapping[-1]): + return False + if is_dim_replicate(linear2_weight_dims_mapping[-2]) or is_dim_shard( + linear2_weight_dims_mapping[-1]): + return False + if is_dim_shard(linear2_bias_dims_mapping[-1]): + return False + if linear1_weight_dims_mapping[-1] != linear2_weight_dims_mapping[-2]: + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + # none of output should be sharded + for out_name in op_desc.output_names(): + out = op_desc.output(out_name)[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out) + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + 
out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + linear1_weight = src_op.input('Linear1Weight')[0] + linear1_weight_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + linear1_weight)[-1] + assert linear1_weight_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + linear1_weight_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = linear1_weight_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_feedforward" + new_op._set_attr("ring_id", int(group.id)) + + @staticmethod + def backward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + linear2_weight = src_op.input('Linear2Weight')[0] + linear2_weight_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + linear2_weight)[-1] + assert linear2_weight_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + linear2_weight_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = linear2_weight_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_feedforward_grad" + new_op._set_attr("ring_id", int(group.id)) + + +register_distributed_operator_impl( + "fused_feedforward", DistributedFusedFeedForwardImpl("tensor_parallel")) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 5fdd88ac1de8a..fe94c25e12d2d 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -487,6 +487,7 @@ def __init__(self): self.set_attr("incr_ratio", 2.0) 
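
The incr_ratio / decr_ratio / use_dynamic_loss_scaling attributes in this hunk configure the usual dynamic loss-scaling rule for AMP training. A simplified sketch of that rule follows; the function, its defaults, and the single good-step counter are illustrative (the real pass also tracks consecutive NaN/Inf steps via decr_every_n_nan_or_inf):

def update_loss_scaling(scale, found_inf, good_steps,
                        incr_every_n_steps=1000, incr_ratio=2.0, decr_ratio=0.8):
    # overflow in this step: shrink the scale and reset the clean-step streak
    if found_inf:
        return scale * decr_ratio, 0
    good_steps += 1
    # a long streak of clean steps: grow the scale and start a new streak
    if good_steps >= incr_every_n_steps:
        return scale * incr_ratio, 0
    return scale, good_steps
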
self.set_attr("decr_ratio", 0.8) self.set_attr("use_dynamic_loss_scaling", False) + self.set_attr("input_data", []) self.set_attr("params_grads", []) self._loss_scaling = None self._num_good_steps = None diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 725b4459d7d21..69c3eef7e3771 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -95,12 +95,21 @@ def _keep_fp32_output(op, out_name): class FP16State(object): - def __init__(self, program, amp_list, dist_context, use_fp16_guard): + def __init__(self, + program, + amp_list, + dist_context, + use_fp16_guard, + input_data_var_names=None): self.program = program self.amp_list = amp_list self.use_fp16_guard = use_fp16_guard self.dist_context = dist_context self.grad_op_to_op_map = self.dist_context.dist_op_context.grad_op_id_to_op_id + if input_data_var_names: + self.input_data_var_names = input_data_var_names + else: + self.input_data_var_names = [] self._op_fp16_dict = { } # op_id --> True/False. 'True' means that the op is should run in fp16 mode. # a trick to determine leaf tensor node in program {varname: generator_op_id} @@ -191,7 +200,7 @@ def resolute_tensor_dtype(self, block): if _keep_fp32_input(op, in_name): continue for in_var_name in op.input(in_name): - if in_var_name not in self.forward_non_leaf_tensors: + if in_var_name not in self.forward_non_leaf_tensors and in_var_name not in self.input_data_var_names: self.set_var_to_fp16(in_var_name, block) for out_name in op.output_names: if _keep_fp32_output(op, out_name): @@ -498,10 +507,14 @@ def _apply_single_impl(self, main_program, startup_program, context): set(self.get_attr("custom_white_list")), set(self.get_attr("custom_black_list")), None) - # TODO support multiple blocks + # NOTE don't not change input data dtype, since it is controled by dataloader + # and which is out of control of FP16 Pass + input_data_var_names = [var.name for var in self.get_attr("input_data")] + with paddle.static.program_guard(main_program, startup_program): fp16_state = FP16State(main_program, amp_list, self.dist_context, - self.get_attr("use_fp16_guard")) + self.get_attr("use_fp16_guard"), + input_data_var_names) is_train = fp16_state._build_state() if is_train: From 0759e99d8a4ba233850dbffe87954a2b6a628776 Mon Sep 17 00:00:00 2001 From: helen88 Date: Mon, 18 Apr 2022 11:39:36 +0800 Subject: [PATCH 208/211] support tril_triu_grad for KL2, *test=kunlun (#41877) --- paddle/fluid/operators/tril_triu_op_xpu.cc | 32 +++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 3 ++ .../unittests/xpu/test_tril_triu_op_xpu.py | 18 ++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index a44ea8ff689b8..70200fe733a5a 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -43,6 +43,34 @@ class TrilTriuXPUKernel : public framework::OpKernel { } }; +template +class TrilTriuGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* d_out = + context.Input(framework::GradVarName("Out")); + const auto* dout_data = d_out->data(); + auto* d_x = context.Output(framework::GradVarName("X")); + auto* dx_data = d_x->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool 
lower = context.Attr("lower"); + + auto dy_shape = phi::vectorize(d_out->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), dout_data, dx_data, dy_shape, + diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), dout_data, dx_data, dy_shape, + diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + } // namespace operators } // namespace paddle @@ -50,4 +78,8 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( tril_triu, ops::TrilTriuXPUKernel, ops::TrilTriuXPUKernel); +REGISTER_OP_XPU_KERNEL( + tril_triu_grad, + ops::TrilTriuGradXPUKernel, + ops::TrilTriuGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 6f4826bd8c39a..7b88f261d5a4f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -380,6 +380,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"tril_triu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"tril_triu_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py index fb6b28d9c2825..ee689efbb38a0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py @@ -42,6 +42,7 @@ def setUp(self): self.real_np_op = getattr(np, self.real_op_type) self.set_xpu() self.op_type = "tril_triu" + self.place = paddle.XPUPlace(0) if self.dtype == np.int32: self.X = np.arange( 1, self.get_Xshape_prod() + 1, @@ -69,13 +70,22 @@ def get_Xshape_prod(self): def set_xpu(self): self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True + self.__class__.no_need_check_grad = False self.__class__.op_type = self.real_op_type def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.int32: + user_defined_grad_outputs = np.random.random( + self.Xshape).astype('float32') + self.check_grad_with_place( + self.place, ['X'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def initTestCase(self): self.diagonal = None From 037c8099a63fe0e645faafb8a79f42da7413d068 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 18 Apr 2022 13:56:15 +0800 Subject: [PATCH 209/211] add_license (#41848) --- paddle/fluid/operators/fused_softmax_mask_op.cu | 14 ++++++++++++++ .../fused_softmax_mask_upper_triangle_op.cu | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index 2ba5c027a4d76..c4ab4de8a64cb 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -13,6 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ // this file is inspired by: // https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_masked_softmax.h +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 3bebbee1fb7cc..d4c5b8877056f 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -12,6 +12,20 @@ See the License for the specific language governing permissions and limitations under the License. */ // this file is inspired by: // https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifdef PADDLE_WITH_CUDA #include From f3531c7baae81c1b868ee0ad3df320b1b10378f6 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Mon, 18 Apr 2022 14:10:41 +0800 Subject: [PATCH 210/211] [infrt] add efficientnet model (#41507) --- .../tests/models/efficientnet-b4/model.py | 26 ++ .../models/efficientnet-b4/net/__init__.py | 15 + .../efficientnet-b4/net/efficientnet.py | 284 +++++++++++++ .../tests/models/efficientnet-b4/net/utils.py | 385 ++++++++++++++++++ paddle/scripts/infrt_build.sh | 13 +- 5 files changed, 718 insertions(+), 5 deletions(-) create mode 100644 paddle/infrt/tests/models/efficientnet-b4/model.py create mode 100644 paddle/infrt/tests/models/efficientnet-b4/net/__init__.py create mode 100644 paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py create mode 100644 paddle/infrt/tests/models/efficientnet-b4/net/utils.py diff --git a/paddle/infrt/tests/models/efficientnet-b4/model.py b/paddle/infrt/tests/models/efficientnet-b4/model.py new file mode 100644 index 0000000000000..c660c3a46749e --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/model.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# url: https://aistudio.baidu.com/aistudio/projectdetail/3756986?forkThirdPart=1 +from net import EfficientNet +from paddle.jit import to_static +from paddle.static import InputSpec +import paddle +import sys + +model = EfficientNet.from_name('efficientnet-b4') +net = to_static( + model, input_spec=[InputSpec( + shape=[None, 3, 256, 256], name='x')]) +paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py new file mode 100644 index 0000000000000..d4e557829ae2c --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .efficientnet import EfficientNet diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py new file mode 100644 index 0000000000000..a9956fcdc8862 --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py @@ -0,0 +1,284 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .utils import (round_filters, round_repeats, drop_connect, + get_same_padding_conv2d, get_model_params, + efficientnet_params, load_pretrained_weights) + + +class MBConvBlock(nn.Layer): + """ + Mobile Inverted Residual Bottleneck Block + + Args: + block_args (namedtuple): BlockArgs, see above + global_params (namedtuple): GlobalParam, see above + + Attributes: + has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
+ """ + + def __init__(self, block_args, global_params): + super().__init__() + self._block_args = block_args + self._bn_mom = global_params.batch_norm_momentum + self._bn_eps = global_params.batch_norm_epsilon + self.has_se = (self._block_args.se_ratio is not None) and ( + 0 < self._block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip # skip connection and drop connect + + # Get static or dynamic convolution depending on image size + Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) + + # Expansion phase + inp = self._block_args.input_filters # number of input channels + oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels + if self._block_args.expand_ratio != 1: + self._expand_conv = Conv2d( + in_channels=inp, + out_channels=oup, + kernel_size=1, + bias_attr=False) + self._bn0 = nn.BatchNorm2D( + num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + + # Depthwise convolution phase + k = self._block_args.kernel_size + s = self._block_args.stride + self._depthwise_conv = Conv2d( + in_channels=oup, + out_channels=oup, + groups=oup, # groups makes it depthwise + kernel_size=k, + stride=s, + bias_attr=False) + self._bn1 = nn.BatchNorm2D( + num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + + # Squeeze and Excitation layer, if desired + if self.has_se: + num_squeezed_channels = max(1, + int(self._block_args.input_filters * + self._block_args.se_ratio)) + self._se_reduce = Conv2d( + in_channels=oup, + out_channels=num_squeezed_channels, + kernel_size=1) + self._se_expand = Conv2d( + in_channels=num_squeezed_channels, + out_channels=oup, + kernel_size=1) + + # Output phase + final_oup = self._block_args.output_filters + self._project_conv = Conv2d( + in_channels=oup, + out_channels=final_oup, + kernel_size=1, + bias_attr=False) + self._bn2 = nn.BatchNorm2D( + num_features=final_oup, momentum=self._bn_mom, epsilon=self._bn_eps) + self._swish = nn.Hardswish() + + def forward(self, inputs, drop_connect_rate=None): + """ + :param inputs: input tensor + :param drop_connect_rate: drop connect rate (float, between 0 and 1) + :return: output of block + """ + + # Expansion and Depthwise Convolution + x = inputs + if self._block_args.expand_ratio != 1: + x = self._swish(self._bn0(self._expand_conv(inputs))) + x = self._swish(self._bn1(self._depthwise_conv(x))) + + # Squeeze and Excitation + if self.has_se: + x_squeezed = F.adaptive_avg_pool2d(x, 1) + x_squeezed = self._se_expand( + self._swish(self._se_reduce(x_squeezed))) + x = F.sigmoid(x_squeezed) * x + + x = self._bn2(self._project_conv(x)) + + # Skip connection and drop connect + input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters + if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: + if drop_connect_rate: + x = drop_connect( + x, prob=drop_connect_rate, training=self.training) + x = x + inputs # skip connection + return x + + def set_swish(self, memory_efficient=True): + """Sets swish function as memory efficient (for training) or standard (for export)""" + self._swish = nn.Hardswish() if memory_efficient else nn.Swish() + + +class EfficientNet(nn.Layer): + """ + An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods + + Args: + blocks_args (list): A list of BlockArgs to construct blocks + global_params (namedtuple): A set of GlobalParams shared between blocks + + Example: + model = EfficientNet.from_pretrained('efficientnet-b0') + + """ + + def __init__(self, blocks_args=None, global_params=None): + super().__init__() + assert isinstance(blocks_args, list), 'blocks_args should be a list' + assert len(blocks_args) > 0, 'block args must be greater than 0' + self._global_params = global_params + self._blocks_args = blocks_args + + # Get static or dynamic convolution depending on image size + Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) + + # Batch norm parameters + bn_mom = self._global_params.batch_norm_momentum + bn_eps = self._global_params.batch_norm_epsilon + + # Stem + in_channels = 3 # rgb + out_channels = round_filters( + 32, self._global_params) # number of output channels + self._conv_stem = Conv2d( + in_channels, out_channels, kernel_size=3, stride=2, bias_attr=False) + self._bn0 = nn.BatchNorm2D( + num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + + # Build blocks + self._blocks = nn.LayerList([]) + for block_args in self._blocks_args: + + # Update block input and output filters based on depth multiplier. + block_args = block_args._replace( + input_filters=round_filters(block_args.input_filters, + self._global_params), + output_filters=round_filters(block_args.output_filters, + self._global_params), + num_repeat=round_repeats(block_args.num_repeat, + self._global_params)) + + # The first block needs to take care of stride and filter size increase. + self._blocks.append(MBConvBlock(block_args, self._global_params)) + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._blocks.append( + MBConvBlock(block_args, self._global_params)) + + # Head + in_channels = block_args.output_filters # output of final block + out_channels = round_filters(1280, self._global_params) + self._conv_head = Conv2d( + in_channels, out_channels, kernel_size=1, bias_attr=False) + self._bn1 = nn.BatchNorm2D( + num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + + # Final linear layer + self._avg_pooling = nn.AdaptiveAvgPool2D(1) + self._dropout = nn.Dropout(self._global_params.dropout_rate) + self._fc = nn.Linear(out_channels, self._global_params.num_classes) + self._swish = nn.Hardswish() + + def set_swish(self, memory_efficient=True): + """Sets swish function as memory efficient (for training) or standard (for export)""" + self._swish = nn.Hardswish() if memory_efficient else nn.Swish() + for block in self._blocks: + block.set_swish(memory_efficient) + + def extract_features(self, inputs): + """ Returns output of the final convolution layer """ + + # Stem + x = self._swish(self._bn0(self._conv_stem(inputs))) + + # Blocks + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._bn1(self._conv_head(x))) + + return x + + def forward(self, inputs): + """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
""" + bs = inputs.shape[0] + # Convolution layers + x = self.extract_features(inputs) + + # Pooling and final linear layer + x = self._avg_pooling(x) + x = paddle.reshape(x, (bs, -1)) + x = self._dropout(x) + x = self._fc(x) + return x + + @classmethod + def from_name(cls, model_name, override_params=None): + cls._check_model_name_is_valid(model_name) + blocks_args, global_params = get_model_params(model_name, + override_params) + return cls(blocks_args, global_params) + + @classmethod + def from_pretrained(cls, + model_name, + advprop=False, + num_classes=1000, + in_channels=3): + model = cls.from_name( + model_name, override_params={'num_classes': num_classes}) + load_pretrained_weights( + model, model_name, load_fc=(num_classes == 1000), advprop=advprop) + if in_channels != 3: + Conv2d = get_same_padding_conv2d( + image_size=model._global_params.image_size) + out_channels = round_filters(32, model._global_params) + model._conv_stem = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + bias_attr=False) + return model + + @classmethod + def get_image_size(cls, model_name): + cls._check_model_name_is_valid(model_name) + _, _, res, _ = efficientnet_params(model_name) + return res + + @classmethod + def _check_model_name_is_valid(cls, model_name): + """ Validates model name. """ + valid_models = ['efficientnet-b' + str(i) for i in range(9)] + if model_name not in valid_models: + raise ValueError('model_name should be one of: ' + ', '.join( + valid_models)) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py new file mode 100644 index 0000000000000..3bf8b4eb73022 --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py @@ -0,0 +1,385 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import math +from functools import partial +import collections + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +# Parameters for the entire model (stem, all blocks, and head) +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes', + 'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth', + 'drop_connect_rate', 'image_size' +]) + +# Parameters for an individual model block +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + +# Change namedtuple defaults +GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) +BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) + + +def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. 
""" + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: # prevent rounding by more than 10% + new_filters += divisor + return int(new_filters) + + +def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +def drop_connect(inputs, prob, training): + """Drop input connection""" + if not training: + return inputs + keep_prob = 1.0 - prob + inputs_shape = paddle.shape(inputs) + random_tensor = keep_prob + paddle.rand(shape=[inputs_shape[0], 1, 1, 1]) + binary_tensor = paddle.floor(random_tensor) + output = inputs / keep_prob * binary_tensor + return output + + +def get_same_padding_conv2d(image_size=None): + """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. + Static padding is necessary for ONNX exporting of models. """ + if image_size is None: + return Conv2dDynamicSamePadding + else: + return partial(Conv2dStaticSamePadding, image_size=image_size) + + +class Conv2dDynamicSamePadding(nn.Conv2D): + """ 2D Convolutions like TensorFlow, for a dynamic image size """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + bias_attr=None): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + 0, + dilation, + groups, + bias_attr=bias_attr) + self.stride = self._stride if len( + self._stride) == 2 else [self._stride[0]] * 2 + + def forward(self, x): + ih, iw = x.shape[-2:] + kh, kw = self.weight.shape[-2:] + sh, sw = self.stride + oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) + pad_h = max((oh - 1) * self.stride[0] + + (kh - 1) * self._dilation[0] + 1 - ih, 0) + pad_w = max((ow - 1) * self.stride[1] + + (kw - 1) * self._dilation[1] + 1 - iw, 0) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self._padding, + self._dilation, self._groups) + + +class Conv2dStaticSamePadding(nn.Conv2D): + """ 2D Convolutions like TensorFlow, for a fixed image size""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + image_size=None, + **kwargs): + if 'stride' in kwargs and isinstance(kwargs['stride'], list): + kwargs['stride'] = kwargs['stride'][0] + super().__init__(in_channels, out_channels, kernel_size, **kwargs) + self.stride = self._stride if len( + self._stride) == 2 else [self._stride[0]] * 2 + + # Calculate padding based on image size and save it + assert image_size is not None + ih, iw = image_size if type( + image_size) == list else [image_size, image_size] + kh, kw = self.weight.shape[-2:] + sh, sw = self.stride + oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) + pad_h = max((oh - 1) * self.stride[0] + + (kh - 1) * self._dilation[0] + 1 - ih, 0) + pad_w = max((ow - 1) * self.stride[1] + + (kw - 1) * self._dilation[1] + 1 - iw, 0) + if pad_h > 0 or pad_w > 0: + self.static_padding = nn.Pad2D([ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + else: + self.static_padding = Identity() + + def forward(self, x): + x = self.static_padding(x) + x = F.conv2d(x, 
self.weight, self.bias, self.stride, self._padding, + self._dilation, self._groups) + return x + + +class Identity(nn.Layer): + def __init__(self, ): + super().__init__() + + def forward(self, x): + return x + + +def efficientnet_params(model_name): + """ Map EfficientNet model name to parameter coefficients. """ + params_dict = { + # Coefficients: width,depth,resolution,dropout + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + 'efficientnet-b8': (2.2, 3.6, 672, 0.5), + 'efficientnet-l2': (4.3, 5.3, 800, 0.5), + } + return params_dict[model_name] + + +class BlockDecoder(object): + """ Block Decoder for readability, straight from the official TensorFlow repository """ + + @staticmethod + def _decode_block_string(block_string): + """ Gets a block through a string notation of arguments. """ + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # Check stride + assert (('s' in options and len(options['s']) == 1) or + (len(options['s']) == 2 and options['s'][0] == options['s'][1])) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def _encode_block_string(block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % + (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, 'o%d' % block.output_filters + ] + if 0 < block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + if block.id_skip is False: + args.append('noskip') + return '_'.join(args) + + @staticmethod + def decode(string_list): + """ + Decodes a list of string notations to specify blocks inside the network. + + :param string_list: a list of strings, each string is a notation of block + :return: a list of BlockArgs namedtuples of block args + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + @staticmethod + def encode(blocks_args): + """ + Encodes a list of BlockArgs to a list of strings. + + :param blocks_args: a list of BlockArgs namedtuples of block args + :return: a list of strings, each string is a notation of block + """ + block_strings = [] + for block in blocks_args: + block_strings.append(BlockDecoder._encode_block_string(block)) + return block_strings + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2, + image_size=None, + num_classes=1000): + """ Get block arguments according to parameter and coefficients. 
""" + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=num_classes, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None, + image_size=image_size, ) + + return blocks_args, global_params + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('efficientnet'): + w, d, s, p = efficientnet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +url_map = { + 'efficientnet-b0': + '/home/aistudio/data/weights/efficientnet-b0-355c32eb.pdparams', + 'efficientnet-b1': + '/home/aistudio/data/weights/efficientnet-b1-f1951068.pdparams', + 'efficientnet-b2': + '/home/aistudio/data/weights/efficientnet-b2-8bb594d6.pdparams', + 'efficientnet-b3': + '/home/aistudio/data/weights/efficientnet-b3-5fb5a3c3.pdparams', + 'efficientnet-b4': + '/home/aistudio/data/weights/efficientnet-b4-6ed6700e.pdparams', + 'efficientnet-b5': + '/home/aistudio/data/weights/efficientnet-b5-b6417697.pdparams', + 'efficientnet-b6': + '/home/aistudio/data/weights/efficientnet-b6-c76e70fd.pdparams', + 'efficientnet-b7': + '/home/aistudio/data/weights/efficientnet-b7-dcc49843.pdparams', +} + +url_map_advprop = { + 'efficientnet-b0': + '/home/aistudio/data/weights/adv-efficientnet-b0-b64d5a18.pdparams', + 'efficientnet-b1': + '/home/aistudio/data/weights/adv-efficientnet-b1-0f3ce85a.pdparams', + 'efficientnet-b2': + '/home/aistudio/data/weights/adv-efficientnet-b2-6e9d97e5.pdparams', + 'efficientnet-b3': + '/home/aistudio/data/weights/adv-efficientnet-b3-cdd7c0f4.pdparams', + 'efficientnet-b4': + '/home/aistudio/data/weights/adv-efficientnet-b4-44fb3a87.pdparams', + 'efficientnet-b5': + '/home/aistudio/data/weights/adv-efficientnet-b5-86493f6b.pdparams', + 'efficientnet-b6': + '/home/aistudio/data/weights/adv-efficientnet-b6-ac80338e.pdparams', + 'efficientnet-b7': + '/home/aistudio/data/weights/adv-efficientnet-b7-4652b6dd.pdparams', + 'efficientnet-b8': + '/home/aistudio/data/weights/adv-efficientnet-b8-22a8fe65.pdparams', +} + + +def load_pretrained_weights(model, + model_name, + weights_path=None, + load_fc=True, + advprop=False): + """Loads pretrained weights from weights path or download using url. + Args: + model (Module): The whole model of efficientnet. + model_name (str): Model name of efficientnet. + weights_path (None or str): + str: path to pretrained weights file on the local disk. + None: use pretrained weights downloaded from the Internet. + load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model. + advprop (bool): Whether to load pretrained weights + trained with advprop (valid when weights_path is None). 
+ """ + + # AutoAugment or Advprop (different preprocessing) + url_map_ = url_map_advprop if advprop else url_map + state_dict = paddle.load(url_map_[model_name]) + + if load_fc: + model.set_state_dict(state_dict) + else: + state_dict.pop('_fc.weight') + state_dict.pop('_fc.bias') + model.set_state_dict(state_dict) + + print('Loaded pretrained weights for {}'.format(model_name)) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 6634f5396ac74..2756e3b321150 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,11 +44,6 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py - # generate test model - cd ${PADDLE_ROOT} - mkdir -p ${PADDLE_ROOT}/build/models - python3 paddle/infrt/tests/models/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs - python3 paddle/infrt/tests/models/resnet50_model.py ${PADDLE_ROOT}/build/models/resnet50/model } function init() { @@ -114,6 +109,14 @@ function create_fake_models() { # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl + + # generate test model + cd ${PADDLE_ROOT} + mkdir -p ${PADDLE_ROOT}/build/models + python3 paddle/infrt/tests/models/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs + python3 paddle/infrt/tests/models/resnet50_model.py ${PADDLE_ROOT}/build/models/resnet50/model + python3 paddle/infrt/tests/models/efficientnet-b4/model.py ${PADDLE_ROOT}/build/models/efficientnet-b4/model + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py python3 ${PADDLE_ROOT}/paddle/infrt/tests/models/linear.py From 34f30f795bf1fd82dca8cd3c8aa89e9fc163daf2 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Mon, 18 Apr 2022 14:14:38 +0800 Subject: [PATCH 211/211] fix bug for eager mode distributed training (#41841) --- .../distributed/collective/ProcessGroup.cc | 5 +- .../distributed/collective/ProcessGroup.h | 4 +- .../collective/ProcessGroupGloo.cc | 5 +- .../distributed/collective/ProcessGroupGloo.h | 3 +- .../collective/ProcessGroupHCCL.cc | 8 +++- .../distributed/collective/ProcessGroupHCCL.h | 2 +- .../collective/ProcessGroupHeter.cc | 20 ++++---- .../collective/ProcessGroupHeter.h | 6 +-- .../collective/ProcessGroupNCCL.cc | 8 +++- .../distributed/collective/ProcessGroupNCCL.h | 2 +- paddle/fluid/pybind/distributed_py.cc | 48 +++++++++---------- python/paddle/distributed/collective.py | 16 +++++-- .../tests/unittests/process_group_gloo.py | 3 +- 13 files changed, 74 insertions(+), 56 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index 6fec3a41e1047..e6d9975f75db6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -35,8 +35,9 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { void ProcessGroup::Task::Synchronize() {} -ProcessGroup::ProcessGroup(int rank, int size, int gid) - : rank_(rank), size_(size), gid_(gid) { +ProcessGroup::ProcessGroup(int rank, int size, const platform::Place& place, + int gid) + : rank_(rank), size_(size), place_(place), gid_(gid) { if (gid != IGNORE_ID) { auto map = ProcessGroupMapFromGid::getInstance(); map->insert(gid_, this); diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index fbc9c1f476202..fca395c5f2bf7 
100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -69,7 +69,8 @@ class ProcessGroup { bool is_completed_ = false; }; - explicit ProcessGroup(int rank, int size, int gid); + explicit ProcessGroup(int rank, int size, const platform::Place& place, + int gid); virtual ~ProcessGroup() {} int GetRank() const { return rank_; } @@ -145,6 +146,7 @@ class ProcessGroup { protected: const int rank_; const int size_; + const platform::Place place_; const int gid_; }; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 6ddea74d95db6..824341c3cd97d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -165,8 +165,9 @@ ProcessGroupGloo::GlooTask::GlooTask( ProcessGroupGloo::ProcessGroupGloo( const std::shared_ptr& store, int rank, int world_size, - int gid, const std::shared_ptr options) - : ProcessGroup(rank, world_size, gid), + const platform::Place& place, int gid, + const std::shared_ptr options) + : ProcessGroup(rank, world_size, place, gid), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 335ca1bd17f2c..1eb8b47a09223 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -102,7 +102,8 @@ class ProcessGroupGloo : public ProcessGroup { explicit ProcessGroupGloo( const std::shared_ptr& store, int rank, - int world_size, int gid, std::shared_ptr options); + int world_size, const platform::Place& place, int gid, + std::shared_ptr options); ~ProcessGroupGloo() = default; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 55ecdaaf6bfb7..9ed6c2198df4c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -97,8 +98,11 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, - int rank, int size, int gid) - : ProcessGroup(rank, size, gid), store_(store) {} + int rank, int size, + const platform::Place& place, int gid) + : ProcessGroup(rank, size, place, gid), store_(store) { + platform::SetNPUDeviceId(place_.device); +} void ProcessGroupHCCL::BroadcastUniqueHCCLID( std::vector& hccl_ids) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f3d3fa2f8a72a..2f0ff6b9565ea 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -71,7 +71,7 @@ class ProcessGroupHCCL : public ProcessGroup { }; ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size, - int gid); + const platform::Place& place, int gid); 
const std::string GetBackendName() const override { return std::string(HCCL_BACKEND_NAME); diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 354a8e23ae41f..ef57bb5ba232c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -44,13 +44,11 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, - int rank, int size, int gid, - int local_rank, int local_size, - int gloo_rank, int gloo_size, - bool with_switch, - std::string switch_endpoint) - : ProcessGroup(rank, size, gid), +ProcessGroupHeter::ProcessGroupHeter( + const std::shared_ptr& store, int rank, int size, + const platform::Place& place, int gid, int local_rank, int local_size, + int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint) + : ProcessGroup(rank, size, place, gid), store_(store), local_rank_(local_rank), local_size_(local_size), @@ -60,10 +58,10 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, switch_endpoint_(switch_endpoint) { #if defined(PADDLE_WITH_NCCL) inner_pg_ = std::make_shared(store, local_rank, local_size, - IGNORE_ID); + place_, IGNORE_ID); #elif defined(PADDLE_WITH_ASCEND_CL) inner_pg_ = std::make_shared(store, local_rank, local_size, - IGNORE_ID); + place_, IGNORE_ID); #else PADDLE_THROW(platform::errors::Fatal( "ProcessGroupHeter only supports NCCL and HCCL now."); @@ -71,8 +69,8 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, if (local_rank_ == 0 && !with_switch_) { auto opts = ProcessGroupGloo::GlooOptions::create(); opts->device = ProcessGroupGloo::createDefaultDevice(); - inter_pg_ = std::make_shared(store, gloo_rank_, - gloo_size_, IGNORE_ID, opts); + inter_pg_ = std::make_shared( + store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts); } } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 05bacd93d7815..640acdfb6a23b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -81,9 +81,9 @@ class ProcessGroupHeter : public ProcessGroup { }; ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, - int gid, int local_rank, int local_size, int gloo_rank, - int gloo_size, bool with_switch, - std::string switch_endpoints); + const platform::Place& place, int gid, int local_rank, + int local_size, int gloo_rank, int gloo_size, + bool with_switch, std::string switch_endpoints); const std::string GetBackendName() const override { return std::string(HETER_BACKEND_NAME); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 30813b904df53..12de7d116e2b5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/Common.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -103,8 +104,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { void 
ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, - int rank, int size, int gid) - : ProcessGroup(rank, size, gid), store_(store) {} + int rank, int size, + const platform::Place& place, int gid) + : ProcessGroup(rank, size, place, gid), store_(store) { + platform::SetDeviceId(place_.device); +} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index cca84285ef4de..4b6c3f4031354 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -77,7 +77,7 @@ class ProcessGroupNCCL : public ProcessGroup { }; ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, - int gid); + const platform::Place& place, int gid); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 716cd35f0a614..ab8bf0529dcfc 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -241,49 +241,42 @@ void BindDistributed(py::module *m) { std::shared_ptr>( *m, "ProcessGroupNCCL", ProcessGroup) .def(py::init &, int, int, - int>(), + const platform::CUDAPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()); + py::arg("place"), py::arg("group_id") = 0, + py::call_guard()); +#endif #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) py::class_>( *m, "ProcessGroupHeter", ProcessGroup) - .def(py::init &, int, int, int, - int, int, int, int, bool, std::string>(), + .def(py::init &, int, int, +#if defined(PADDLE_WITH_ASCEND_CL) + const platform::NPUPlace &, +#else + const platform::CUDAPlace &, +#endif + int, int, int, int, int, bool, std::string>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("gid") = 0, py::arg("local_rank") = 0, + py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0, py::arg("local_size") = 1, py::arg("gloo_rank") = 0, py::arg("gloo_size") = 1, py::arg("with_switch") = false, py::arg("switch_endpoint") = "", py::call_guard()); #endif -#endif #if defined(PADDLE_WITH_ASCEND_CL) py::class_>( *m, "ProcessGroupHCCL", ProcessGroup) .def(py::init &, int, int, - int>(), - py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()); - -#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) - py::class_>( - *m, "ProcessGroupHeter", ProcessGroup) - .def(py::init &, int, int, int, - int, int, int, int, bool, std::string>(), + const platform::NPUPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("gid") = 0, py::arg("local_rank") = 0, - py::arg("local_size") = 1, py::arg("gloo_rank") = 0, - py::arg("gloo_rank") = 1, py::arg("with_switch") = false, - py::arg("switch_endpoint") = "", + py::arg("place"), py::arg("group_id") = 0, py::call_guard()); -#endif + #endif py::class_>( *m, "ProcessGroupGloo", ProcessGroup) .def(py::init &, int, - int, int, std::shared_ptr &>(), + int, const platform::CPUPlace &, int, + std::shared_ptr &>(), py::call_guard()) .def(py::init([](const std::shared_ptr &store, - int rank, int world_size, 
int gid) { + int rank, int world_size, + const platform::CPUPlace &place, int gid) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { @@ -312,10 +307,11 @@ void BindDistributed(py::module *m) { opts->device = ProcessGroupGloo::createDefaultDevice(); } return std::make_shared(store, rank, world_size, - gid, opts); + place, gid, opts); }), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()) + py::arg("place"), py::arg("group_id") = 0, + py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); #endif diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 993b45b4eecf9..35ab1193c2b05 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -228,14 +228,23 @@ def _new_process_group_impl(backend, pg_options, group_id=0): pg = None + genv = _get_global_env() assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": - pg = core.ProcessGroupGloo(store, rank, world_size, group_id) + place = core.CPUPlace() + pg = core.ProcessGroupGloo(store, rank, world_size, place, group_id) elif backend == "nccl": - pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) + place = core.CUDAPlace(genv.device_id) + pg = core.ProcessGroupNCCL(store, rank, world_size, place, group_id) elif backend == "hccl": - pg = core.ProcessGroupHCCL(store, rank, world_size, group_id) + place = core.NPUPlace(genv.device_id) + pg = core.ProcessGroupHCCL(store, rank, world_size, place, group_id) elif backend == "heter": + place = None + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(genv.device_id) + elif core.is_compiled_with_npu(): + place = core.NPUPlace(genv.device_id) cluster_id = int(os.getenv("CLUSTER_ID", "-1")) assert cluster_id >= 0, "please set the CLUSTER_ID variable." cluster_size = os.getenv("CLUSTER_SIZE", None) @@ -253,6 +262,7 @@ def _new_process_group_impl(backend, store, rank=global_rank, world_size=global_world_size, + place=place, gid=0, local_rank=rank, local_size=world_size, diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index 03886ab8a147f..9be8a35f1ae1b 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,7 +47,8 @@ def test_create_process_group_gloo(self): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6272, is_master, nranks, datetime.timedelta(0)) - pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) + place = paddle.fluid.core.CPUPlace() + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks, place) # test allreduce sum # rank 0