Inference add batch stream #44524

Merged: 1 commit, Jul 22, 2022
29 changes: 27 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -43,6 +43,7 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/inference/utils/model_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
@@ -56,6 +57,7 @@
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/string/split.h"

#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -1618,8 +1620,31 @@ bool AnalysisPredictor::ZeroCopyRun() {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
  LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. "
                             "Please use config.SetExecStream instead.";
  if (!private_context_) {
    PADDLE_THROW(platform::errors::Fatal(
        "Please use config.SetExecStream to init gpu resources, and then we "
        "will bind gpu resources to execution stream."));
  }

  if (stream != predictor_stream_) {
#ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#else
    cudaStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#endif
    ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
                                                        stream);
    predictor_stream_ = stream;

    auto *dev_ctxs = reinterpret_cast<const std::map<
        phi::Place,
        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
        this->GetDeviceContexts());
    auto *dev_ctx =
        static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
    dev_ctx->SetStream(stream);
  }

  return ZeroCopyRun();
}
#endif
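Caller-side usage (not part of this diff): GPU resources must first be created via `config.SetExecStream`, after which the predictor can be switched to a different stream at run time. Below is a minimal sketch, assuming the public `paddle_infer` C++ API and the CUDA runtime; the experimental `InternalUtils::RunWithExternalStream` wrapper used here is an assumption and does not appear in this diff, it is expected to forward to `AnalysisPredictor::ExpRunWithExternalStream` shown above.

```cpp
// Sketch only: assumes the public paddle_infer C++ API and the CUDA runtime.
// InternalUtils::RunWithExternalStream is assumed to wrap
// AnalysisPredictor::ExpRunWithExternalStream from this PR.
#include <cuda_runtime.h>
#include <string>

#include "paddle_inference_api.h"

void RunOnTwoStreams(const std::string& model_dir) {
  cudaStream_t init_stream = nullptr;
  cudaStream_t batch_stream = nullptr;
  cudaStreamCreate(&init_stream);
  cudaStreamCreate(&batch_stream);

  paddle_infer::Config config;
  config.SetModel(model_dir);
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/256, /*device_id=*/0);
  // Creates the predictor's private GPU resources bound to init_stream.
  config.SetExecStream(init_stream);

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... copy inputs via predictor->GetInputHandle(...)->CopyFromCpu(...) ...

  // Run on a different stream: the predictor synchronizes the old stream,
  // rebinds its cuDNN/cuBLAS/cuSOLVER/cuSPARSE/Eigen handles, then calls
  // ZeroCopyRun() on batch_stream.
  paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), batch_stream);

  cudaStreamSynchronize(batch_stream);
  cudaStreamDestroy(init_stream);
  cudaStreamDestroy(batch_stream);
}
```

The first run on a new stream pays for one synchronization of the old stream plus the handle rebinds; subsequent runs on the same stream go straight to `ZeroCopyRun()`.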
112 changes: 112 additions & 0 deletions paddle/fluid/inference/api/resource_manager.cc
@@ -17,17 +17,29 @@
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_resources.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/generator.h"
#include "unsupported/Eigen/CXX11/Tensor"

#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cublas.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#endif // PADDLE_WITH_CUDA

namespace paddle {
namespace internal {

@@ -237,6 +249,8 @@ void GPUContextResource::DestroySparseHandle() {
  phi::DestroySparseHandle(sparse_handle_);
}

phi::Place GPUContextResource::Place() const { return place_; }

gpuStream_t GPUContextResource::GetStream() const { return stream_; }

dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }
@@ -291,6 +305,75 @@ std::array<int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
  return max_grid_dim_size_;
}

void GPUContextResource::ReBindStream(gpuStream_t stream) {
  owned_stream_ = false;
  stream_ = stream;
}

void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
  PADDLE_RETRY_CUDA_SUCCESS(
      phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
  PADDLE_RETRY_CUDA_SUCCESS(
      phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::rocblas_set_stream(blas_tf32_tensor_core_handle_, stream));
#else
  PADDLE_RETRY_CUDA_SUCCESS(
      phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
}

void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
#ifndef PADDLE_WITH_HIP
  PADDLE_RETRY_CUDA_SUCCESS(
      phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
}

void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
#if defined(PADDLE_WITH_CUDA)
  // The generic APIs are supported from CUDA 10.1.
#if CUDA_VERSION >= 11000
  PADDLE_RETRY_CUDA_SUCCESS(
      phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
}

void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
                                           GPUPlace place) const {
  auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place_)
                        .get();
  eigen_stream_->Reinitialize(stream, allocator, place);
}

#endif

void ResourceManager::InitCPUResource() {
@@ -359,6 +442,35 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
  return gpu_resources_.at(stream).get();
}

void ResourceManager::GpuResourceReBindStream(void* old_stream,
                                              void* new_stream) {
  PADDLE_ENFORCE_EQ(
      gpu_resources_.count(old_stream),
      true,
      platform::errors::InvalidArgument(
          "The stream[%p] not found in gpu_resources.", old_stream));
  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
  DestroyGPUResource(old_stream);
  PADDLE_ENFORCE_EQ(
      ref_count_.count(old_stream),
      0,
      platform::errors::Fatal("gpu resources rebind stream failed."));

  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTensorCoreHandle(
      static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
                                  gpu_resource->Place());

  ref_count_[new_stream]++;
  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
}

int ResourceManager::RefCount(void* stream) const {
  if (ref_count_.count(stream) == 0) return 0;
  return ref_count_.at(stream);
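The core of `GpuResourceReBindStream` above is a keyed ownership transfer: the `unique_ptr` holding the `GPUContextResource` is moved out from under the old stream key, the old entry and its ref count are dropped, the handles are rebound, and the same object is re-registered under the new stream. Here is a simplified, self-contained sketch of that bookkeeping with hypothetical `Resource`/`Manager` types (no GPU calls; the real ref-count handling inside `DestroyGPUResource` is collapsed into a plain erase):

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <utility>

// Hypothetical stand-in for GPUContextResource; only the bookkeeping matters.
struct Resource {
  void* bound_stream = nullptr;
  void ReBindStream(void* s) { bound_stream = s; }
};

class Manager {
 public:
  void Create(void* stream) {
    resources_.emplace(stream, std::make_unique<Resource>());
    ++ref_count_[stream];
  }

  // Mirrors ResourceManager::GpuResourceReBindStream: move the resource out,
  // drop the old key, rebind, then re-register under the new key.
  void ReBind(void* old_stream, void* new_stream) {
    auto resource = std::move(resources_.at(old_stream));
    resources_.erase(old_stream);
    ref_count_.erase(old_stream);

    resource->ReBindStream(new_stream);
    ++ref_count_[new_stream];
    resources_.emplace(new_stream, std::move(resource));
  }

  int RefCount(void* stream) const {
    auto it = ref_count_.find(stream);
    return it == ref_count_.end() ? 0 : it->second;
  }

 private:
  std::map<void*, std::unique_ptr<Resource>> resources_;
  std::map<void*, int> ref_count_;
};

int main() {
  int s1 = 0, s2 = 0;  // stand-ins for two gpuStream_t handles
  Manager m;
  m.Create(&s1);
  m.ReBind(&s1, &s2);
  assert(m.RefCount(&s1) == 0 && m.RefCount(&s2) == 1);
  return 0;
}
```

Keeping the same resource object alive across the key change is what avoids re-creating the cuDNN/cuBLAS/cuSOLVER/cuSPARSE handles, which is the expensive part of switching streams.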
13 changes: 13 additions & 0 deletions paddle/fluid/inference/api/resource_manager.h
@@ -22,6 +22,7 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/cpu/forwards.h"
#include "paddle/phi/common/place.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
@@ -52,6 +53,7 @@ class GPUContextResource {
 public:
  explicit GPUContextResource(const phi::Place& place, void* stream);
  ~GPUContextResource();
  phi::Place Place() const;

  gpuStream_t GetStream() const;
  dnnHandle_t GetDnnHandle() const;
@@ -70,6 +72,16 @@
  int GetGpuMaxThreadsPerBlock() const;
  std::array<int, 3> GetGpuMaxGridDimSize() const;

  // If the stream changes, we need to rebind all handles to the new stream.
  void ReBindStream(gpuStream_t stream);
  void ReBindDnnHandle(gpuStream_t stream) const;
  void ReBindBlasHandle(gpuStream_t stream) const;
  void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
  void ReBindBlasTF32Handle(gpuStream_t stream) const;
  void ReBindSolverDnHandle(gpuStream_t stream) const;
  void ReBindSparseHandle(gpuStream_t stream) const;
  void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;

 private:
  void InitGPUResource(void* stream);
  void DestroyGPUResource();
@@ -138,6 +150,7 @@ class ResourceManager {
  void DestroyGPUResource(void* stream);
  GPUContextResource* GetGPUResource(void* stream) const;
  int RefCount(void* stream) const;
  void GpuResourceReBindStream(void* old_stream, void* new_stream);

 private:
  void Decrease(void* stream);
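For reference, each `ReBind*Handle` method in `resource_manager.cc` is a thin wrapper over the vendor call that attaches an already-created handle to a different stream. A minimal CUDA-only sketch of that sequence with raw cuBLAS/cuDNN calls follows (assumes the CUDA toolkit and cuDNN are available; error handling is reduced to early returns):

```cpp
// Sketch only: mirrors the synchronize-then-rebind sequence from
// ExpRunWithExternalStream / GpuResourceReBindStream using raw CUDA calls.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cudnn.h>

int main() {
  cudaStream_t old_stream = nullptr;
  cudaStream_t new_stream = nullptr;
  cudaStreamCreate(&old_stream);
  cudaStreamCreate(&new_stream);

  cublasHandle_t blas = nullptr;
  cudnnHandle_t dnn = nullptr;
  if (cublasCreate(&blas) != CUBLAS_STATUS_SUCCESS) return 1;
  if (cudnnCreate(&dnn) != CUDNN_STATUS_SUCCESS) return 1;
  cublasSetStream(blas, old_stream);
  cudnnSetStream(dnn, old_stream);

  // Switching streams: drain outstanding work on the old stream, then point
  // the existing handles at the new one instead of re-creating them.
  cudaStreamSynchronize(old_stream);
  cublasSetStream(blas, new_stream);
  cudnnSetStream(dnn, new_stream);

  // ... kernels and library calls issued now serialize on new_stream ...

  cudnnDestroy(dnn);
  cublasDestroy(blas);
  cudaStreamDestroy(old_stream);
  cudaStreamDestroy(new_stream);
  return 0;
}
```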