Skip to content

Commit

Permalink
Merge pull request #27 from GPUOpen-LibrariesAndSDKs/next-release-5
Browse files Browse the repository at this point in the history
Next release 5
  • Loading branch information
RichardGe authored Feb 11, 2025
2 parents b9e664d + 92e7d4c commit 0d40036
Show file tree
Hide file tree
Showing 11 changed files with 279 additions and 212 deletions.
138 changes: 121 additions & 17 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ option(NO_ENCRYPT "Don't encrypt kernel source and binaries" OFF)
option(NO_UNITTEST "Don't build unit tests" OFF)
option(HIPRT_PREFER_HIP_5 "Prefer HIP 5" OFF)

option(FORCE_DISABLE_CUDA "By default Cuda support is automatically added if a Cuda install is detected. Turn this flag to ON to force Cuda to be disabled." OFF)


find_program(PYTHON_EXECUTABLE
Expand Down Expand Up @@ -186,6 +187,18 @@ function(get_hip_sdk_version result result_path)


endif()

# build hip command for Linux
else()

# If not defined we try to take it from the PATH
if(NOT HIP_PATH)
set(hipCommand "hipcc")

# otherwise, build the hipcc command with full path.
else()
set(hipCommand "${HIP_PATH}/bin/${hipCommand}")
endif()
endif()


Expand Down Expand Up @@ -296,9 +309,12 @@ add_definitions(-D__USE_HIP__)
add_definitions(-DHIPRT_PUBLIC_REPO)


# Enable CUDA if possible
include(${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/Orochi/enable_cuew.cmake)

if (NOT FORCE_DISABLE_CUDA)
# Enable CUDA if possible
include(${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/Orochi/enable_cuew.cmake)
else()
message(STATUS "CUDA support is forced to disabled.")
endif()


# Base output directory
Expand Down Expand Up @@ -361,20 +377,93 @@ if(HIPRT_PREFER_HIP_5)
endif()



# files generated by compile.py and precompile_bitcode.py
if(WIN32)
set(KERNEL_OS_POSTFIX "win")
else()
set(KERNEL_OS_POSTFIX "linux")
endif()
set(KERNEL_HIPRT_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_amd.hipfb") # example: hiprt02005_6.2_amd.hipfb
set(KERNEL_UNITTEST_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_precompiled_bitcode_${KERNEL_OS_POSTFIX}.hipfb") # example: hiprt02005_6.2_precompiled_bitcode_win.hipfb
set(KERNEL_OROCHI_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/oro_compiled_kernels.hipfb")


# precompile kernels:
if(PRECOMPILE)
message(">> Execute: ${PYTHON_EXECUTABLE} compile.py --nvidia --hipSdkPath \"${HIP_FINAL_PATH}\"")
execute_process(
COMMAND ${PYTHON_EXECUTABLE} compile.py --nvidia --hipSdkPath ${HIP_FINAL_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes

if(FORCE_DISABLE_CUDA)
set(CUDA_OPTION "")
else()
set(CUDA_OPTION "--nvidia")
endif()


set(bvh_source
${CMAKE_SOURCE_DIR}/hiprt/hiprt_vec.h
${CMAKE_SOURCE_DIR}/hiprt/hiprt_math.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Aabb.h
${CMAKE_SOURCE_DIR}/hiprt/impl/AabbList.h
${CMAKE_SOURCE_DIR}/hiprt/impl/BvhCommon.h
${CMAKE_SOURCE_DIR}/hiprt/impl/BvhNode.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Geometry.h
${CMAKE_SOURCE_DIR}/hiprt/impl/QrDecomposition.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Quaternion.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Transform.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Instance.h
${CMAKE_SOURCE_DIR}/hiprt/impl/InstanceList.h
${CMAKE_SOURCE_DIR}/hiprt/impl/MortonCode.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Scene.h
${CMAKE_SOURCE_DIR}/hiprt/impl/TriangleMesh.h
${CMAKE_SOURCE_DIR}/hiprt/impl/Triangle.h
${CMAKE_SOURCE_DIR}/hiprt/impl/BvhBuilderUtil.h
${CMAKE_SOURCE_DIR}/hiprt/impl/SbvhCommon.h
${CMAKE_SOURCE_DIR}/hiprt/impl/ApiNodeList.h
${CMAKE_SOURCE_DIR}/hiprt/impl/BvhConfig.h
${CMAKE_SOURCE_DIR}/hiprt/impl/MemoryArena.h
${CMAKE_SOURCE_DIR}/hiprt/hiprt_types.h
${CMAKE_SOURCE_DIR}/hiprt/hiprt_common.h
)

message(">> add_custom_command: ${PYTHON_EXECUTABLE} compile.py ${CUDA_OPTION} --hipSdkPath \"${HIP_FINAL_PATH}\"")
add_custom_command(
OUTPUT ${KERNEL_HIPRT_COMP} ${KERNEL_OROCHI_COMP}
COMMAND ${PYTHON_EXECUTABLE} compile.py ${CUDA_OPTION} --hipSdkPath ${HIP_FINAL_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes
COMMENT "Precompiling kernels via compile.py"
VERBATIM
DEPENDS ${bvh_source}
)

# create the 'precompile_kernels' project
add_custom_target(precompile_kernels ALL
DEPENDS ${KERNEL_HIPRT_COMP} ${KERNEL_OROCHI_COMP}
)

if(NOT NO_UNITTEST)
message(">> Execute: ${PYTHON_EXECUTABLE} precompile_bitcode.py --nvidia --hipSdkPath \"${HIP_FINAL_PATH}\"")
execute_process(
COMMAND ${PYTHON_EXECUTABLE} precompile_bitcode.py --nvidia --hipSdkPath ${HIP_FINAL_PATH}

set(unittest_kernel_source
${CMAKE_SOURCE_DIR}/test/bitcodes/custom_func_table.cpp
${CMAKE_SOURCE_DIR}/test/bitcodes/unit_test.cpp
)

message(">> add_custom_command: ${PYTHON_EXECUTABLE} precompile_bitcode.py ${CUDA_OPTION} --hipSdkPath \"${HIP_FINAL_PATH}\"")
add_custom_command(
OUTPUT ${KERNEL_UNITTEST_COMP}
COMMAND ${PYTHON_EXECUTABLE} precompile_bitcode.py ${CUDA_OPTION} --hipSdkPath ${HIP_FINAL_PATH}
DEPENDS ${KERNEL_HIPRT_COMP} # Ensure compile.py has already run.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes
COMMENT "Precompiling unit tests kernels via precompile_bitcode.py"
VERBATIM
DEPENDS ${unittest_kernel_source}
)

# create the 'precompile_unittest_kernels' project
add_custom_target(precompile_unittest_kernels ALL
DEPENDS ${KERNEL_UNITTEST_COMP}
)

add_dependencies(${HIPRT_NAME} precompile_unittest_kernels)
endif()


Expand All @@ -386,23 +475,38 @@ endif()
# it's expected the step 'PRECOMPILE' has been executed.
if ( BAKE_COMPILED_KERNEL )

message(">> Generate embedded precompiled")
message(">> precompiled will be embedded.")

set(PYTHON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/scripts/convert_binary_to_array.py")

set(KERNEL_HIPRT_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_amd.hipfb")
# HIPRT binary
set(KERNEL_HIPRT_H "${CMAKE_CURRENT_SOURCE_DIR}/hiprt/impl/bvh_build_array.h")
execute_process(
add_custom_command(
OUTPUT ${KERNEL_HIPRT_H}
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_HIPRT_COMP} ${KERNEL_HIPRT_H}
DEPENDS ${KERNEL_HIPRT_COMP} # Ensure compile.py has already run.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
COMMENT "Converting HIPRT compiled kernel to header"
VERBATIM
)

set(KERNEL_OROCHI_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/oro_compiled_kernels.hipfb")
# Orochi binary
set(KERNEL_OROCHI_H "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/ParallelPrimitives/cache/oro_compiled_kernels.h")
execute_process(
add_custom_command(
OUTPUT ${KERNEL_OROCHI_H}
COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_OROCHI_COMP} ${KERNEL_OROCHI_H}
DEPENDS ${KERNEL_OROCHI_COMP} # Ensure compile.py has already run.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
COMMENT "Converting Orochi compiled kernel to header"
VERBATIM
)

# Create the 'bake_compiled_kernels' project
add_custom_target(bake_compiled_kernels ALL
DEPENDS ${KERNEL_HIPRT_H} ${KERNEL_OROCHI_H}
)

add_dependencies(${HIPRT_NAME} precompile_kernels bake_compiled_kernels)

endif()

Expand Down
116 changes: 70 additions & 46 deletions hiprt/impl/BvhBuilderKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,6 @@ SingletonConstruction( uint32_t index, PrimitiveContainer& primitives, BoxNode*
leafType = InstanceType;
}

primNodes[0].m_parentAddr = 0;

BoxNode root;
root.m_box0 = primitives.fetchAabb( 0 );
root.m_box1.reset();
Expand Down Expand Up @@ -415,58 +413,78 @@ extern "C" __global__ void ComputeMortonCodes_InstanceList_MatrixFrame(
ComputeMortonCodes<InstanceList<MatrixFrame>>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );
}

extern "C" __global__ void ResetCounters( uint32_t primCount, BoxNode* boxNodes )
template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
__device__ void ResetCountersAndUpdateLeaves(
const Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
{
const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;
if ( index < primCount ) boxNodes[index].m_updateCounter = 0;

if ( index < header->m_boxNodeCount ) boxNodes[index].m_updateCounter = 0;

if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )
{
if ( index < header->m_primNodeCount )
{
primNodes[index] = primitives.fetchTriangleNode( { primNodes[index].m_primIndex0, primNodes[index].m_primIndex1 } );
}
}
else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )
{
if ( index < primitives.getFrameCount() ) primitives.convertFrame( index );

if ( index < header->m_primNodeCount )
{
const uint32_t primIndex = primNodes[index].m_primIndex;
hiprtTransformHeader transform = primitives.fetchTransformHeader( primIndex );
primNodes[index].m_mask = primitives.fetchMask( primIndex );
if ( transform.frameCount == 1 )
primNodes[index].m_identity =
primitives.copyInvTransformMatrix( transform.frameIndex, primNodes[index].m_matrix ) ? 1 : 0;
}
}
}

template <typename InstanceList>
__device__ void ResetCountersAndUpdateFrames( InstanceList& instanceList, BoxNode* boxNodes )
extern "C" __global__ void ResetCountersAndUpdateLeaves_TriangleMesh_TriangleNode(
const GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
{
const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;
if ( index < instanceList.getCount() ) boxNodes[index].m_updateCounter = 0;
if ( index < instanceList.getFrameCount() ) instanceList.convertFrame( index );
ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
}

extern "C" __global__ void
ResetCountersAndUpdateFrames_InstanceList_SRTFrame( InstanceList<SRTFrame> instanceList, BoxNode* boxNodes )
extern "C" __global__ void ResetCountersAndUpdateLeaves_AabbList_CustomNode(
const GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
{
ResetCountersAndUpdateFrames<InstanceList<SRTFrame>>( instanceList, boxNodes );
ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
}

extern "C" __global__ void
ResetCountersAndUpdateFrames_InstanceList_MatrixFrame( InstanceList<MatrixFrame> instanceList, BoxNode* boxNodes )
extern "C" __global__ void ResetCountersAndUpdateLeaves_InstanceList_MatrixFrame_InstanceNode(
const SceneHeader* header, InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
{
ResetCountersAndUpdateFrames<InstanceList<MatrixFrame>>( instanceList, boxNodes );
ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
}

template <typename PrimitiveContainer, typename PrimitiveNode>
__device__ void FitBounds( PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
extern "C" __global__ void ResetCountersAndUpdateLeaves_InstanceList_SRTFrame_InstanceNode(
const SceneHeader* header, InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
{
ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
}

template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
__device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
{
uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;

if ( index >= primitives.getCount() ) return;
if ( index >= header->m_boxNodeCount ) return;

uint32_t parentAddr = primNodes[index].m_parentAddr;
if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )
{
primNodes[index] =
primitives.fetchTriangleNode( make_uint2( primNodes[index].m_primIndex0, primNodes[index].m_primIndex1 ) );
primNodes[index].m_parentAddr = parentAddr;
}
else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )
BoxNode node = boxNodes[index];
uint32_t internalCount = 0;
for ( uint32_t i = 0; i < node.m_childCount; ++i )
{
const uint32_t primIndex = primNodes[index].m_primIndex;
hiprtTransformHeader transform = primitives.fetchTransformHeader( primIndex );
primNodes[index].m_mask = primitives.fetchMask( primIndex );
if ( transform.frameCount == 1 )
primNodes[index].m_identity =
primitives.copyInvTransformMatrix( transform.frameIndex, primNodes[index].m_matrix ) ? 1 : 0;
if ( node.getChildType( i ) == BoxType ) internalCount++;
}

index = parentAddr;
while ( index != InvalidValue && atomicAdd( &boxNodes[index].m_updateCounter, 1 ) >= boxNodes[index].m_childCount - 1 )
if ( internalCount > 0 ) return;

while ( true )
{
__threadfence();

Expand All @@ -484,33 +502,40 @@ __device__ void FitBounds( PrimitiveContainer& primitives, BoxNode* boxNodes, Pr
if ( node.m_childIndex3 != InvalidValue )
node.m_box3 = getNodeBox( node.m_childIndex3, primitives, boxNodes, primNodes );

index = boxNodes[index].m_parentAddr;
internalCount = 0;
for ( uint32_t i = 0; i < node.m_childCount; ++i )
{
if ( node.getChildType( i ) == BoxType ) internalCount++;
}

__threadfence();

if ( atomicAdd( &node.m_updateCounter, 1 ) < internalCount - 1 ) break;
}
}

extern "C" __global__ void
FitBounds_TriangleMesh_TriangleNode( TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
FitBounds_TriangleMesh_TriangleNode( GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
{
FitBounds<TriangleMesh, TriangleNode>( primitives, boxNodes, primNodes );
FitBounds<TriangleMesh, TriangleNode>( header, primitives, boxNodes, primNodes );
}

extern "C" __global__ void FitBounds_AabbList_CustomNode( AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
extern "C" __global__ void
FitBounds_AabbList_CustomNode( GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
{
FitBounds<AabbList, CustomNode>( primitives, boxNodes, primNodes );
FitBounds<AabbList, CustomNode>( header, primitives, boxNodes, primNodes );
}

extern "C" __global__ void
FitBounds_InstanceList_SRTFrame_InstanceNode( InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
extern "C" __global__ void FitBounds_InstanceList_SRTFrame_InstanceNode(
SceneHeader* header, InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
{
FitBounds<InstanceList<SRTFrame>, InstanceNode>( primitives, boxNodes, primNodes );
FitBounds<InstanceList<SRTFrame>, InstanceNode>( header, primitives, boxNodes, primNodes );
}

extern "C" __global__ void FitBounds_InstanceList_MatrixFrame_InstanceNode(
InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
SceneHeader* header, InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
{
FitBounds<InstanceList<MatrixFrame>, InstanceNode>( primitives, boxNodes, primNodes );
FitBounds<InstanceList<MatrixFrame>, InstanceNode>( header, primitives, boxNodes, primNodes );
}

template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
Expand Down Expand Up @@ -635,8 +660,7 @@ __device__ void Collapse(
else
primNodes[nodeAddr].m_transform = transform;
}
primNodes[nodeAddr].m_parentAddr = parentAddr;
done = true;
done = true;
}
}

Expand Down
Loading

0 comments on commit 0d40036

Please sign in to comment.