Merge pull request #27 from GPUOpen-LibrariesAndSDKs/next-release-5

Next release 5
GPUOpen-LibrariesAndSDKs · Feb 11, 2025 · 0d40036 · 0d40036
2 parents b9e664d + 92e7d4c
commit 0d40036
Show file tree

Hide file tree

Showing 11 changed files with 279 additions and 212 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,6 +16,7 @@ option(NO_ENCRYPT "Don't encrypt kernel source and binaries" OFF)
 option(NO_UNITTEST "Don't build unit tests" OFF)
 option(HIPRT_PREFER_HIP_5 "Prefer HIP 5" OFF)
 
+option(FORCE_DISABLE_CUDA "By default Cuda support is automatically added if a Cuda install is detected. Turn this flag to ON to force Cuda to be disabled." OFF)
 
 
 find_program(PYTHON_EXECUTABLE
@@ -186,6 +187,18 @@ function(get_hip_sdk_version result result_path)
 
 
 		endif()
+
+	# build hip command for Linux
+	else()
+
+		# If not defined we try to take it from the PATH
+		if(NOT HIP_PATH)
+			set(hipCommand "hipcc")
+
+		# otherwise, build the hipcc command with full path.
+		else()
+			set(hipCommand "${HIP_PATH}/bin/${hipCommand}")
+		endif()
 	endif()
 
 
@@ -296,9 +309,12 @@ add_definitions(-D__USE_HIP__)
 add_definitions(-DHIPRT_PUBLIC_REPO)
 
 
-# Enable CUDA if possible
-include(${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/Orochi/enable_cuew.cmake)
-
+if (NOT FORCE_DISABLE_CUDA)
+	# Enable CUDA if possible
+	include(${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/Orochi/enable_cuew.cmake)
+else()
+	message(STATUS "CUDA support is forced to disabled.")
+endif()
 
 
 # Base output directory
@@ -361,20 +377,93 @@ if(HIPRT_PREFER_HIP_5)
 endif()
 
 
+
+# files generated by compile.py and precompile_bitcode.py
+if(WIN32)
+	set(KERNEL_OS_POSTFIX "win")
+else()
+	set(KERNEL_OS_POSTFIX "linux")
+endif()
+set(KERNEL_HIPRT_COMP    "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_amd.hipfb")                                        # example:  hiprt02005_6.2_amd.hipfb
+set(KERNEL_UNITTEST_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_precompiled_bitcode_${KERNEL_OS_POSTFIX}.hipfb")   # example:  hiprt02005_6.2_precompiled_bitcode_win.hipfb
+set(KERNEL_OROCHI_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/oro_compiled_kernels.hipfb")
+
+
 # precompile kernels:
 if(PRECOMPILE)
-	message(">> Execute: ${PYTHON_EXECUTABLE} compile.py --nvidia --hipSdkPath \"${HIP_FINAL_PATH}\"")
-	execute_process(
-		COMMAND ${PYTHON_EXECUTABLE} compile.py --nvidia --hipSdkPath ${HIP_FINAL_PATH}
-		WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes
+
+	if(FORCE_DISABLE_CUDA)
+		set(CUDA_OPTION "")
+	else()
+		set(CUDA_OPTION "--nvidia")
+	endif()
+
+
+	set(bvh_source
+	${CMAKE_SOURCE_DIR}/hiprt/hiprt_vec.h
+	${CMAKE_SOURCE_DIR}/hiprt/hiprt_math.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Aabb.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/AabbList.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/BvhCommon.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/BvhNode.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Geometry.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/QrDecomposition.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Quaternion.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Transform.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Instance.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/InstanceList.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/MortonCode.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Scene.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/TriangleMesh.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/Triangle.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/BvhBuilderUtil.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/SbvhCommon.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/ApiNodeList.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/BvhConfig.h
+	${CMAKE_SOURCE_DIR}/hiprt/impl/MemoryArena.h
+	${CMAKE_SOURCE_DIR}/hiprt/hiprt_types.h
+	${CMAKE_SOURCE_DIR}/hiprt/hiprt_common.h
+	)
+
+	message(">> add_custom_command: ${PYTHON_EXECUTABLE} compile.py ${CUDA_OPTION} --hipSdkPath \"${HIP_FINAL_PATH}\"")
+	add_custom_command(
+			OUTPUT ${KERNEL_HIPRT_COMP} ${KERNEL_OROCHI_COMP}
+			COMMAND ${PYTHON_EXECUTABLE} compile.py ${CUDA_OPTION} --hipSdkPath ${HIP_FINAL_PATH}
+			WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes
+			COMMENT "Precompiling kernels via compile.py"
+			VERBATIM
+			DEPENDS ${bvh_source}
+		)
+
+	# create the 'precompile_kernels' project
+	add_custom_target(precompile_kernels ALL
+		DEPENDS ${KERNEL_HIPRT_COMP} ${KERNEL_OROCHI_COMP}
 		)
 
 	if(NOT NO_UNITTEST)
-		message(">> Execute: ${PYTHON_EXECUTABLE} precompile_bitcode.py --nvidia --hipSdkPath \"${HIP_FINAL_PATH}\"")
-		execute_process(
-			COMMAND ${PYTHON_EXECUTABLE} precompile_bitcode.py --nvidia --hipSdkPath ${HIP_FINAL_PATH}
+
+		set(unittest_kernel_source
+		${CMAKE_SOURCE_DIR}/test/bitcodes/custom_func_table.cpp
+		${CMAKE_SOURCE_DIR}/test/bitcodes/unit_test.cpp
+		)
+
+		message(">> add_custom_command: ${PYTHON_EXECUTABLE} precompile_bitcode.py ${CUDA_OPTION} --hipSdkPath \"${HIP_FINAL_PATH}\"")
+		add_custom_command(
+			OUTPUT ${KERNEL_UNITTEST_COMP}
+			COMMAND ${PYTHON_EXECUTABLE} precompile_bitcode.py ${CUDA_OPTION} --hipSdkPath ${HIP_FINAL_PATH}
+			DEPENDS ${KERNEL_HIPRT_COMP}  # Ensure compile.py has already run.
 			WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/bitcodes
+			COMMENT "Precompiling unit tests kernels via precompile_bitcode.py"
+			VERBATIM
+			DEPENDS ${unittest_kernel_source}
 			)
+
+		# create the 'precompile_unittest_kernels' project
+		add_custom_target(precompile_unittest_kernels ALL
+			DEPENDS ${KERNEL_UNITTEST_COMP}
+			)
+
+		add_dependencies(${HIPRT_NAME} precompile_unittest_kernels)
 	endif()
 
 
@@ -386,23 +475,38 @@ endif()
 # it's expected the step 'PRECOMPILE' has been executed.
 if ( BAKE_COMPILED_KERNEL )
 
-	message(">> Generate embedded precompiled")
+	message(">> precompiled will be embedded.")
 
 	set(PYTHON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/scripts/convert_binary_to_array.py")
 
-	set(KERNEL_HIPRT_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/hiprt${version_str_}_${HIP_VERSION_STR}_amd.hipfb")
+	# HIPRT binary
 	set(KERNEL_HIPRT_H "${CMAKE_CURRENT_SOURCE_DIR}/hiprt/impl/bvh_build_array.h")
-	execute_process(
+	add_custom_command(
+		OUTPUT ${KERNEL_HIPRT_H}
 		COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_HIPRT_COMP} ${KERNEL_HIPRT_H}
+		DEPENDS ${KERNEL_HIPRT_COMP}  # Ensure compile.py has already run.
 		WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-		)
+		COMMENT "Converting HIPRT compiled kernel to header"
+		VERBATIM
+	)
 
-	set(KERNEL_OROCHI_COMP "${BASE_OUTPUT_DIR}/${CMAKE_BUILD_TYPE}/oro_compiled_kernels.hipfb")
+	# Orochi binary
 	set(KERNEL_OROCHI_H "${CMAKE_CURRENT_SOURCE_DIR}/contrib/Orochi/ParallelPrimitives/cache/oro_compiled_kernels.h")
-	execute_process(
+	add_custom_command(
+		OUTPUT ${KERNEL_OROCHI_H}
 		COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_FILE} ${KERNEL_OROCHI_COMP} ${KERNEL_OROCHI_H}
+		DEPENDS ${KERNEL_OROCHI_COMP}  # Ensure compile.py has already run.
 		WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-		)
+		COMMENT "Converting Orochi compiled kernel to header"
+		VERBATIM
+	)
+
+	# Create the 'bake_compiled_kernels' project
+	add_custom_target(bake_compiled_kernels ALL
+		DEPENDS ${KERNEL_HIPRT_H} ${KERNEL_OROCHI_H}
+	)
+
+	add_dependencies(${HIPRT_NAME} precompile_kernels bake_compiled_kernels)
 
 endif()
 

diff --git a/hiprt/impl/BvhBuilderKernels.h b/hiprt/impl/BvhBuilderKernels.h
@@ -202,8 +202,6 @@ SingletonConstruction( uint32_t index, PrimitiveContainer& primitives, BoxNode*
 		leafType = InstanceType;
 	}
 
-	primNodes[0].m_parentAddr = 0;
-
 	BoxNode root;
 	root.m_box0 = primitives.fetchAabb( 0 );
 	root.m_box1.reset();
@@ -415,58 +413,78 @@ extern "C" __global__ void ComputeMortonCodes_InstanceList_MatrixFrame(
 	ComputeMortonCodes<InstanceList<MatrixFrame>>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );
 }
 
-extern "C" __global__ void ResetCounters( uint32_t primCount, BoxNode* boxNodes )
+template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
+__device__ void ResetCountersAndUpdateLeaves(
+	const Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
 {
 	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;
-	if ( index < primCount ) boxNodes[index].m_updateCounter = 0;
+
+	if ( index < header->m_boxNodeCount ) boxNodes[index].m_updateCounter = 0;
+
+	if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )
+	{
+		if ( index < header->m_primNodeCount )
+		{
+			primNodes[index] = primitives.fetchTriangleNode( { primNodes[index].m_primIndex0, primNodes[index].m_primIndex1 } );
+		}
+	}
+	else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )
+	{
+		if ( index < primitives.getFrameCount() ) primitives.convertFrame( index );
+
+		if ( index < header->m_primNodeCount )
+		{
+			const uint32_t		 primIndex = primNodes[index].m_primIndex;
+			hiprtTransformHeader transform = primitives.fetchTransformHeader( primIndex );
+			primNodes[index].m_mask		   = primitives.fetchMask( primIndex );
+			if ( transform.frameCount == 1 )
+				primNodes[index].m_identity =
+					primitives.copyInvTransformMatrix( transform.frameIndex, primNodes[index].m_matrix ) ? 1 : 0;
+		}
+	}
 }
 
-template <typename InstanceList>
-__device__ void ResetCountersAndUpdateFrames( InstanceList& instanceList, BoxNode* boxNodes )
+extern "C" __global__ void ResetCountersAndUpdateLeaves_TriangleMesh_TriangleNode(
+	const GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
 {
-	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;
-	if ( index < instanceList.getCount() ) boxNodes[index].m_updateCounter = 0;
-	if ( index < instanceList.getFrameCount() ) instanceList.convertFrame( index );
+	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
 }
 
-extern "C" __global__ void
-ResetCountersAndUpdateFrames_InstanceList_SRTFrame( InstanceList<SRTFrame> instanceList, BoxNode* boxNodes )
+extern "C" __global__ void ResetCountersAndUpdateLeaves_AabbList_CustomNode(
+	const GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
 {
-	ResetCountersAndUpdateFrames<InstanceList<SRTFrame>>( instanceList, boxNodes );
+	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
 }
 
-extern "C" __global__ void
-ResetCountersAndUpdateFrames_InstanceList_MatrixFrame( InstanceList<MatrixFrame> instanceList, BoxNode* boxNodes )
+extern "C" __global__ void ResetCountersAndUpdateLeaves_InstanceList_MatrixFrame_InstanceNode(
+	const SceneHeader* header, InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
 {
-	ResetCountersAndUpdateFrames<InstanceList<MatrixFrame>>( instanceList, boxNodes );
+	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
 }
 
-template <typename PrimitiveContainer, typename PrimitiveNode>
-__device__ void FitBounds( PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
+extern "C" __global__ void ResetCountersAndUpdateLeaves_InstanceList_SRTFrame_InstanceNode(
+	const SceneHeader* header, InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
+{
+	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );
+}
+
+template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
+__device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )
 {
 	uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;
 
-	if ( index >= primitives.getCount() ) return;
+	if ( index >= header->m_boxNodeCount ) return;
 
-	uint32_t parentAddr = primNodes[index].m_parentAddr;
-	if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )
-	{
-		primNodes[index] =
-			primitives.fetchTriangleNode( make_uint2( primNodes[index].m_primIndex0, primNodes[index].m_primIndex1 ) );
-		primNodes[index].m_parentAddr = parentAddr;
-	}
-	else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )
+	BoxNode	 node		   = boxNodes[index];
+	uint32_t internalCount = 0;
+	for ( uint32_t i = 0; i < node.m_childCount; ++i )
 	{
-		const uint32_t		 primIndex = primNodes[index].m_primIndex;
-		hiprtTransformHeader transform = primitives.fetchTransformHeader( primIndex );
-		primNodes[index].m_mask		   = primitives.fetchMask( primIndex );
-		if ( transform.frameCount == 1 )
-			primNodes[index].m_identity =
-				primitives.copyInvTransformMatrix( transform.frameIndex, primNodes[index].m_matrix ) ? 1 : 0;
+		if ( node.getChildType( i ) == BoxType ) internalCount++;
 	}
 
-	index = parentAddr;
-	while ( index != InvalidValue && atomicAdd( &boxNodes[index].m_updateCounter, 1 ) >= boxNodes[index].m_childCount - 1 )
+	if ( internalCount > 0 ) return;
+
+	while ( true )
 	{
 		__threadfence();
 
@@ -484,33 +502,40 @@ __device__ void FitBounds( PrimitiveContainer& primitives, BoxNode* boxNodes, Pr
 		if ( node.m_childIndex3 != InvalidValue )
 			node.m_box3 = getNodeBox( node.m_childIndex3, primitives, boxNodes, primNodes );
 
-		index = boxNodes[index].m_parentAddr;
+		internalCount = 0;
+		for ( uint32_t i = 0; i < node.m_childCount; ++i )
+		{
+			if ( node.getChildType( i ) == BoxType ) internalCount++;
+		}
 
 		__threadfence();
+
+		if ( atomicAdd( &node.m_updateCounter, 1 ) < internalCount - 1 ) break;
 	}
 }
 
 extern "C" __global__ void
-FitBounds_TriangleMesh_TriangleNode( TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
+FitBounds_TriangleMesh_TriangleNode( GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )
 {
-	FitBounds<TriangleMesh, TriangleNode>( primitives, boxNodes, primNodes );
+	FitBounds<TriangleMesh, TriangleNode>( header, primitives, boxNodes, primNodes );
 }
 
-extern "C" __global__ void FitBounds_AabbList_CustomNode( AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
+extern "C" __global__ void
+FitBounds_AabbList_CustomNode( GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )
 {
-	FitBounds<AabbList, CustomNode>( primitives, boxNodes, primNodes );
+	FitBounds<AabbList, CustomNode>( header, primitives, boxNodes, primNodes );
 }
 
-extern "C" __global__ void
-FitBounds_InstanceList_SRTFrame_InstanceNode( InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
+extern "C" __global__ void FitBounds_InstanceList_SRTFrame_InstanceNode(
+	SceneHeader* header, InstanceList<SRTFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
 {
-	FitBounds<InstanceList<SRTFrame>, InstanceNode>( primitives, boxNodes, primNodes );
+	FitBounds<InstanceList<SRTFrame>, InstanceNode>( header, primitives, boxNodes, primNodes );
 }
 
 extern "C" __global__ void FitBounds_InstanceList_MatrixFrame_InstanceNode(
-	InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
+	SceneHeader* header, InstanceList<MatrixFrame> primitives, BoxNode* boxNodes, InstanceNode* primNodes )
 {
-	FitBounds<InstanceList<MatrixFrame>, InstanceNode>( primitives, boxNodes, primNodes );
+	FitBounds<InstanceList<MatrixFrame>, InstanceNode>( header, primitives, boxNodes, primNodes );
 }
 
 template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>
@@ -635,8 +660,7 @@ __device__ void Collapse(
 					else
 						primNodes[nodeAddr].m_transform = transform;
 				}
-				primNodes[nodeAddr].m_parentAddr = parentAddr;
-				done							 = true;
+				done = true;
 			}
 		}